Merge pull request #5638 from neondatabase/releases/2023-10-24

Release 2023-10-24
Cleanup in azure_upload_download_works test (#5636 )
2026-03-21 09:10:37 +00:00 · 2023-10-24 12:10:52 +03:00 · 2023-10-23 19:08:56 +01:00 · 2023-10-23 17:30:25 +01:00 · 2023-10-23 17:51:38 +02:00 · 2023-10-23 15:32:34 +01:00
201 changed files with 14932 additions and 6536 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -76,8 +76,8 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.23.1
-        ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86
+        ALLURE_VERSION: 2.24.0
+        ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90

    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
    - name: Acquire lock
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -320,6 +320,9 @@ jobs:
      - name: Build neon extensions
        run: mold -run make neon-pg-ext -j$(nproc)

+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
      - name: Run cargo build
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
@@ -834,7 +837,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.17.12
+      VM_BUILDER_VERSION: v0.18.1

    steps:
      - name: Checkout
@@ -1092,8 +1095,10 @@ jobs:
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+
+            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -32,7 +32,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 1
@@ -90,18 +90,21 @@ jobs:

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: make postgres-v14 -j$(nproc)
+        run: make postgres-v14 -j$(sysctl -n hw.ncpu)

      - name: Build postgres v15
        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: make postgres-v15 -j$(nproc)
+        run: make postgres-v15 -j$(sysctl -n hw.ncpu)

      - name: Build postgres v16
        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: make postgres-v16 -j$(nproc)
+        run: make postgres-v16 -j$(sysctl -n hw.ncpu)

      - name: Build neon extensions
-        run: make neon-pg-ext -j$(nproc)
+        run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
+
+      - name: Build walproposer-lib
+        run: make walproposer-lib -j$(sysctl -n hw.ncpu)

      - name: Run cargo build
        run: cargo build --all --release
@@ -126,7 +129,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 1
@@ -135,6 +138,9 @@ jobs:
      - name: Get postgres headers
        run: make postgres-headers -j$(nproc)

+      - name: Build walproposer-lib
+        run: make walproposer-lib -j$(nproc)
+
      - name: Produce the build stats
        run: cargo build --all --release --timings

--- a/2
+++ b/2
@@ -5,7 +5,7 @@
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
-/pageserver/ @neondatabase/compute @neondatabase/storage
+/pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,6 +26,7 @@ members = [
    "libs/tracing-utils",
    "libs/postgres_ffi/wal_craft",
    "libs/vm_monitor",
+    "libs/walproposer",
 ]

 [workspace.package]
@@ -36,6 +37,10 @@ license = "Apache-2.0"
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
 async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
+azure_core = "0.16"
+azure_identity = "0.16"
+azure_storage = "0.16"
+azure_storage_blobs = "0.16"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
@@ -51,6 +56,7 @@ bindgen = "0.65"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
+camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
@@ -75,6 +81,7 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
+http-types = "2"
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
@@ -154,11 +161,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -179,6 +186,7 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
+walproposer = { version = "0.1", path = "./libs/walproposer/" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
@@ -187,14 +195,14 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
 criterion = "0.5.1"
 rcgen = "0.11"
 rstest = "0.18"
-tempfile = "3.4"
+camino-tempfile = "1.0.2"
 tonic-build = "0.9"

 [patch.crates-io]

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

 ################# Binary contents sections

--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -224,8 +224,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
-    echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
+    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -368,8 +368,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
 FROM build-deps AS plpgsql-check-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
-    echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
+    echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -615,11 +615,7 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
 #########################################################################################
 #
 # Layer "rust extensions"
-# This layer is used to build `pgx` deps
-#
-# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
-# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
-# dependency on all the rust extension that depend on it, too.
+# This layer is used to build `pgrx` deps
 #
 #########################################################################################
 FROM build-deps AS rust-extensions-build
@@ -635,22 +631,12 @@ USER nonroot
 WORKDIR /home/nonroot
 ARG PG_VERSION

-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
-        ;; \
-    esac && \
-    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init && \
-    cargo install --locked --version 0.7.3 cargo-pgx && \
-    /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
+    cargo install --locked --version 0.10.2 cargo-pgrx && \
+    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'

 USER root

@@ -664,23 +650,11 @@ USER root
 FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION

-# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
-# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
-    echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
+    echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgx install --release && \
+    sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control

 #########################################################################################
@@ -693,26 +667,11 @@ RUN case "${PG_VERSION}" in \
 FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION

-# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
-# Currently pgx version bump to >= 0.7.2  causes "call to unsafe function" compliation errors in
-# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
-# same 1.1 version we've used before.
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
-    echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
+    echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
-    cargo pgx install --release && \
+    sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
    # it's needed to enable extension because it uses untrusted C language
    sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
@@ -727,21 +686,11 @@ RUN case "${PG_VERSION}" in \
 FROM rust-extensions-build AS pg-tiktoken-pg-build
 ARG PG_VERSION

-# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
-    echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
+# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
+RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
+    echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
-    cargo pgx install --release && \
+    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

 #########################################################################################
@@ -754,21 +703,15 @@ RUN case "${PG_VERSION}" in \
 FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION

-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
-    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
+    echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgx install --release && \
+    echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
+    wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+    patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+    echo "********************************************************************************************************" && \
+    sed -i 's/pgrx       = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control

 #########################################################################################
--- a/38
+++ b/38
@@ -62,7 +62,7 @@ all: neon postgres neon-pg-ext
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-headers
+neon: postgres-headers walproposer-lib
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

@@ -168,6 +168,42 @@ neon-pg-ext-clean-%:
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean

+# Build walproposer as a static library. walproposer source code is located
+# in the pgxn/neon directory.
+# 
+# We also need to include libpgport.a and libpgcommon.a, because walproposer
+# uses some functions from those libraries.
+# 
+# Some object files are removed from libpgport.a and libpgcommon.a because
+# they depend on openssl and other libraries that are not included in our
+# Rust build.
+.PHONY: walproposer-lib
+walproposer-lib: neon-pg-ext-v16
+	+@echo "Compiling walproposer-lib"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+ifeq ($(UNAME_S),Linux)
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
+		pg_strong_random.o
+	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
+		pg_crc32c.o \
+		hmac_openssl.o \
+		cryptohash_openssl.o \
+		scram-common.o \
+		md5_common.o \
+		checksum_helper.o
+endif
+
+.PHONY: walproposer-lib-clean
+walproposer-lib-clean:
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
+		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
+
 .PHONY: neon-pg-ext
 neon-pg-ext: \
 	neon-pg-ext-v14 \
--- a/4
+++ b/4
@@ -1,5 +1,5 @@
 Neon
 Copyright 2022 Neon Inc.

-The PostgreSQL submodules in vendor/postgres-v14 and vendor/postgres-v15 are licensed under the
-PostgreSQL license. See vendor/postgres-v14/COPYRIGHT and vendor/postgres-v15/COPYRIGHT.
+The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
+See vendor/postgres-vX/COPYRIGHT for details.
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
                    IF NOT EXISTS (
                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
                    THEN
-                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
+                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
                        IF array_length(roles, 1) IS NOT NULL THEN
                            EXECUTE format('GRANT neon_superuser TO %s',
                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
@@ -692,10 +692,11 @@ impl ComputeNode {
        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
        create_neon_superuser(spec, &mut client)?;
+        cleanup_instance(&mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, self.connstr.as_str())?;
+        handle_grants(spec, &mut client, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
        create_availability_check_data(&mut client)?;

@@ -731,10 +732,11 @@ impl ComputeNode {
        // Disable DDL forwarding because control plane already knows about these roles/databases.
        if spec.mode == ComputeMode::Primary {
            client.simple_query("SET neon.forward_ddl = false")?;
+            cleanup_instance(&mut client)?;
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(&spec, self.connstr.as_str())?;
+            handle_grants(&spec, &mut client, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
        }

@@ -1039,7 +1041,7 @@ LIMIT 100",
        let remote_extensions = spec
            .remote_extensions
            .as_ref()
-            .ok_or(anyhow::anyhow!("Remote extensions are not configured",))?;
+            .ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -1,5 +1,5 @@
 use std::sync::Arc;
-use std::{thread, time};
+use std::{thread, time::Duration};

 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
@@ -7,7 +7,7 @@ use tracing::{debug, info};

 use crate::compute::ComputeNode;

-const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
+const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);

 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
@@ -17,13 +17,12 @@ fn watch_compute_activity(compute: &ComputeNode) {
    let connstr = compute.connstr.as_str();
    // Define `client` outside of the loop to reuse existing connection if it's active.
    let mut client = Client::connect(connstr, NoTls);
-    let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);

    info!("watching Postgres activity at {}", connstr);

    loop {
        // Should be outside of the write lock to allow others to read while we sleep.
-        thread::sleep(timeout);
+        thread::sleep(MONITOR_CHECK_INTERVAL);

        match &mut client {
            Ok(cli) => {
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -1,3 +1,4 @@
+use std::collections::HashMap;
 use std::fmt::Write;
 use std::fs;
 use std::fs::File;
@@ -205,22 +206,37 @@ pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
 }

 /// Build a list of existing Postgres databases
-pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
-    let postgres_dbs = client
+pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>> {
+    // `pg_database.datconnlimit = -2` means that the database is in the
+    // invalid state. See:
+    //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
+    let postgres_dbs: Vec<Database> = client
        .query(
-            "SELECT datname, datdba::regrole::text as owner
-               FROM pg_catalog.pg_database;",
+            "SELECT
+                datname AS name,
+                datdba::regrole::text AS owner,
+                NOT datallowconn AS restrict_conn,
+                datconnlimit = - 2 AS invalid
+            FROM
+                pg_catalog.pg_database;",
            &[],
        )?
        .iter()
        .map(|row| Database {
-            name: row.get("datname"),
+            name: row.get("name"),
            owner: row.get("owner"),
+            restrict_conn: row.get("restrict_conn"),
+            invalid: row.get("invalid"),
            options: None,
        })
        .collect();

-    Ok(postgres_dbs)
+    let dbs_map = postgres_dbs
+        .iter()
+        .map(|db| (db.name.clone(), db.clone()))
+        .collect::<HashMap<_, _>>();
+
+    Ok(dbs_map)
 }

 /// Wait for Postgres to become ready to accept connections. It's ready to
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -13,7 +13,7 @@ use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

 use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
-use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};
+use compute_api::spec::{ComputeSpec, PgIdent, Role};

 // Do control plane request and return response if any. In case of error it
 // returns a bool flag indicating whether it makes sense to retry the request
@@ -161,6 +161,38 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
    Ok(())
 }

+/// Compute could be unexpectedly shut down, for example, during the
+/// database dropping. This leaves the database in the invalid state,
+/// which prevents new db creation with the same name. This function
+/// will clean it up before proceeding with catalog updates. All
+/// possible future cleanup operations may go here too.
+#[instrument(skip_all)]
+pub fn cleanup_instance(client: &mut Client) -> Result<()> {
+    let existing_dbs = get_existing_dbs(client)?;
+
+    for (_, db) in existing_dbs {
+        if db.invalid {
+            // After recent commit in Postgres, interrupted DROP DATABASE
+            // leaves the database in the invalid state. According to the
+            // commit message, the only option for user is to drop it again.
+            // See:
+            //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
+            //
+            // Postgres Neon extension is done the way, that db is de-registered
+            // in the control plane metadata only after it is dropped. So there is
+            // a chance that it still thinks that db should exist. This means
+            // that it will be re-created by `handle_databases()`. Yet, it's fine
+            // as user can just repeat drop (in vanilla Postgres they would need
+            // to do the same, btw).
+            let query = format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote());
+            info!("dropping invalid database {}", db.name);
+            client.execute(query.as_str(), &[])?;
+        }
+    }
+
+    Ok(())
+}
+
 /// Given a cluster spec json and open transaction it handles roles creation,
 /// deletion and update.
 #[instrument(skip_all)]
@@ -270,7 +302,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
@@ -379,13 +411,13 @@ fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent
 /// which together provide us idempotency.
 #[instrument(skip_all)]
 pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
-    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
+    let existing_dbs = get_existing_dbs(client)?;

    // Print a list of existing Postgres databases (only in debug mode)
    if span_enabled!(Level::INFO) {
        info!("postgres databases:");
-        for r in &existing_dbs {
-            info!("    {}:{}", r.name, r.owner);
+        for (dbname, db) in &existing_dbs {
+            info!("    {}:{}", dbname, db.owner);
        }
    }

@@ -439,8 +471,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                "rename_db" => {
                    let new_name = op.new_name.as_ref().unwrap();

-                    // XXX: with a limited number of roles it is fine, but consider making it a HashMap
-                    if existing_dbs.iter().any(|r| r.name == op.name) {
+                    if existing_dbs.get(&op.name).is_some() {
                        let query: String = format!(
                            "ALTER DATABASE {} RENAME TO {}",
                            op.name.pg_quote(),
@@ -457,14 +488,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    }

    // Refresh Postgres databases info to handle possible renames
-    let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
+    let existing_dbs = get_existing_dbs(client)?;

    info!("cluster spec databases:");
    for db in &spec.cluster.databases {
        let name = &db.name;
-
-        // XXX: with a limited number of databases it is fine, but consider making it a HashMap
-        let pg_db = existing_dbs.iter().find(|r| r.name == *name);
+        let pg_db = existing_dbs.get(name);

        enum DatabaseAction {
            None,
@@ -530,13 +559,32 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
-    info!("cluster spec grants:");
+pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> {
+    info!("modifying database permissions");
+    let existing_dbs = get_existing_dbs(client)?;

    // Do some per-database access adjustments. We'd better do this at db creation time,
    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
    // atomically.
    for db in &spec.cluster.databases {
+        match existing_dbs.get(&db.name) {
+            Some(pg_db) => {
+                if pg_db.restrict_conn || pg_db.invalid {
+                    info!(
+                        "skipping grants for db {} (invalid: {}, connections not allowed: {})",
+                        db.name, pg_db.invalid, pg_db.restrict_conn
+                    );
+                    continue;
+                }
+            }
+            None => {
+                bail!(
+                    "database {} doesn't exist in Postgres after handle_databases()",
+                    db.name
+                );
+            }
+        }
+
        let mut conf = Config::from_str(connstr)?;
        conf.dbname(&db.name);

@@ -575,6 +623,11 @@ pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {

        // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
        // This is needed because since postgres 15 this privilege is removed by default.
+        // TODO: web_access isn't created for almost 1 year. It could be that we have
+        // active users of 1 year old projects, but hopefully not, so check it and
+        // remove this code if possible. The worst thing that could happen is that
+        // user won't be able to use public schema in NEW databases created in the
+        // very OLD project.
        let grant_query = "DO $$\n\
                BEGIN\n\
                    IF EXISTS(\n\
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -28,7 +28,7 @@ mod pg_helpers_tests {
        assert_eq!(
            spec.cluster.settings.as_pg_settings(),
            r#"fsync = off
-wal_level = replica
+wal_level = logical
 hot_standby = on
 neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
 wal_log_hints = on
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -6,6 +6,7 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
 git-version.workspace = true
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,5 +1,6 @@
 use crate::{background_process, local_env::LocalEnv};
 use anyhow::anyhow;
+use camino::Utf8PathBuf;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use std::{path::PathBuf, process::Child};
@@ -47,8 +48,9 @@ impl AttachmentService {
        }
    }

-    fn pid_file(&self) -> PathBuf {
-        self.env.base_data_dir.join("attachment_service.pid")
+    fn pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid"))
+            .expect("non-Unicode path")
    }

    pub fn start(&self) -> anyhow::Result<Child> {
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -16,12 +16,13 @@ use std::ffi::OsStr;
 use std::io::Write;
 use std::os::unix::prelude::AsRawFd;
 use std::os::unix::process::CommandExt;
-use std::path::{Path, PathBuf};
+use std::path::Path;
 use std::process::{Child, Command};
 use std::time::Duration;
 use std::{fs, io, thread};

 use anyhow::Context;
+use camino::{Utf8Path, Utf8PathBuf};
 use nix::errno::Errno;
 use nix::fcntl::{FcntlArg, FdFlag};
 use nix::sys::signal::{kill, Signal};
@@ -45,9 +46,9 @@ const NOTICE_AFTER_RETRIES: u64 = 50;
 /// it itself.
 pub enum InitialPidFile<'t> {
    /// Create a pidfile, to allow future CLI invocations to manipulate the process.
-    Create(&'t Path),
+    Create(&'t Utf8Path),
    /// The process will create the pidfile itself, need to wait for that event.
-    Expect(&'t Path),
+    Expect(&'t Utf8Path),
 }

 /// Start a background child process using the parameters given.
@@ -85,7 +86,7 @@ where
        .stdout(process_log_file)
        .stderr(same_file_for_stderr)
        .args(args);
-    let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
+    let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
    filled_cmd.envs(envs);

    let pid_file_to_check = match initial_pid_file {
@@ -137,7 +138,11 @@ where
 }

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
-pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
+pub fn stop_process(
+    immediate: bool,
+    process_name: &str,
+    pid_file: &Utf8Path,
+) -> anyhow::Result<()> {
    let pid = match pid_file::read(pid_file)
        .with_context(|| format!("read pid_file {pid_file:?}"))?
    {
@@ -233,11 +238,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    filled_cmd
 }

-fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
+fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
    for env_key in [
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SESSION_TOKEN",
+        "AZURE_STORAGE_ACCOUNT",
+        "AZURE_STORAGE_ACCESS_KEY",
    ] {
        if let Ok(value) = std::env::var(env_key) {
            cmd = cmd.env(env_key, value);
@@ -252,9 +259,9 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
 ///    will remain held until the cmd exits.
 fn pre_exec_create_pidfile<P>(cmd: &mut Command, path: P) -> &mut Command
 where
-    P: Into<PathBuf>,
+    P: Into<Utf8PathBuf>,
 {
-    let path: PathBuf = path.into();
+    let path: Utf8PathBuf = path.into();
    // SAFETY
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
@@ -311,7 +318,7 @@ where

 fn process_started<F>(
    pid: Pid,
-    pid_file_to_check: Option<&Path>,
+    pid_file_to_check: Option<&Utf8Path>,
    status_check: &F,
 ) -> anyhow::Result<bool>
 where
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -13,6 +13,7 @@ use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
 use utils::logging::{self, LogFormat};
+use utils::signals::{ShutdownSignals, Signal};

 use utils::{
    http::{
@@ -268,7 +269,16 @@ async fn main() -> anyhow::Result<()> {
    let server = hyper::Server::from_tcp(http_listener)?.serve(service);

    tracing::info!("Serving on {0}", args.listen);
-    server.await?;
+
+    tokio::task::spawn(server);
+
+    ShutdownSignals::handle(|signal| match signal {
+        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
+            tracing::info!("Got {}. Terminating", signal.name());
+            // We're just a test helper: no graceful shutdown.
+            std::process::exit(0);
+        }
+    })?;

    Ok(())
 }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -116,6 +116,7 @@ fn main() -> Result<()> {
            "attachment_service" => handle_attachment_service(sub_args, &env),
            "safekeeper" => handle_safekeeper(sub_args, &env),
            "endpoint" => handle_endpoint(sub_args, &env),
+            "mappings" => handle_mappings(sub_args, &mut env),
            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
            _ => bail!("unexpected subcommand {sub_name}"),
        };
@@ -816,6 +817,38 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
    Ok(())
 }

+fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
+    let (sub_name, sub_args) = match sub_match.subcommand() {
+        Some(ep_subcommand_data) => ep_subcommand_data,
+        None => bail!("no mappings subcommand provided"),
+    };
+
+    match sub_name {
+        "map" => {
+            let branch_name = sub_args
+                .get_one::<String>("branch-name")
+                .expect("branch-name argument missing");
+
+            let tenant_id = sub_args
+                .get_one::<String>("tenant-id")
+                .map(|x| TenantId::from_str(x))
+                .expect("tenant-id argument missing")
+                .expect("malformed tenant-id arg");
+
+            let timeline_id = sub_args
+                .get_one::<String>("timeline-id")
+                .map(|x| TimelineId::from_str(x))
+                .expect("timeline-id argument missing")
+                .expect("malformed timeline-id arg");
+
+            env.register_branch_mapping(branch_name.to_owned(), tenant_id, timeline_id)?;
+
+            Ok(())
+        }
+        other => unimplemented!("mappings subcommand {other}"),
+    }
+}
+
 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
@@ -1084,6 +1117,7 @@ fn cli() -> Command {
    // --id, when using a pageserver command
    let pageserver_id_arg = Arg::new("pageserver-id")
        .long("id")
+        .global(true)
        .help("pageserver id")
        .required(false);
    // --pageserver-id when using a non-pageserver command
@@ -1254,17 +1288,20 @@ fn cli() -> Command {
            Command::new("pageserver")
                .arg_required_else_help(true)
                .about("Manage pageserver")
+                .arg(pageserver_id_arg)
                .subcommand(Command::new("status"))
-                .arg(pageserver_id_arg.clone())
-                .subcommand(Command::new("start").about("Start local pageserver")
-                .arg(pageserver_id_arg.clone())
-                .arg(pageserver_config_args.clone()))
-                .subcommand(Command::new("stop").about("Stop local pageserver")
-                .arg(pageserver_id_arg.clone())
-                            .arg(stop_mode_arg.clone()))
-                .subcommand(Command::new("restart").about("Restart local pageserver")
-                .arg(pageserver_id_arg.clone())
-                .arg(pageserver_config_args.clone()))
+                .subcommand(Command::new("start")
+                    .about("Start local pageserver")
+                    .arg(pageserver_config_args.clone())
+                )
+                .subcommand(Command::new("stop")
+                    .about("Stop local pageserver")
+                    .arg(stop_mode_arg.clone())
+                )
+                .subcommand(Command::new("restart")
+                    .about("Restart local pageserver")
+                    .arg(pageserver_config_args.clone())
+                )
        )
        .subcommand(
            Command::new("attachment_service")
@@ -1321,8 +1358,8 @@ fn cli() -> Command {
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
                    .arg(endpoint_id_arg.clone())
                    .arg(tenant_id_arg.clone())
-                    .arg(branch_name_arg)
-                    .arg(timeline_id_arg)
+                    .arg(branch_name_arg.clone())
+                    .arg(timeline_id_arg.clone())
                    .arg(lsn_arg)
                    .arg(pg_port_arg)
                    .arg(http_port_arg)
@@ -1335,7 +1372,7 @@ fn cli() -> Command {
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
-                    .arg(tenant_id_arg)
+                    .arg(tenant_id_arg.clone())
                    .arg(
                        Arg::new("destroy")
                            .help("Also delete data directory (now optional, should be default in future)")
@@ -1346,6 +1383,18 @@ fn cli() -> Command {
                )

        )
+        .subcommand(
+            Command::new("mappings")
+                .arg_required_else_help(true)
+                .about("Manage neon_local branch name mappings")
+                .subcommand(
+                    Command::new("map")
+                        .about("Create new mapping which cannot exist already")
+                        .arg(branch_name_arg.clone())
+                        .arg(tenant_id_arg.clone())
+                        .arg(timeline_id_arg.clone())
+                )
+        )
        // Obsolete old name for 'endpoint'. We now just print an error if it's used.
        .subcommand(
            Command::new("pg")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -7,7 +7,7 @@
 //! ```
 use anyhow::Context;

-use std::path::PathBuf;
+use camino::Utf8PathBuf;

 use crate::{background_process, local_env};

@@ -30,7 +30,7 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
        || {
            let url = broker.client_url();
            let status_url = url.join("status").with_context(|| {
-                format!("Failed to append /status path to broker endpoint {url}",)
+                format!("Failed to append /status path to broker endpoint {url}")
            })?;
            let request = client
                .get(status_url)
@@ -50,6 +50,7 @@ pub fn stop_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    background_process::stop_process(true, "storage_broker", &storage_broker_pid_file_path(env))
 }

-fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
-    env.base_data_dir.join("storage_broker.pid")
+fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> Utf8PathBuf {
+    Utf8PathBuf::from_path_buf(env.base_data_dir.join("storage_broker.pid"))
+        .expect("non-Unicode path")
 }
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -253,7 +253,7 @@ impl Endpoint {
        conf.append("shared_buffers", "1MB");
        conf.append("fsync", "off");
        conf.append("max_connections", "100");
-        conf.append("wal_level", "replica");
+        conf.append("wal_level", "logical");
        // wal_sender_timeout is the maximum time to wait for WAL replication.
        // It also defines how often the walreciever will send a feedback message to the wal sender.
        conf.append("wal_sender_timeout", "5s");
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -14,6 +14,7 @@ use std::process::{Child, Command};
 use std::{io, result};

 use anyhow::{bail, Context};
+use camino::Utf8PathBuf;
 use pageserver_api::models::{self, TenantInfo, TimelineInfo};
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
@@ -144,7 +145,7 @@ impl PageServerNode {
    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        // First, run `pageserver --init` and wait for it to write a config into FS and exit.
        self.pageserver_init(config_overrides)
-            .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id,))
+            .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
    }

    pub fn repo_path(&self) -> PathBuf {
@@ -154,8 +155,9 @@ impl PageServerNode {
    /// The pid file is created by the pageserver process, with its pid stored inside.
    /// Other pageservers cannot lock the same file and overwrite it for as long as the current
    /// pageserver runs. (Unless someone removes the file manually; never do that!)
-    fn pid_file(&self) -> PathBuf {
-        self.repo_path().join("pageserver.pid")
+    fn pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.repo_path().join("pageserver.pid"))
+            .expect("non-Unicode path")
    }

    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -11,6 +11,7 @@ use std::process::Child;
 use std::{io, result};

 use anyhow::Context;
+use camino::Utf8PathBuf;
 use postgres_connection::PgConnectionConfig;
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
@@ -97,8 +98,9 @@ impl SafekeeperNode {
        SafekeeperNode::datadir_path_by_id(&self.env, self.id)
    }

-    pub fn pid_file(&self) -> PathBuf {
-        self.datadir_path().join("safekeeper.pid")
+    pub fn pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.datadir_path().join("safekeeper.pid"))
+            .expect("non-Unicode path")
    }

    pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
--- a/deny.toml
+++ b/deny.toml
@@ -23,7 +23,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = ["RUSTSEC-2023-0052"]
+ignore = []

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -25,7 +25,7 @@
            },
            {
                "name": "wal_level",
-                "value": "replica",
+                "value": "logical",
                "vartype": "enum"
            },
            {
--- a/docs/error-handling.md
+++ b/docs/error-handling.md
@@ -188,11 +188,60 @@ that.

 ## Error message style

+### PostgreSQL extensions
+
 PostgreSQL has a style guide for writing error messages:

 https://www.postgresql.org/docs/current/error-style-guide.html

 Follow that guide when writing error messages in the PostgreSQL
-extension. We don't follow it strictly in the pageserver and
-safekeeper, but the advice in the PostgreSQL style guide is generally
-good, and you can't go wrong by following it.
+extensions.
+
+### Neon Rust code
+
+#### Anyhow Context
+
+When adding anyhow `context()`, use form `present-tense-verb+action`.
+
+Example:
+- Bad: `file.metadata().context("could not get file metadata")?;`
+- Good: `file.metadata().context("get file metadata")?;`
+
+#### Logging Errors
+
+When logging any error `e`, use `could not {e:#}` or `failed to {e:#}`.
+
+If `e` is an `anyhow` error and you want to log the backtrace that it contains,
+use `{e:?}` instead of `{e:#}`.
+
+#### Rationale
+
+The `{:#}` ("alternate Display") of an `anyhow` error chain is concatenation fo the contexts, using `: `.
+
+For example, the following Rust code will result in output
+```
+ERROR  failed to list users: load users from server: parse response: invalid json
+```
+
+This is more concise / less noisy than what happens if you do `.context("could not ...")?` at each level, i.e.:
+
+```
+ERROR  could not list users: could not load users from server: could not parse response: invalid json
+```
+
+
+```rust
+fn main() {
+  match list_users().context("list users") else {
+    Ok(_) => ...,
+    Err(e) => tracing::error!("failed to {e:#}"),
+  }
+}
+fn list_users() {
+  http_get_users().context("load users from server")?;
+}
+fn http_get_users() {
+  let response = client....?;
+  response.parse().context("parse response")?; // fails with serde error "invalid json"
+}
+```
--- a/docs/pageserver-services.md
+++ b/docs/pageserver-services.md
@@ -96,6 +96,16 @@ prefix_in_bucket = '/test_prefix/'

 `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.

+or
+
+```toml
+[remote_storage]
+container_name = 'some-container-name'
+container_region = 'us-east'
+prefix_in_container = '/test-prefix/'
+```
+
+`AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` env variables can be used to specify the azure credentials if needed.

 ## Repository background tasks

--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -200,6 +200,12 @@ pub struct Database {
    pub name: PgIdent,
    pub owner: PgIdent,
    pub options: GenericOptions,
+    // These are derived flags, not present in the spec file.
+    // They are never set by the control plane.
+    #[serde(skip_deserializing, default)]
+    pub restrict_conn: bool,
+    #[serde(skip_deserializing, default)]
+    pub invalid: bool,
 }

 /// Common type representing both SQL statement params with or without value,
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -76,7 +76,7 @@
            },
            {
                "name": "wal_level",
-                "value": "replica",
+                "value": "logical",
                "vartype": "enum"
            },
            {
--- a/libs/metrics/src/wrappers.rs
+++ b/libs/metrics/src/wrappers.rs
@@ -1,6 +1,6 @@
 use std::io::{Read, Result, Write};

-/// A wrapper for an object implementing [Read](std::io::Read)
+/// A wrapper for an object implementing [Read]
 /// which allows a closure to observe the amount of bytes read.
 /// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
 ///
@@ -51,17 +51,17 @@ impl<'a, T> CountedReader<'a, T> {
        }
    }

-    /// Get an immutable reference to the underlying [Read](std::io::Read) implementor
+    /// Get an immutable reference to the underlying [Read] implementor
    pub fn inner(&self) -> &T {
        &self.reader
    }

-    /// Get a mutable reference to the underlying [Read](std::io::Read) implementor
+    /// Get a mutable reference to the underlying [Read] implementor
    pub fn inner_mut(&mut self) -> &mut T {
        &mut self.reader
    }

-    /// Consume the wrapper and return the underlying [Read](std::io::Read) implementor
+    /// Consume the wrapper and return the underlying [Read] implementor
    pub fn into_inner(self) -> T {
        self.reader
    }
@@ -75,7 +75,7 @@ impl<T: Read> Read for CountedReader<'_, T> {
    }
 }

-/// A wrapper for an object implementing [Write](std::io::Write)
+/// A wrapper for an object implementing [Write]
 /// which allows a closure to observe the amount of bytes written.
 /// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
 ///
@@ -122,17 +122,17 @@ impl<'a, T> CountedWriter<'a, T> {
        }
    }

-    /// Get an immutable reference to the underlying [Write](std::io::Write) implementor
+    /// Get an immutable reference to the underlying [Write] implementor
    pub fn inner(&self) -> &T {
        &self.writer
    }

-    /// Get a mutable reference to the underlying [Write](std::io::Write) implementor
+    /// Get a mutable reference to the underlying [Write] implementor
    pub fn inner_mut(&mut self) -> &mut T {
        &mut self.writer
    }

-    /// Consume the wrapper and return the underlying [Write](std::io::Write) implementor
+    /// Consume the wrapper and return the underlying [Write] implementor
    pub fn into_inner(self) -> T {
        self.writer
    }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -10,6 +10,7 @@ use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
    completion,
+    generation::Generation,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
@@ -218,6 +219,8 @@ impl std::ops::Deref for TenantCreateRequest {
    }
 }

+/// An alternative representation of `pageserver::tenant::TenantConf` with
+/// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default)]
 pub struct TenantConfig {
    pub checkpoint_distance: Option<u64>,
@@ -243,6 +246,39 @@ pub struct TenantConfig {
    pub gc_feedback: Option<bool>,
 }

+/// A flattened analog of a `pagesever::tenant::LocationMode`, which
+/// lists out all possible states (and the virtual "Detached" state)
+/// in a flat form rather than using rust-style enums.
+#[derive(Serialize, Deserialize, Debug)]
+pub enum LocationConfigMode {
+    AttachedSingle,
+    AttachedMulti,
+    AttachedStale,
+    Secondary,
+    Detached,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct LocationConfigSecondary {
+    pub warm: bool,
+}
+
+/// An alternative representation of `pageserver::tenant::LocationConf`,
+/// for use in external-facing APIs.
+#[derive(Serialize, Deserialize, Debug)]
+pub struct LocationConfig {
+    pub mode: LocationConfigMode,
+    /// If attaching, in what generation?
+    #[serde(default)]
+    pub generation: Option<Generation>,
+    #[serde(default)]
+    pub secondary_conf: Option<LocationConfigSecondary>,
+
+    // If requesting mode `Secondary`, configuration for that.
+    // Custom storage configuration for the tenant, if any
+    pub tenant_conf: TenantConfig,
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
@@ -253,6 +289,16 @@ pub struct StatusResponse {
    pub id: NodeId,
 }

+#[serde_as]
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantLocationConfigRequest {
+    #[serde_as(as = "DisplayFromStr")]
+    pub tenant_id: TenantId,
+    #[serde(flatten)]
+    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -19,8 +19,8 @@ use tracing::{debug, error, info, trace};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
 use pq_proto::{
-    BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR,
-    SQLSTATE_SUCCESSFUL_COMPLETION,
+    BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_ADMIN_SHUTDOWN,
+    SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION,
 };

 /// An error, occurred during query processing:
@@ -30,6 +30,9 @@ pub enum QueryError {
    /// The connection was lost while processing the query.
    #[error(transparent)]
    Disconnected(#[from] ConnectionError),
+    /// We were instructed to shutdown while processing the query
+    #[error("Shutting down")]
+    Shutdown,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
@@ -44,7 +47,8 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) => b"08006",         // connection failure
+            Self::Disconnected(_) => b"08006", // connection failure
+            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -396,7 +400,20 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        // socket might be already closed, e.g. if previously received error,
        // so ignore result.
        self.framed.shutdown().await.ok();
-        ret
+        match ret {
+            Ok(()) => Ok(()),
+            Err(QueryError::Shutdown) => {
+                info!("Stopped due to shutdown");
+                Ok(())
+            }
+            Err(QueryError::Disconnected(e)) => {
+                info!("Disconnected ({e:#})");
+                // Disconnection is not an error: we just use it that way internally to drop
+                // out of loops.
+                Ok(())
+            }
+            e => e,
+        }
    }

    async fn run_message_loop<F, S>(
@@ -416,15 +433,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received during handshake");
-                return Ok(())
+                return Err(QueryError::Shutdown)
            },

-            result = self.handshake(handler) => {
-                // Handshake complete.
-                result?;
-                if self.state == ProtoState::Closed {
-                    return Ok(()); // EOF during handshake
-                }
+            handshake_r = self.handshake(handler) => {
+                handshake_r?;
            }
        );

@@ -435,17 +448,34 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            _ = shutdown_watcher() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received in run_message_loop");
-                Ok(None)
+                return Err(QueryError::Shutdown)
            },
            msg = self.read_message() => { msg },
        )? {
            trace!("got message {:?}", msg);

            let result = self.process_message(handler, msg, &mut query_string).await;
-            self.flush().await?;
+            tokio::select!(
+                biased;
+                _ = shutdown_watcher() => {
+                    // We were requested to shut down.
+                    tracing::info!("shutdown request received during response flush");
+
+                    // If we exited process_message with a shutdown error, there may be
+                    // some valid response content on in our transmit buffer: permit sending
+                    // this within a short timeout.  This is a best effort thing so we don't
+                    // care about the result.
+                    tokio::time::timeout(std::time::Duration::from_millis(500), self.flush()).await.ok();
+
+                    return Err(QueryError::Shutdown)
+                },
+                flush_r = self.flush() => {
+                    flush_r?;
+                }
+            );
+
            match result? {
                ProcessMsgResult::Continue => {
-                    self.flush().await?;
                    continue;
                }
                ProcessMsgResult::Break => break,
@@ -550,7 +580,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                        self.peer_addr
                    );
                    self.state = ProtoState::Closed;
-                    return Ok(());
+                    return Err(QueryError::Disconnected(ConnectionError::Protocol(
+                        ProtocolError::Protocol("EOF during handshake".to_string()),
+                    )));
                }
            }
        }
@@ -589,7 +621,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                        self.peer_addr
                    );
                    self.state = ProtoState::Closed;
-                    return Ok(());
+                    return Err(QueryError::Disconnected(ConnectionError::Protocol(
+                        ProtocolError::Protocol("EOF during auth".to_string()),
+                    )));
                }
            }
        }
@@ -913,6 +947,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
 pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
+        QueryError::Shutdown => "shutdown".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
    }
 }
@@ -929,6 +964,9 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::Disconnected(other_connection_error) => {
            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
        }
+        QueryError::Shutdown => {
+            info!("query handler for '{query}' cancelled during tenant shutdown")
+        }
        QueryError::Other(e) => {
            error!("query handler for '{query}' failed: {e:?}");
        }
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -131,6 +131,7 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;

 // Export some version independent functions that are used outside of this mod
 pub use v14::xlog_utils::encode_logical_message;
+pub use v14::xlog_utils::from_pg_timestamp;
 pub use v14::xlog_utils::get_current_timestamp;
 pub use v14::xlog_utils::to_pg_timestamp;
 pub use v14::xlog_utils::XLogFileName;
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -220,6 +220,10 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

+/* From replication/slot.h */
+pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4  /* offset of `slotdata` in ReplicationSlotOnDisk  */
+   + 64 /* NameData */  + 4*4;
+
 /* From fsm_internals.h */
 const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4;
 const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1;
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -136,21 +136,42 @@ pub fn get_current_timestamp() -> TimestampTz {
    to_pg_timestamp(SystemTime::now())
 }

-pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
-    const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
-    const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
+// Module to reduce the scope of the constants
+mod timestamp_conversions {
+    use std::time::Duration;
+
+    use super::*;
+
+    const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
+    const POSTGRES_EPOCH_JDATE: u64 = 2451545; // == date2j(2000, 1, 1)
    const SECS_PER_DAY: u64 = 86400;
    const USECS_PER_SEC: u64 = 1000000;
-    match time.duration_since(SystemTime::UNIX_EPOCH) {
-        Ok(n) => {
-            ((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
-                * USECS_PER_SEC
-                + n.subsec_micros() as u64) as i64
+    const SECS_DIFF_UNIX_TO_POSTGRES_EPOCH: u64 =
+        (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY;
+
+    pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
+        match time.duration_since(SystemTime::UNIX_EPOCH) {
+            Ok(n) => {
+                ((n.as_secs() - SECS_DIFF_UNIX_TO_POSTGRES_EPOCH) * USECS_PER_SEC
+                    + n.subsec_micros() as u64) as i64
+            }
+            Err(_) => panic!("SystemTime before UNIX EPOCH!"),
        }
-        Err(_) => panic!("SystemTime before UNIX EPOCH!"),
+    }
+
+    pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
+        let time: u64 = time
+            .try_into()
+            .expect("timestamp before millenium (postgres epoch)");
+        let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
+        SystemTime::UNIX_EPOCH
+            .checked_add(Duration::from_micros(since_unix_epoch))
+            .expect("SystemTime overflow")
    }
 }

+pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
+
 // Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
 // start_lsn must point to some previously known record boundary (beginning of
 // the next record). If no valid record after is found, start_lsn is returned
@@ -481,4 +502,24 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
    wal
 }

-// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ts_conversion() {
+        let now = SystemTime::now();
+        let round_trip = from_pg_timestamp(to_pg_timestamp(now));
+
+        let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
+        let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
+        assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
+
+        let now_pg = get_current_timestamp();
+        let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
+
+        assert_eq!(now_pg, round_trip_pg);
+    }
+
+    // If you need to craft WAL and write tests for this module, put it at wal_craft crate.
+}
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -12,7 +12,7 @@ log.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
 postgres_ffi.workspace = true
-tempfile.workspace = true
+camino-tempfile.workspace = true

 workspace_hack.workspace = true

--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -1,4 +1,5 @@
 use anyhow::{bail, ensure};
+use camino_tempfile::{tempdir, Utf8TempDir};
 use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
@@ -8,7 +9,6 @@ use std::cmp::Ordering;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
-use tempfile::{tempdir, TempDir};

 macro_rules! xlog_utils_test {
    ($version:ident) => {
@@ -33,7 +33,7 @@ pub struct Conf {

 pub struct PostgresServer {
    process: std::process::Child,
-    _unix_socket_dir: TempDir,
+    _unix_socket_dir: Utf8TempDir,
    client_config: postgres::Config,
 }

--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -670,6 +670,7 @@ pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
 }

 pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
+pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01";
 pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";

 impl<'a> BeMessage<'a> {
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,6 +13,8 @@ aws-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
+bytes.workspace = true
+camino.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 serde.workspace = true
 serde_json.workspace = true
@@ -25,8 +27,15 @@ metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
 workspace_hack.workspace = true
+azure_core.workspace = true
+azure_identity.workspace = true
+azure_storage.workspace = true
+azure_storage_blobs.workspace = true
+futures-util.workspace = true
+http-types.workspace = true
+itertools.workspace = true

 [dev-dependencies]
-tempfile.workspace = true
+camino-tempfile.workspace = true
 test-context.workspace = true
 rand.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -0,0 +1,356 @@
+//! Azure Blob Storage wrapper
+
+use std::env;
+use std::num::NonZeroU32;
+use std::sync::Arc;
+use std::{borrow::Cow, collections::HashMap, io::Cursor};
+
+use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
+use anyhow::Result;
+use azure_core::request_options::{MaxResults, Metadata, Range};
+use azure_core::Header;
+use azure_identity::DefaultAzureCredential;
+use azure_storage::StorageCredentials;
+use azure_storage_blobs::prelude::ClientBuilder;
+use azure_storage_blobs::{
+    blob::operations::GetBlobBuilder,
+    prelude::{BlobClient, ContainerClient},
+};
+use futures_util::StreamExt;
+use http_types::StatusCode;
+use tokio::io::AsyncRead;
+use tracing::debug;
+
+use crate::s3_bucket::RequestKind;
+use crate::{
+    AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage,
+    StorageMetadata,
+};
+
+pub struct AzureBlobStorage {
+    client: ContainerClient,
+    prefix_in_container: Option<String>,
+    max_keys_per_list_response: Option<NonZeroU32>,
+    concurrency_limiter: ConcurrencyLimiter,
+}
+
+impl AzureBlobStorage {
+    pub fn new(azure_config: &AzureConfig) -> Result<Self> {
+        debug!(
+            "Creating azure remote storage for azure container {}",
+            azure_config.container_name
+        );
+
+        let account = env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT");
+
+        // If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that,
+        // otherwise try the token based credentials.
+        let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
+            StorageCredentials::access_key(account.clone(), access_key)
+        } else {
+            let token_credential = DefaultAzureCredential::default();
+            StorageCredentials::token_credential(Arc::new(token_credential))
+        };
+
+        let builder = ClientBuilder::new(account, credentials);
+
+        let client = builder.container_client(azure_config.container_name.to_owned());
+
+        let max_keys_per_list_response =
+            if let Some(limit) = azure_config.max_keys_per_list_response {
+                Some(
+                    NonZeroU32::new(limit as u32)
+                        .ok_or_else(|| anyhow::anyhow!("max_keys_per_list_response can't be 0"))?,
+                )
+            } else {
+                None
+            };
+
+        Ok(AzureBlobStorage {
+            client,
+            prefix_in_container: azure_config.prefix_in_container.to_owned(),
+            max_keys_per_list_response,
+            concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
+        })
+    }
+
+    pub fn relative_path_to_name(&self, path: &RemotePath) -> String {
+        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let path_string = path
+            .get_path()
+            .as_str()
+            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        match &self.prefix_in_container {
+            Some(prefix) => {
+                if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                    prefix.clone() + path_string
+                } else {
+                    format!("{prefix}{REMOTE_STORAGE_PREFIX_SEPARATOR}{path_string}")
+                }
+            }
+            None => path_string.to_string(),
+        }
+    }
+
+    fn name_to_relative_path(&self, key: &str) -> RemotePath {
+        let relative_path =
+            match key.strip_prefix(self.prefix_in_container.as_deref().unwrap_or_default()) {
+                Some(stripped) => stripped,
+                // we rely on Azure to return properly prefixed paths
+                // for requests with a certain prefix
+                None => panic!(
+                    "Key {key} does not start with container prefix {:?}",
+                    self.prefix_in_container
+                ),
+            };
+        RemotePath(
+            relative_path
+                .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                .collect(),
+        )
+    }
+
+    async fn download_for_builder(
+        &self,
+        metadata: StorageMetadata,
+        builder: GetBlobBuilder,
+    ) -> Result<Download, DownloadError> {
+        let mut response = builder.into_stream();
+
+        // TODO give proper streaming response instead of buffering into RAM
+        // https://github.com/neondatabase/neon/issues/5563
+        let mut buf = Vec::new();
+        while let Some(part) = response.next().await {
+            let part = part.map_err(to_download_error)?;
+            let data = part
+                .data
+                .collect()
+                .await
+                .map_err(|e| DownloadError::Other(e.into()))?;
+            buf.extend_from_slice(&data.slice(..));
+        }
+        Ok(Download {
+            download_stream: Box::pin(Cursor::new(buf)),
+            metadata: Some(metadata),
+        })
+    }
+    // TODO get rid of this function once we have metadata included in the response
+    // https://github.com/Azure/azure-sdk-for-rust/issues/1439
+    async fn get_metadata(
+        &self,
+        blob_client: &BlobClient,
+    ) -> Result<StorageMetadata, DownloadError> {
+        let builder = blob_client.get_metadata();
+
+        let response = builder.into_future().await.map_err(to_download_error)?;
+        let mut map = HashMap::new();
+
+        for md in response.metadata.iter() {
+            map.insert(
+                md.name().as_str().to_string(),
+                md.value().as_str().to_string(),
+            );
+        }
+        Ok(StorageMetadata(map))
+    }
+
+    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
+        self.concurrency_limiter
+            .acquire(kind)
+            .await
+            .expect("semaphore is never closed")
+    }
+}
+
+fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
+    let mut res = Metadata::new();
+    for (k, v) in metadata.0.into_iter() {
+        res.insert(k, v);
+    }
+    res
+}
+
+fn to_download_error(error: azure_core::Error) -> DownloadError {
+    if let Some(http_err) = error.as_http_error() {
+        match http_err.status() {
+            StatusCode::NotFound => DownloadError::NotFound,
+            StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
+            _ => DownloadError::Other(anyhow::Error::new(error)),
+        }
+    } else {
+        DownloadError::Other(error.into())
+    }
+}
+
+#[async_trait::async_trait]
+impl RemoteStorage for AzureBlobStorage {
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        // get the passed prefix or if it is not set use prefix_in_bucket value
+        let list_prefix = prefix
+            .map(|p| self.relative_path_to_name(p))
+            .or_else(|| self.prefix_in_container.clone())
+            .map(|mut p| {
+                // required to end with a separator
+                // otherwise request will return only the entry of a prefix
+                if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                }
+                p
+            });
+
+        let mut builder = self
+            .client
+            .list_blobs()
+            .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+
+        if let Some(prefix) = list_prefix {
+            builder = builder.prefix(Cow::from(prefix.to_owned()));
+        }
+
+        if let Some(limit) = self.max_keys_per_list_response {
+            builder = builder.max_results(MaxResults::new(limit));
+        }
+
+        let mut response = builder.into_stream();
+        let mut res = Vec::new();
+        while let Some(entry) = response.next().await {
+            let entry = entry.map_err(to_download_error)?;
+            let name_iter = entry
+                .blobs
+                .prefixes()
+                .map(|prefix| self.name_to_relative_path(&prefix.name));
+            res.extend(name_iter);
+        }
+        Ok(res)
+    }
+
+    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let folder_name = folder
+            .map(|p| self.relative_path_to_name(p))
+            .or_else(|| self.prefix_in_container.clone());
+
+        let mut builder = self.client.list_blobs();
+
+        if let Some(folder_name) = folder_name {
+            builder = builder.prefix(Cow::from(folder_name.to_owned()));
+        }
+
+        if let Some(limit) = self.max_keys_per_list_response {
+            builder = builder.max_results(MaxResults::new(limit));
+        }
+
+        let mut response = builder.into_stream();
+        let mut res = Vec::new();
+        while let Some(l) = response.next().await {
+            let entry = l.map_err(anyhow::Error::new)?;
+            let name_iter = entry
+                .blobs
+                .blobs()
+                .map(|bl| self.name_to_relative_path(&bl.name));
+            res.extend(name_iter);
+        }
+        Ok(res)
+    }
+
+    async fn upload(
+        &self,
+        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
+        data_size_bytes: usize,
+        to: &RemotePath,
+        metadata: Option<StorageMetadata>,
+    ) -> anyhow::Result<()> {
+        let _permit = self.permit(RequestKind::Put).await;
+        let blob_client = self.client.blob_client(self.relative_path_to_name(to));
+
+        // TODO FIX THIS UGLY HACK and don't buffer the entire object
+        // into RAM here, but use the streaming interface. For that,
+        // we'd have to change the interface though...
+        // https://github.com/neondatabase/neon/issues/5563
+        let mut buf = Vec::with_capacity(data_size_bytes);
+        tokio::io::copy(&mut from, &mut buf).await?;
+        let body = azure_core::Body::Bytes(buf.into());
+
+        let mut builder = blob_client.put_block_blob(body);
+
+        if let Some(metadata) = metadata {
+            builder = builder.metadata(to_azure_metadata(metadata));
+        }
+
+        let _response = builder.into_future().await?;
+
+        Ok(())
+    }
+
+    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+        let _permit = self.permit(RequestKind::Get).await;
+        let blob_client = self.client.blob_client(self.relative_path_to_name(from));
+
+        let metadata = self.get_metadata(&blob_client).await?;
+
+        let builder = blob_client.get();
+
+        self.download_for_builder(metadata, builder).await
+    }
+
+    async fn download_byte_range(
+        &self,
+        from: &RemotePath,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+    ) -> Result<Download, DownloadError> {
+        let _permit = self.permit(RequestKind::Get).await;
+        let blob_client = self.client.blob_client(self.relative_path_to_name(from));
+
+        let metadata = self.get_metadata(&blob_client).await?;
+
+        let mut builder = blob_client.get();
+
+        if let Some(end_exclusive) = end_exclusive {
+            builder = builder.range(Range::new(start_inclusive, end_exclusive));
+        } else {
+            // Open ranges are not supported by the SDK so we work around
+            // by setting the upper limit extremely high (but high enough
+            // to still be representable by signed 64 bit integers).
+            // TODO remove workaround once the SDK adds open range support
+            // https://github.com/Azure/azure-sdk-for-rust/issues/1438
+            let end_exclusive = u64::MAX / 4;
+            builder = builder.range(Range::new(start_inclusive, end_exclusive));
+        }
+
+        self.download_for_builder(metadata, builder).await
+    }
+
+    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+        let _permit = self.permit(RequestKind::Delete).await;
+        let blob_client = self.client.blob_client(self.relative_path_to_name(path));
+
+        let builder = blob_client.delete();
+
+        match builder.into_future().await {
+            Ok(_response) => Ok(()),
+            Err(e) => {
+                if let Some(http_err) = e.as_http_error() {
+                    if http_err.status() == StatusCode::NotFound {
+                        return Ok(());
+                    }
+                }
+                Err(anyhow::Error::new(e))
+            }
+        }
+    }
+
+    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+        // Permit is already obtained by inner delete function
+
+        // TODO batch requests are also not supported by the SDK
+        // https://github.com/Azure/azure-sdk-for-rust/issues/1068
+        // https://github.com/Azure/azure-sdk-for-rust/issues/1249
+        for path in paths {
+            self.delete(path).await?;
+        }
+        Ok(())
+    }
+}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -4,7 +4,10 @@
 //! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
 //!   * [`local_fs`] allows to use local file system as an external storage
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
+//!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
+
+mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;
@@ -13,19 +16,23 @@ use std::{
    collections::HashMap,
    fmt::Debug,
    num::{NonZeroU32, NonZeroUsize},
-    path::{Path, PathBuf},
    pin::Pin,
    sync::Arc,
 };

 use anyhow::{bail, Context};
+use camino::{Utf8Path, Utf8PathBuf};

 use serde::{Deserialize, Serialize};
-use tokio::io;
+use tokio::{io, sync::Semaphore};
 use toml_edit::Item;
 use tracing::info;

-pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};
+pub use self::{
+    azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket,
+    simulate_failures::UnreliableWrapper,
+};
+use s3_bucket::RequestKind;

 /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
 /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -39,6 +46,11 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 /// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
+/// We set this a little bit low as we currently buffer the entire file into RAM
+///
+/// Here, a limit of max 20k concurrent connections was noted.
+/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
+pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
@@ -52,7 +64,7 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 /// The prefix is an implementation detail, that allows representing local paths
 /// as the remote ones, stripping the local storage prefix away.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct RemotePath(PathBuf);
+pub struct RemotePath(Utf8PathBuf);

 impl Serialize for RemotePath {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
@@ -69,18 +81,18 @@ impl<'de> Deserialize<'de> for RemotePath {
        D: serde::Deserializer<'de>,
    {
        let str = String::deserialize(deserializer)?;
-        Ok(Self(PathBuf::from(&str)))
+        Ok(Self(Utf8PathBuf::from(&str)))
    }
 }

 impl std::fmt::Display for RemotePath {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0.display())
+        std::fmt::Display::fmt(&self.0, f)
    }
 }

 impl RemotePath {
-    pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
+    pub fn new(relative_path: &Utf8Path) -> anyhow::Result<Self> {
        anyhow::ensure!(
            relative_path.is_relative(),
            "Path {relative_path:?} is not relative"
@@ -89,30 +101,30 @@ impl RemotePath {
    }

    pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
-        Self::new(Path::new(relative_path))
+        Self::new(Utf8Path::new(relative_path))
    }

-    pub fn with_base(&self, base_path: &Path) -> PathBuf {
+    pub fn with_base(&self, base_path: &Utf8Path) -> Utf8PathBuf {
        base_path.join(&self.0)
    }

    pub fn object_name(&self) -> Option<&str> {
-        self.0.file_name().and_then(|os_str| os_str.to_str())
+        self.0.file_name()
    }

-    pub fn join(&self, segment: &Path) -> Self {
+    pub fn join(&self, segment: &Utf8Path) -> Self {
        Self(self.0.join(segment))
    }

-    pub fn get_path(&self) -> &PathBuf {
+    pub fn get_path(&self) -> &Utf8PathBuf {
        &self.0
    }

    pub fn extension(&self) -> Option<&str> {
-        self.0.extension()?.to_str()
+        self.0.extension()
    }

-    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> {
+    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
        self.0.strip_prefix(&p.0)
    }
 }
@@ -217,6 +229,7 @@ impl std::error::Error for DownloadError {}
 pub enum GenericRemoteStorage {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
+    AzureBlob(Arc<AzureBlobStorage>),
    Unreliable(Arc<UnreliableWrapper>),
 }

@@ -228,6 +241,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.list_files(folder).await,
            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::AzureBlob(s) => s.list_files(folder).await,
            Self::Unreliable(s) => s.list_files(folder).await,
        }
    }
@@ -242,6 +256,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.list_prefixes(prefix).await,
            Self::AwsS3(s) => s.list_prefixes(prefix).await,
+            Self::AzureBlob(s) => s.list_prefixes(prefix).await,
            Self::Unreliable(s) => s.list_prefixes(prefix).await,
        }
    }
@@ -256,6 +271,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
            Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
+            Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await,
            Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
        }
    }
@@ -264,6 +280,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.download(from).await,
            Self::AwsS3(s) => s.download(from).await,
+            Self::AzureBlob(s) => s.download(from).await,
            Self::Unreliable(s) => s.download(from).await,
        }
    }
@@ -283,6 +300,10 @@ impl GenericRemoteStorage {
                s.download_byte_range(from, start_inclusive, end_exclusive)
                    .await
            }
+            Self::AzureBlob(s) => {
+                s.download_byte_range(from, start_inclusive, end_exclusive)
+                    .await
+            }
            Self::Unreliable(s) => {
                s.download_byte_range(from, start_inclusive, end_exclusive)
                    .await
@@ -294,6 +315,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.delete(path).await,
            Self::AwsS3(s) => s.delete(path).await,
+            Self::AzureBlob(s) => s.delete(path).await,
            Self::Unreliable(s) => s.delete(path).await,
        }
    }
@@ -302,6 +324,7 @@ impl GenericRemoteStorage {
        match self {
            Self::LocalFs(s) => s.delete_objects(paths).await,
            Self::AwsS3(s) => s.delete_objects(paths).await,
+            Self::AzureBlob(s) => s.delete_objects(paths).await,
            Self::Unreliable(s) => s.delete_objects(paths).await,
        }
    }
@@ -311,7 +334,7 @@ impl GenericRemoteStorage {
    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
        Ok(match &storage_config.storage {
            RemoteStorageKind::LocalFs(root) => {
-                info!("Using fs root '{}' as a remote storage", root.display());
+                info!("Using fs root '{root}' as a remote storage");
                Self::LocalFs(LocalFs::new(root.clone())?)
            }
            RemoteStorageKind::AwsS3(s3_config) => {
@@ -319,6 +342,11 @@ impl GenericRemoteStorage {
                      s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
                Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
            }
+            RemoteStorageKind::AzureContainer(azure_config) => {
+                info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
+                      azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
+                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?))
+            }
        })
    }

@@ -379,10 +407,13 @@ pub struct RemoteStorageConfig {
 pub enum RemoteStorageKind {
    /// Storage based on local file system.
    /// Specify a root folder to place all stored files into.
-    LocalFs(PathBuf),
+    LocalFs(Utf8PathBuf),
    /// AWS S3 based storage, storing all files in the S3 bucket
    /// specified by the config
    AwsS3(S3Config),
+    /// Azure Blob based storage, storing all files in the container
+    /// specified by the config
+    AzureContainer(AzureConfig),
 }

 /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
@@ -422,11 +453,45 @@ impl Debug for S3Config {
    }
 }

+/// Azure  bucket coordinates and access credentials to manage the bucket contents (read and write).
+#[derive(Clone, PartialEq, Eq)]
+pub struct AzureConfig {
+    /// Name of the container to connect to.
+    pub container_name: String,
+    /// The region where the bucket is located at.
+    pub container_region: String,
+    /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
+    pub prefix_in_container: Option<String>,
+    /// Azure has various limits on its API calls, we need not to exceed those.
+    /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
+    pub concurrency_limit: NonZeroUsize,
+    pub max_keys_per_list_response: Option<i32>,
+}
+
+impl Debug for AzureConfig {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("AzureConfig")
+            .field("bucket_name", &self.container_name)
+            .field("bucket_region", &self.container_region)
+            .field("prefix_in_bucket", &self.prefix_in_container)
+            .field("concurrency_limit", &self.concurrency_limit)
+            .field(
+                "max_keys_per_list_response",
+                &self.max_keys_per_list_response,
+            )
+            .finish()
+    }
+}
+
 impl RemoteStorageConfig {
    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
        let local_path = toml.get("local_path");
        let bucket_name = toml.get("bucket_name");
        let bucket_region = toml.get("bucket_region");
+        let container_name = toml.get("container_name");
+        let container_region = toml.get("container_region");
+
+        let use_azure = container_name.is_some() && container_region.is_some();

        let max_concurrent_syncs = NonZeroUsize::new(
            parse_optional_integer("max_concurrent_syncs", toml)?
@@ -440,9 +505,13 @@ impl RemoteStorageConfig {
        )
        .context("Failed to parse 'max_sync_errors' as a positive integer")?;

+        let default_concurrency_limit = if use_azure {
+            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
+        } else {
+            DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
+        };
        let concurrency_limit = NonZeroUsize::new(
-            parse_optional_integer("concurrency_limit", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
+            parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit),
        )
        .context("Failed to parse 'concurrency_limit' as a positive integer")?;

@@ -451,33 +520,70 @@ impl RemoteStorageConfig {
                .context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
                .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);

-        let storage = match (local_path, bucket_name, bucket_region) {
+        let endpoint = toml
+            .get("endpoint")
+            .map(|endpoint| parse_toml_string("endpoint", endpoint))
+            .transpose()?;
+
+        let storage = match (
+            local_path,
+            bucket_name,
+            bucket_region,
+            container_name,
+            container_region,
+        ) {
            // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
-            (None, None, None) => return Ok(None),
-            (_, Some(_), None) => {
+            (None, None, None, None, None) => return Ok(None),
+            (_, Some(_), None, ..) => {
                bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
            }
-            (_, None, Some(_)) => {
+            (_, None, Some(_), ..) => {
                bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
            }
-            (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
-                bucket_name: parse_toml_string("bucket_name", bucket_name)?,
-                bucket_region: parse_toml_string("bucket_region", bucket_region)?,
-                prefix_in_bucket: toml
-                    .get("prefix_in_bucket")
-                    .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
-                    .transpose()?,
-                endpoint: toml
-                    .get("endpoint")
-                    .map(|endpoint| parse_toml_string("endpoint", endpoint))
-                    .transpose()?,
-                concurrency_limit,
-                max_keys_per_list_response,
-            }),
-            (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
-                parse_toml_string("local_path", local_path)?,
-            )),
-            (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
+            (None, Some(bucket_name), Some(bucket_region), ..) => {
+                RemoteStorageKind::AwsS3(S3Config {
+                    bucket_name: parse_toml_string("bucket_name", bucket_name)?,
+                    bucket_region: parse_toml_string("bucket_region", bucket_region)?,
+                    prefix_in_bucket: toml
+                        .get("prefix_in_bucket")
+                        .map(|prefix_in_bucket| {
+                            parse_toml_string("prefix_in_bucket", prefix_in_bucket)
+                        })
+                        .transpose()?,
+                    endpoint,
+                    concurrency_limit,
+                    max_keys_per_list_response,
+                })
+            }
+            (_, _, _, Some(_), None) => {
+                bail!("'container_name' option is mandatory if 'container_region' is given ")
+            }
+            (_, _, _, None, Some(_)) => {
+                bail!("'container_name' option is mandatory if 'container_region' is given ")
+            }
+            (None, None, None, Some(container_name), Some(container_region)) => {
+                RemoteStorageKind::AzureContainer(AzureConfig {
+                    container_name: parse_toml_string("container_name", container_name)?,
+                    container_region: parse_toml_string("container_region", container_region)?,
+                    prefix_in_container: toml
+                        .get("prefix_in_container")
+                        .map(|prefix_in_container| {
+                            parse_toml_string("prefix_in_container", prefix_in_container)
+                        })
+                        .transpose()?,
+                    concurrency_limit,
+                    max_keys_per_list_response,
+                })
+            }
+            (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs(
+                Utf8PathBuf::from(parse_toml_string("local_path", local_path)?),
+            ),
+            (Some(_), Some(_), ..) => {
+                bail!("'local_path' and 'bucket_name' are mutually exclusive")
+            }
+            (Some(_), _, _, Some(_), Some(_)) => {
+                bail!("local_path and 'container_name' are mutually exclusive")
+            }
        };

        Ok(Some(RemoteStorageConfig {
@@ -513,29 +619,69 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
    Ok(s.to_string())
 }

+struct ConcurrencyLimiter {
+    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
+    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
+    // The helps to ensure we don't exceed the thresholds.
+    write: Arc<Semaphore>,
+    read: Arc<Semaphore>,
+}
+
+impl ConcurrencyLimiter {
+    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
+        match kind {
+            RequestKind::Get => &self.read,
+            RequestKind::Put => &self.write,
+            RequestKind::List => &self.read,
+            RequestKind::Delete => &self.write,
+        }
+    }
+
+    async fn acquire(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
+        self.for_kind(kind).acquire().await
+    }
+
+    async fn acquire_owned(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
+        Arc::clone(self.for_kind(kind)).acquire_owned().await
+    }
+
+    fn new(limit: usize) -> ConcurrencyLimiter {
+        Self {
+            read: Arc::new(Semaphore::new(limit)),
+            write: Arc::new(Semaphore::new(limit)),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;

    #[test]
    fn test_object_name() {
-        let k = RemotePath::new(Path::new("a/b/c")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap();
        assert_eq!(k.object_name(), Some("c"));

-        let k = RemotePath::new(Path::new("a/b/c/")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("a/b/c/")).unwrap();
        assert_eq!(k.object_name(), Some("c"));

-        let k = RemotePath::new(Path::new("a/")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("a/")).unwrap();
        assert_eq!(k.object_name(), Some("a"));

        // XXX is it impossible to have an empty key?
-        let k = RemotePath::new(Path::new("")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("")).unwrap();
        assert_eq!(k.object_name(), None);
    }

    #[test]
    fn rempte_path_cannot_be_created_from_absolute_ones() {
-        let err = RemotePath::new(Path::new("/")).expect_err("Should fail on absolute paths");
+        let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
        assert_eq!(err.to_string(), "Path \"/\" is not relative");
    }
 }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -4,15 +4,10 @@
 //! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.

-use std::{
-    borrow::Cow,
-    future::Future,
-    io::ErrorKind,
-    path::{Path, PathBuf},
-    pin::Pin,
-};
+use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};

 use anyhow::{bail, ensure, Context};
+use camino::{Utf8Path, Utf8PathBuf};
 use tokio::{
    fs,
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
@@ -28,20 +23,20 @@ const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";

 #[derive(Debug, Clone)]
 pub struct LocalFs {
-    storage_root: PathBuf,
+    storage_root: Utf8PathBuf,
 }

 impl LocalFs {
    /// Attempts to create local FS storage, along with its root directory.
    /// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative).
-    pub fn new(mut storage_root: PathBuf) -> anyhow::Result<Self> {
+    pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result<Self> {
        if !storage_root.exists() {
            std::fs::create_dir_all(&storage_root).with_context(|| {
                format!("Failed to create all directories in the given root path {storage_root:?}")
            })?;
        }
        if !storage_root.is_absolute() {
-            storage_root = storage_root.canonicalize().with_context(|| {
+            storage_root = storage_root.canonicalize_utf8().with_context(|| {
                format!("Failed to represent path {storage_root:?} as an absolute path")
            })?;
        }
@@ -50,7 +45,7 @@ impl LocalFs {
    }

    // mirrors S3Bucket::s3_object_to_relative_path
-    fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
+    fn local_file_to_relative_path(&self, key: Utf8PathBuf) -> RemotePath {
        let relative_path = key
            .strip_prefix(&self.storage_root)
            .expect("relative path must contain storage_root as prefix");
@@ -59,22 +54,18 @@ impl LocalFs {

    async fn read_storage_metadata(
        &self,
-        file_path: &Path,
+        file_path: &Utf8Path,
    ) -> anyhow::Result<Option<StorageMetadata>> {
        let metadata_path = storage_metadata_path(file_path);
        if metadata_path.exists() && metadata_path.is_file() {
            let metadata_string = fs::read_to_string(&metadata_path).await.with_context(|| {
-                format!(
-                    "Failed to read metadata from the local storage at '{}'",
-                    metadata_path.display()
-                )
+                format!("Failed to read metadata from the local storage at '{metadata_path}'")
            })?;

            serde_json::from_str(&metadata_string)
                .with_context(|| {
                    format!(
-                        "Failed to deserialize metadata from the local storage at '{}'",
-                        metadata_path.display()
+                        "Failed to deserialize metadata from the local storage at '{metadata_path}'",
                    )
                })
                .map(|metadata| Some(StorageMetadata(metadata)))
@@ -171,25 +162,21 @@ impl RemoteStorage for LocalFs {
            }
        }

-        // Note that PathBuf starts_with only considers full path segments, but
+        // Note that Utf8PathBuf starts_with only considers full path segments, but
        // object prefixes are arbitrary strings, so we need the strings for doing
        // starts_with later.
-        let prefix = full_path.to_string_lossy();
+        let prefix = full_path.as_str();

        let mut files = vec![];
-        let mut directory_queue = vec![initial_dir.clone()];
+        let mut directory_queue = vec![initial_dir];
        while let Some(cur_folder) = directory_queue.pop() {
-            let mut entries = fs::read_dir(cur_folder.clone()).await?;
-            while let Some(entry) = entries.next_entry().await? {
-                let file_name: PathBuf = entry.file_name().into();
-                let full_file_name = cur_folder.clone().join(&file_name);
-                if full_file_name
-                    .to_str()
-                    .map(|s| s.starts_with(prefix.as_ref()))
-                    .unwrap_or(false)
-                {
+            let mut entries = cur_folder.read_dir_utf8()?;
+            while let Some(Ok(entry)) = entries.next() {
+                let file_name = entry.file_name();
+                let full_file_name = cur_folder.join(file_name);
+                if full_file_name.as_str().starts_with(prefix) {
                    let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
-                    files.push(file_remote_path.clone());
+                    files.push(file_remote_path);
                    if full_file_name.is_dir() {
                        directory_queue.push(full_file_name);
                    }
@@ -230,10 +217,7 @@ impl RemoteStorage for LocalFs {
                .open(&temp_file_path)
                .await
                .with_context(|| {
-                    format!(
-                        "Failed to open target fs destination at '{}'",
-                        target_file_path.display()
-                    )
+                    format!("Failed to open target fs destination at '{target_file_path}'")
                })?,
        );

@@ -244,8 +228,7 @@ impl RemoteStorage for LocalFs {
            .await
            .with_context(|| {
                format!(
-                    "Failed to upload file (write temp) to the local storage at '{}'",
-                    temp_file_path.display()
+                    "Failed to upload file (write temp) to the local storage at '{temp_file_path}'",
                )
            })?;

@@ -262,8 +245,7 @@ impl RemoteStorage for LocalFs {

        destination.flush().await.with_context(|| {
            format!(
-                "Failed to upload (flush temp) file to the local storage at '{}'",
-                temp_file_path.display()
+                "Failed to upload (flush temp) file to the local storage at '{temp_file_path}'",
            )
        })?;

@@ -271,8 +253,7 @@ impl RemoteStorage for LocalFs {
            .await
            .with_context(|| {
                format!(
-                    "Failed to upload (rename) file to the local storage at '{}'",
-                    target_file_path.display()
+                    "Failed to upload (rename) file to the local storage at '{target_file_path}'",
                )
            })?;

@@ -286,8 +267,7 @@ impl RemoteStorage for LocalFs {
            .await
            .with_context(|| {
                format!(
-                    "Failed to write metadata to the local storage at '{}'",
-                    storage_metadata_path.display()
+                    "Failed to write metadata to the local storage at '{storage_metadata_path}'",
                )
            })?;
        }
@@ -393,16 +373,16 @@ impl RemoteStorage for LocalFs {
    }
 }

-fn storage_metadata_path(original_path: &Path) -> PathBuf {
+fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
    path_with_suffix_extension(original_path, "metadata")
 }

 fn get_all_files<'a, P>(
    directory_path: P,
    recursive: bool,
-) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
+) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
 where
-    P: AsRef<Path> + Send + Sync + 'a,
+    P: AsRef<Utf8Path> + Send + Sync + 'a,
 {
    Box::pin(async move {
        let directory_path = directory_path.as_ref();
@@ -412,7 +392,13 @@ where
                let mut dir_contents = fs::read_dir(directory_path).await?;
                while let Some(dir_entry) = dir_contents.next_entry().await? {
                    let file_type = dir_entry.file_type().await?;
-                    let entry_path = dir_entry.path();
+                    let entry_path =
+                        Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
+                            anyhow::Error::msg(format!(
+                                "non-Unicode path: {}",
+                                pb.to_string_lossy()
+                            ))
+                        })?;
                    if file_type.is_symlink() {
                        debug!("{entry_path:?} is a symlink, skipping")
                    } else if file_type.is_dir() {
@@ -435,13 +421,10 @@ where
    })
 }

-async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
+async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
    let target_dir = match target_file_path.parent() {
        Some(parent_dir) => parent_dir,
-        None => bail!(
-            "File path '{}' has no parent directory",
-            target_file_path.display()
-        ),
+        None => bail!("File path '{target_file_path}' has no parent directory"),
    };
    if !target_dir.exists() {
        fs::create_dir_all(target_dir).await?;
@@ -449,13 +432,9 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
    Ok(())
 }

-fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
+fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
    if file_path.exists() {
-        ensure!(
-            file_path.is_file(),
-            "file path '{}' is not a file",
-            file_path.display()
-        );
+        ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
        Ok(true)
    } else {
        Ok(false)
@@ -466,13 +445,13 @@ fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
 mod fs_tests {
    use super::*;

+    use camino_tempfile::tempdir;
    use std::{collections::HashMap, io::Write};
-    use tempfile::tempdir;

    async fn read_and_assert_remote_file_contents(
        storage: &LocalFs,
        #[allow(clippy::ptr_arg)]
-        // have to use &PathBuf due to `storage.local_path` parameter requirements
+        // have to use &Utf8PathBuf due to `storage.local_path` parameter requirements
        remote_storage_path: &RemotePath,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
@@ -519,7 +498,7 @@ mod fs_tests {
    async fn upload_file_negatives() -> anyhow::Result<()> {
        let storage = create_storage()?;

-        let id = RemotePath::new(Path::new("dummy"))?;
+        let id = RemotePath::new(Utf8Path::new("dummy"))?;
        let content = std::io::Cursor::new(b"12345");

        // Check that you get an error if the size parameter doesn't match the actual
@@ -544,7 +523,8 @@ mod fs_tests {
    }

    fn create_storage() -> anyhow::Result<LocalFs> {
-        LocalFs::new(tempdir()?.path().to_owned())
+        let storage_root = tempdir()?.path().to_path_buf();
+        LocalFs::new(storage_root)
    }

    #[tokio::test]
@@ -561,7 +541,7 @@ mod fs_tests {
        );

        let non_existing_path = "somewhere/else";
-        match storage.download(&RemotePath::new(Path::new(non_existing_path))?).await {
+        match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await {
            Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
            other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
        }
@@ -775,7 +755,7 @@ mod fs_tests {
    }

    async fn create_file_for_upload(
-        path: &Path,
+        path: &Utf8Path,
        contents: &str,
    ) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
        std::fs::create_dir_all(path.parent().unwrap())?;
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,7 +4,7 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::sync::Arc;
+use std::borrow::Cow;

 use anyhow::Context;
 use aws_config::{
@@ -24,22 +24,20 @@ use aws_sdk_s3::{
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
 use scopeguard::ScopeGuard;
-use tokio::{
-    io::{self, AsyncRead},
-    sync::Semaphore,
-};
+use tokio::io::{self, AsyncRead};
 use tokio_util::io::ReaderStream;
 use tracing::debug;

 use super::StorageMetadata;
 use crate::{
-    Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config,
+    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;

-use self::metrics::{AttemptOutcome, RequestKind};
+use self::metrics::AttemptOutcome;
+pub(super) use self::metrics::RequestKind;

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -47,10 +45,7 @@ pub struct S3Bucket {
    bucket_name: String,
    prefix_in_bucket: Option<String>,
    max_keys_per_list_response: Option<i32>,
-    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
-    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
-    // The helps to ensure we don't exceed the thresholds.
-    concurrency_limiter: Arc<Semaphore>,
+    concurrency_limiter: ConcurrencyLimiter,
 }

 #[derive(Default)]
@@ -117,7 +112,7 @@ impl S3Bucket {
            bucket_name: aws_config.bucket_name.clone(),
            max_keys_per_list_response: aws_config.max_keys_per_list_response,
            prefix_in_bucket,
-            concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
+            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
        })
    }

@@ -143,12 +138,11 @@ impl S3Bucket {
        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
        let path_string = path
            .get_path()
-            .to_string_lossy()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
-            .to_string();
+            .as_str()
+            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
        match &self.prefix_in_bucket {
-            Some(prefix) => prefix.clone() + "/" + &path_string,
-            None => path_string,
+            Some(prefix) => prefix.clone() + "/" + path_string,
+            None => path_string.to_string(),
        }
    }

@@ -156,7 +150,7 @@ impl S3Bucket {
        let started_at = start_counting_cancelled_wait(kind);
        let permit = self
            .concurrency_limiter
-            .acquire()
+            .acquire(kind)
            .await
            .expect("semaphore is never closed");

@@ -172,8 +166,7 @@ impl S3Bucket {
        let started_at = start_counting_cancelled_wait(kind);
        let permit = self
            .concurrency_limiter
-            .clone()
-            .acquire_owned()
+            .acquire_owned(kind)
            .await
            .expect("semaphore is never closed");

@@ -521,6 +514,20 @@ impl RemoteStorage for S3Bucket {
                        .deleted_objects_total
                        .inc_by(chunk.len() as u64);
                    if let Some(errors) = resp.errors {
+                        // Log a bounded number of the errors within the response:
+                        // these requests can carry 1000 keys so logging each one
+                        // would be too verbose, especially as errors may lead us
+                        // to retry repeatedly.
+                        const LOG_UP_TO_N_ERRORS: usize = 10;
+                        for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
+                            tracing::warn!(
+                                "DeleteObjects key {} failed: {}: {}",
+                                e.key.as_ref().map(Cow::from).unwrap_or("".into()),
+                                e.code.as_ref().map(Cow::from).unwrap_or("".into()),
+                                e.message.as_ref().map(Cow::from).unwrap_or("".into())
+                            );
+                        }
+
                        return Err(anyhow::format_err!(
                            "Failed to delete {} objects",
                            errors.len()
@@ -565,8 +572,8 @@ fn start_measuring_requests(

 #[cfg(test)]
 mod tests {
+    use camino::Utf8Path;
    use std::num::NonZeroUsize;
-    use std::path::Path;

    use crate::{RemotePath, S3Bucket, S3Config};

@@ -575,7 +582,7 @@ mod tests {
        let all_paths = ["", "some/path", "some/path/"];
        let all_paths: Vec<RemotePath> = all_paths
            .iter()
-            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
+            .map(|x| RemotePath::new(Utf8Path::new(x)).expect("bad path"))
            .collect();
        let prefixes = [
            None,
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -6,7 +6,7 @@ use once_cell::sync::Lazy;
 pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);

 #[derive(Clone, Copy, Debug)]
-pub(super) enum RequestKind {
+pub(crate) enum RequestKind {
    Get = 0,
    Put = 1,
    Delete = 2,
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -0,0 +1,625 @@
+use std::collections::HashSet;
+use std::env;
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::ops::ControlFlow;
+use std::path::PathBuf;
+use std::sync::Arc;
+use std::time::UNIX_EPOCH;
+
+use anyhow::Context;
+use camino::Utf8Path;
+use once_cell::sync::OnceCell;
+use remote_storage::{
+    AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+};
+use test_context::{test_context, AsyncTestContext};
+use tokio::task::JoinSet;
+use tracing::{debug, error, info};
+
+static LOGGING_DONE: OnceCell<()> = OnceCell::new();
+
+const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
+
+const BASE_PREFIX: &str = "test";
+
+/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
+/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
+/// See the client creation in [`create_azure_client`] for details on the required env vars.
+/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
+/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
+///
+/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`]
+/// where
+/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
+/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
+///
+/// Then, verifies that the client does return correct prefixes when queried:
+/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
+/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
+///
+/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure.
+///
+/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
+/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
+#[test_context(MaybeEnabledAzureWithTestBlobs)]
+#[tokio::test]
+async fn azure_pagination_should_work(
+    ctx: &mut MaybeEnabledAzureWithTestBlobs,
+) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("Azure init failed: {e:?}")
+        }
+    };
+
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let expected_remote_prefixes = ctx.remote_prefixes.clone();
+
+    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
+        .context("common_prefix construction")?;
+    let root_remote_prefixes = test_client
+        .list_prefixes(None)
+        .await
+        .context("client list root prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
+        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
+    );
+
+    let nested_remote_prefixes = test_client
+        .list_prefixes(Some(&base_prefix))
+        .await
+        .context("client list nested prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let remote_only_prefixes = nested_remote_prefixes
+        .difference(&expected_remote_prefixes)
+        .collect::<HashSet<_>>();
+    let missing_uploaded_prefixes = expected_remote_prefixes
+        .difference(&nested_remote_prefixes)
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
+        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
+    );
+
+    Ok(())
+}
+
+/// Tests that Azure client can list all files in a folder, even if the response comes paginated and requirees multiple Azure queries.
+/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
+/// See `Azure_pagination_should_work` for more information.
+///
+/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`]
+/// Then performs the following queries:
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
+#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
+#[tokio::test]
+async fn azure_list_files_works(
+    ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
+) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("Azure init failed: {e:?}")
+        }
+    };
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let base_prefix =
+        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
+    let root_files = test_client
+        .list_files(None)
+        .await
+        .context("client list root files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_files,
+        ctx.remote_blobs.clone(),
+        "remote storage list_files on root mismatches with the uploads."
+    );
+    let nested_remote_files = test_client
+        .list_files(Some(&base_prefix))
+        .await
+        .context("client list nested files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let trim_remote_blobs: HashSet<_> = ctx
+        .remote_blobs
+        .iter()
+        .map(|x| x.get_path())
+        .filter(|x| x.starts_with("folder1"))
+        .map(|x| RemotePath::new(x).expect("must be valid path"))
+        .collect();
+    assert_eq!(
+        nested_remote_files, trim_remote_blobs,
+        "remote storage list_files on subdirrectory mismatches with the uploads."
+    );
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzure::Enabled(ctx) => ctx,
+        MaybeEnabledAzure::Disabled => return Ok(()),
+    };
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
+    ))
+    .with_context(|| "RemotePath conversion")?;
+
+    ctx.client.delete(&path).await.expect("should succeed");
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledAzure::Enabled(ctx) => ctx,
+        MaybeEnabledAzure::Disabled => return Ok(()),
+    };
+
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let data1 = "remote blob data1".as_bytes();
+    let data1_len = data1.len();
+    let data2 = "remote blob data2".as_bytes();
+    let data2_len = data2.len();
+    let data3 = "remote blob data3".as_bytes();
+    let data3_len = data3.len();
+    ctx.client
+        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
+        .await?;
+
+    ctx.client
+        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
+        .await?;
+
+    ctx.client
+        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
+        .await?;
+
+    ctx.client.delete_objects(&[path1, path2]).await?;
+
+    let prefixes = ctx.client.list_prefixes(None).await?;
+
+    assert_eq!(prefixes.len(), 1);
+
+    ctx.client.delete_objects(&[path3]).await?;
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledAzure)]
+#[tokio::test]
+async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
+    let MaybeEnabledAzure::Enabled(ctx) = ctx else {
+        return Ok(());
+    };
+
+    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let data = "remote blob data here".as_bytes();
+    let data_len = data.len() as u64;
+
+    ctx.client
+        .upload(std::io::Cursor::new(data), data.len(), &path, None)
+        .await?;
+
+    async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
+        let mut buf = Vec::new();
+        tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
+        Ok(buf)
+    }
+    // Normal download request
+    let dl = ctx.client.download(&path).await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data);
+
+    // Full range (end specified)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 0, Some(data_len))
+        .await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data);
+
+    // partial range (end specified)
+    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data[4..10]);
+
+    // partial range (end beyond real end)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 8, Some(data_len * 100))
+        .await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data[8..]);
+
+    // Partial range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data[4..]);
+
+    // Full range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
+    let buf = download_and_compare(dl).await?;
+    assert_eq!(buf, data);
+
+    debug!("Cleanup: deleting file at path {path:?}");
+    ctx.client
+        .delete(&path)
+        .await
+        .with_context(|| format!("{path:?} removal"))?;
+
+    Ok(())
+}
+
+fn ensure_logging_ready() {
+    LOGGING_DONE.get_or_init(|| {
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+        )
+        .expect("logging init failed");
+    });
+}
+
+struct EnabledAzure {
+    client: Arc<GenericRemoteStorage>,
+    base_prefix: &'static str,
+}
+
+impl EnabledAzure {
+    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
+        let client = create_azure_client(max_keys_in_list_response)
+            .context("Azure client creation")
+            .expect("Azure client creation failed");
+
+        EnabledAzure {
+            client,
+            base_prefix: BASE_PREFIX,
+        }
+    }
+}
+
+enum MaybeEnabledAzure {
+    Enabled(EnabledAzure),
+    Disabled,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledAzure {
+    async fn setup() -> Self {
+        ensure_logging_ready();
+
+        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        Self::Enabled(EnabledAzure::setup(None).await)
+    }
+}
+
+enum MaybeEnabledAzureWithTestBlobs {
+    Enabled(AzureWithTestBlobs),
+    Disabled,
+    UploadsFailed(anyhow::Error, AzureWithTestBlobs),
+}
+
+struct AzureWithTestBlobs {
+    enabled: EnabledAzure,
+    remote_prefixes: HashSet<RemotePath>,
+    remote_blobs: HashSet<RemotePath>,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
+    async fn setup() -> Self {
+        ensure_logging_ready();
+        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        let max_keys_in_list_response = 10;
+        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
+
+        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
+
+        match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
+            ControlFlow::Continue(uploads) => {
+                info!("Remote objects created successfully");
+
+                Self::Enabled(AzureWithTestBlobs {
+                    enabled,
+                    remote_prefixes: uploads.prefixes,
+                    remote_blobs: uploads.blobs,
+                })
+            }
+            ControlFlow::Break(uploads) => Self::UploadsFailed(
+                anyhow::anyhow!("One or multiple blobs failed to upload to Azure"),
+                AzureWithTestBlobs {
+                    enabled,
+                    remote_prefixes: uploads.prefixes,
+                    remote_blobs: uploads.blobs,
+                },
+            ),
+        }
+    }
+
+    async fn teardown(self) {
+        match self {
+            Self::Disabled => {}
+            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
+                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
+            }
+        }
+    }
+}
+
+// NOTE: the setups for the list_prefixes test and the list_files test are very similar
+// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
+// whereas the list_files function is concerned with listing files.
+// See `RemoteStorage::list_files` documentation for more details
+enum MaybeEnabledAzureWithSimpleTestBlobs {
+    Enabled(AzureWithSimpleTestBlobs),
+    Disabled,
+    UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),
+}
+struct AzureWithSimpleTestBlobs {
+    enabled: EnabledAzure,
+    remote_blobs: HashSet<RemotePath>,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
+    async fn setup() -> Self {
+        ensure_logging_ready();
+        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        let max_keys_in_list_response = 10;
+        let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
+
+        let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
+
+        match upload_simple_azure_data(&enabled.client, upload_tasks_count).await {
+            ControlFlow::Continue(uploads) => {
+                info!("Remote objects created successfully");
+
+                Self::Enabled(AzureWithSimpleTestBlobs {
+                    enabled,
+                    remote_blobs: uploads,
+                })
+            }
+            ControlFlow::Break(uploads) => Self::UploadsFailed(
+                anyhow::anyhow!("One or multiple blobs failed to upload to Azure"),
+                AzureWithSimpleTestBlobs {
+                    enabled,
+                    remote_blobs: uploads,
+                },
+            ),
+        }
+    }
+
+    async fn teardown(self) {
+        match self {
+            Self::Disabled => {}
+            Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
+                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
+            }
+        }
+    }
+}
+
+fn create_azure_client(
+    max_keys_per_list_response: Option<i32>,
+) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    use rand::Rng;
+
+    let remote_storage_azure_container = env::var("REMOTE_STORAGE_AZURE_CONTAINER").context(
+        "`REMOTE_STORAGE_AZURE_CONTAINER` env var is not set, but real Azure tests are enabled",
+    )?;
+    let remote_storage_azure_region = env::var("REMOTE_STORAGE_AZURE_REGION").context(
+        "`REMOTE_STORAGE_AZURE_REGION` env var is not set, but real Azure tests are enabled",
+    )?;
+
+    // due to how time works, we've had test runners use the same nanos as bucket prefixes.
+    // millis is just a debugging aid for easier finding the prefix later.
+    let millis = std::time::SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .context("random Azure test prefix part calculation")?
+        .as_millis();
+
+    // because nanos can be the same for two threads so can millis, add randomness
+    let random = rand::thread_rng().gen::<u32>();
+
+    let remote_storage_config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
+        max_sync_errors: NonZeroU32::new(5).unwrap(),
+        storage: RemoteStorageKind::AzureContainer(AzureConfig {
+            container_name: remote_storage_azure_container,
+            container_region: remote_storage_azure_region,
+            prefix_in_container: Some(format!("test_{millis}_{random:08x}/")),
+            concurrency_limit: NonZeroUsize::new(100).unwrap(),
+            max_keys_per_list_response,
+        }),
+    };
+    Ok(Arc::new(
+        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
+    ))
+}
+
+struct Uploads {
+    prefixes: HashSet<RemotePath>,
+    blobs: HashSet<RemotePath>,
+}
+
+async fn upload_azure_data(
+    client: &Arc<GenericRemoteStorage>,
+    base_prefix_str: &'static str,
+    upload_tasks_count: usize,
+) -> ControlFlow<Uploads, Uploads> {
+    info!("Creating {upload_tasks_count} Azure files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..upload_tasks_count + 1 {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
+            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
+                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
+            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let data = format!("remote blob data {i}").into_bytes();
+            let data_len = data.len();
+            task_client
+                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
+                .await?;
+
+            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
+        });
+    }
+
+    let mut upload_tasks_failed = false;
+    let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(task_run_result) = upload_tasks.join_next().await {
+        match task_run_result
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"))
+        {
+            Ok((upload_prefix, upload_path)) => {
+                uploaded_prefixes.insert(upload_prefix);
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                upload_tasks_failed = true;
+            }
+        }
+    }
+
+    let uploads = Uploads {
+        prefixes: uploaded_prefixes,
+        blobs: uploaded_blobs,
+    };
+    if upload_tasks_failed {
+        ControlFlow::Break(uploads)
+    } else {
+        ControlFlow::Continue(uploads)
+    }
+}
+
+async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
+    info!(
+        "Removing {} objects from the remote storage during cleanup",
+        objects_to_delete.len()
+    );
+    let mut delete_tasks = JoinSet::new();
+    for object_to_delete in objects_to_delete {
+        let task_client = Arc::clone(client);
+        delete_tasks.spawn(async move {
+            debug!("Deleting remote item at path {object_to_delete:?}");
+            task_client
+                .delete(&object_to_delete)
+                .await
+                .with_context(|| format!("{object_to_delete:?} removal"))
+        });
+    }
+
+    while let Some(task_run_result) = delete_tasks.join_next().await {
+        match task_run_result {
+            Ok(task_result) => match task_result {
+                Ok(()) => {}
+                Err(e) => error!("Delete task failed: {e:?}"),
+            },
+            Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
+        }
+    }
+}
+
+// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
+async fn upload_simple_azure_data(
+    client: &Arc<GenericRemoteStorage>,
+    upload_tasks_count: usize,
+) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
+    info!("Creating {upload_tasks_count} Azure files");
+    let mut upload_tasks = JoinSet::new();
+    for i in 1..upload_tasks_count + 1 {
+        let task_client = Arc::clone(client);
+        upload_tasks.spawn(async move {
+            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
+            let blob_path = RemotePath::new(
+                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
+            )
+            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
+            debug!("Creating remote item {i} at path {blob_path:?}");
+
+            let data = format!("remote blob data {i}").into_bytes();
+            let data_len = data.len();
+            task_client
+                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
+                .await?;
+
+            Ok::<_, anyhow::Error>(blob_path)
+        });
+    }
+
+    let mut upload_tasks_failed = false;
+    let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
+    while let Some(task_run_result) = upload_tasks.join_next().await {
+        match task_run_result
+            .context("task join failed")
+            .and_then(|task_result| task_result.context("upload task failed"))
+        {
+            Ok(upload_path) => {
+                uploaded_blobs.insert(upload_path);
+            }
+            Err(e) => {
+                error!("Upload task failed: {e:?}");
+                upload_tasks_failed = true;
+            }
+        }
+    }
+
+    if upload_tasks_failed {
+        ControlFlow::Break(uploaded_blobs)
+    } else {
+        ControlFlow::Continue(uploaded_blobs)
+    }
+}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -2,11 +2,12 @@ use std::collections::HashSet;
 use std::env;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::ops::ControlFlow;
-use std::path::{Path, PathBuf};
+use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use camino::Utf8Path;
 use once_cell::sync::OnceCell;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -55,7 +56,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any
    let test_client = Arc::clone(&ctx.enabled.client);
    let expected_remote_prefixes = ctx.remote_prefixes.clone();

-    let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
+    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
        .list_prefixes(None)
@@ -108,7 +109,7 @@ async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> any
    };
    let test_client = Arc::clone(&ctx.enabled.client);
    let base_prefix =
-        RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
+        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
        .list_files(None)
        .await
@@ -129,9 +130,9 @@ async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> any
    let trim_remote_blobs: HashSet<_> = ctx
        .remote_blobs
        .iter()
-        .map(|x| x.get_path().to_str().expect("must be valid name"))
+        .map(|x| x.get_path())
        .filter(|x| x.starts_with("folder1"))
-        .map(|x| RemotePath::new(Path::new(x)).expect("must be valid name"))
+        .map(|x| RemotePath::new(x).expect("must be valid path"))
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
@@ -148,10 +149,9 @@ async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result
        MaybeEnabledS3::Disabled => return Ok(()),
    };

-    let path = RemotePath::new(&PathBuf::from(format!(
-        "{}/for_sure_there_is_nothing_there_really",
-        ctx.base_prefix,
-    )))
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
+    ))
    .with_context(|| "RemotePath conversion")?;

    ctx.client.delete(&path).await.expect("should succeed");
@@ -167,13 +167,13 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
        MaybeEnabledS3::Disabled => return Ok(()),
    };

-    let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,)))
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

    let data1 = "remote blob data1".as_bytes();
@@ -427,10 +427,10 @@ async fn upload_s3_data(
    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
-            let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
-            let blob_prefix = RemotePath::new(&prefix)
+            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
+            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
-            let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
+            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

            let data = format!("remote blob data {i}").into_bytes();
@@ -512,8 +512,10 @@ async fn upload_simple_s3_data(
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
-            let blob_path = RemotePath::new(&blob_path)
-                .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
+            let blob_path = RemotePath::new(
+                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
+            )
+            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");

            let data = format!("remote blob data {i}").into_bytes();
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -10,6 +10,7 @@ async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
 bytes.workspace = true
+camino.workspace = true
 chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
@@ -53,7 +54,7 @@ byteorder.workspace = true
 bytes.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
-tempfile.workspace = true
+camino-tempfile.workspace = true

 [[bench]]
 name = "benchmarks"
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -2,9 +2,9 @@

 use serde;
 use std::fs;
-use std::path::Path;

 use anyhow::Result;
+use camino::Utf8Path;
 use jsonwebtoken::{
    decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
 };
@@ -65,7 +65,7 @@ impl JwtAuth {
        }
    }

-    pub fn from_key_path(key_path: &Path) -> Result<Self> {
+    pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
        let public_key = fs::read(key_path)?;
        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
    }
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,14 +1,14 @@
 use std::{
    borrow::Cow,
-    ffi::OsStr,
    fs::{self, File},
    io,
-    path::{Path, PathBuf},
 };

+use camino::{Utf8Path, Utf8PathBuf};
+
 /// Similar to [`std::fs::create_dir`], except we fsync the
 /// created directory and its parent.
-pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {
+pub fn create_dir(path: impl AsRef<Utf8Path>) -> io::Result<()> {
    let path = path.as_ref();

    fs::create_dir(path)?;
@@ -18,7 +18,7 @@ pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {

 /// Similar to [`std::fs::create_dir_all`], except we fsync all
 /// newly created directories and the pre-existing parent.
-pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
+pub fn create_dir_all(path: impl AsRef<Utf8Path>) -> io::Result<()> {
    let mut path = path.as_ref();

    let mut dirs_to_create = Vec::new();
@@ -30,7 +30,7 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
            Ok(_) => {
                return Err(io::Error::new(
                    io::ErrorKind::AlreadyExists,
-                    format!("non-directory found in path: {}", path.display()),
+                    format!("non-directory found in path: {path}"),
                ));
            }
            Err(ref e) if e.kind() == io::ErrorKind::NotFound => {}
@@ -44,7 +44,7 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
            None => {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidInput,
-                    format!("can't find parent of path '{}'", path.display()).as_str(),
+                    format!("can't find parent of path '{path}'"),
                ));
            }
        }
@@ -70,21 +70,18 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {

 /// Adds a suffix to the file(directory) name, either appending the suffix to the end of its extension,
 /// or if there's no extension, creates one and puts a suffix there.
-pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
-    let new_extension = match original_path
-        .as_ref()
-        .extension()
-        .map(OsStr::to_string_lossy)
-    {
+pub fn path_with_suffix_extension(
+    original_path: impl AsRef<Utf8Path>,
+    suffix: &str,
+) -> Utf8PathBuf {
+    let new_extension = match original_path.as_ref().extension() {
        Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
        None => Cow::Borrowed(suffix),
    };
-    original_path
-        .as_ref()
-        .with_extension(new_extension.as_ref())
+    original_path.as_ref().with_extension(new_extension)
 }

-pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> {
+pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
    let parent = file_path.parent().ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::Other,
@@ -97,7 +94,7 @@ pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> {
    Ok(())
 }

-pub fn fsync(path: &Path) -> io::Result<()> {
+pub fn fsync(path: &Utf8Path) -> io::Result<()> {
    File::open(path)
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}")))
        .and_then(|file| {
@@ -111,19 +108,18 @@ pub fn fsync(path: &Path) -> io::Result<()> {
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
 }

-pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
-    tokio::fs::File::open(path).await?.sync_all().await
+pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Error> {
+    tokio::fs::File::open(path.as_ref()).await?.sync_all().await
 }

 #[cfg(test)]
 mod tests {
-    use tempfile::tempdir;

    use super::*;

    #[test]
    fn test_create_dir_fsyncd() {
-        let dir = tempdir().unwrap();
+        let dir = camino_tempfile::tempdir().unwrap();

        let existing_dir_path = dir.path();
        let err = create_dir(existing_dir_path).unwrap_err();
@@ -139,7 +135,7 @@ mod tests {

    #[test]
    fn test_create_dir_all_fsyncd() {
-        let dir = tempdir().unwrap();
+        let dir = camino_tempfile::tempdir().unwrap();

        let existing_dir_path = dir.path();
        create_dir_all(existing_dir_path).unwrap();
@@ -166,29 +162,29 @@ mod tests {

    #[test]
    fn test_path_with_suffix_extension() {
-        let p = PathBuf::from("/foo/bar");
+        let p = Utf8PathBuf::from("/foo/bar");
        assert_eq!(
-            &path_with_suffix_extension(p, "temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp").to_string(),
            "/foo/bar.temp"
        );
-        let p = PathBuf::from("/foo/bar");
+        let p = Utf8PathBuf::from("/foo/bar");
        assert_eq!(
-            &path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp.temp").to_string(),
            "/foo/bar.temp.temp"
        );
-        let p = PathBuf::from("/foo/bar.baz");
+        let p = Utf8PathBuf::from("/foo/bar.baz");
        assert_eq!(
-            &path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp.temp").to_string(),
            "/foo/bar.baz.temp.temp"
        );
-        let p = PathBuf::from("/foo/bar.baz");
+        let p = Utf8PathBuf::from("/foo/bar.baz");
        assert_eq!(
-            &path_with_suffix_extension(p, ".temp").to_string_lossy(),
+            &path_with_suffix_extension(p, ".temp").to_string(),
            "/foo/bar.baz..temp"
        );
-        let p = PathBuf::from("/foo/bar/dir/");
+        let p = Utf8PathBuf::from("/foo/bar/dir/");
        assert_eq!(
-            &path_with_suffix_extension(p, ".temp").to_string_lossy(),
+            &path_with_suffix_extension(p, ".temp").to_string(),
            "/foo/bar/dir..temp"
        );
    }
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -55,8 +55,6 @@ where

 #[cfg(test)]
 mod test {
-    use std::path::PathBuf;
-
    use crate::fs_ext::{is_directory_empty, list_dir};

    use super::ignore_absent_files;
@@ -65,7 +63,7 @@ mod test {
    fn is_empty_dir() {
        use super::PathExt;

-        let dir = tempfile::tempdir().unwrap();
+        let dir = camino_tempfile::tempdir().unwrap();
        let dir_path = dir.path();

        // test positive case
@@ -75,7 +73,7 @@ mod test {
        );

        // invoke on a file to ensure it returns an error
-        let file_path: PathBuf = dir_path.join("testfile");
+        let file_path = dir_path.join("testfile");
        let f = std::fs::File::create(&file_path).unwrap();
        drop(f);
        assert!(file_path.is_empty_dir().is_err());
@@ -87,7 +85,7 @@ mod test {

    #[tokio::test]
    async fn is_empty_dir_async() {
-        let dir = tempfile::tempdir().unwrap();
+        let dir = camino_tempfile::tempdir().unwrap();
        let dir_path = dir.path();

        // test positive case
@@ -97,7 +95,7 @@ mod test {
        );

        // invoke on a file to ensure it returns an error
-        let file_path: PathBuf = dir_path.join("testfile");
+        let file_path = dir_path.join("testfile");
        let f = std::fs::File::create(&file_path).unwrap();
        drop(f);
        assert!(is_directory_empty(&file_path).await.is_err());
@@ -109,10 +107,9 @@ mod test {

    #[test]
    fn ignore_absent_files_works() {
-        let dir = tempfile::tempdir().unwrap();
-        let dir_path = dir.path();
+        let dir = camino_tempfile::tempdir().unwrap();

-        let file_path: PathBuf = dir_path.join("testfile");
+        let file_path = dir.path().join("testfile");

        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");

@@ -126,17 +123,17 @@ mod test {

    #[tokio::test]
    async fn list_dir_works() {
-        let dir = tempfile::tempdir().unwrap();
+        let dir = camino_tempfile::tempdir().unwrap();
        let dir_path = dir.path();

        assert!(list_dir(dir_path).await.unwrap().is_empty());

-        let file_path: PathBuf = dir_path.join("testfile");
+        let file_path = dir_path.join("testfile");
        let _ = std::fs::File::create(&file_path).unwrap();

        assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);

-        let another_dir_path: PathBuf = dir_path.join("testdir");
+        let another_dir_path = dir_path.join("testdir");
        std::fs::create_dir(another_dir_path).unwrap();

        let expected = &["testdir", "testfile"];
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -1,8 +1,9 @@
 use hyper::{header, Body, Response, StatusCode};
 use serde::{Deserialize, Serialize};
+use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::error;
+use tracing::{error, info};

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -24,6 +25,9 @@ pub enum ApiError {
    #[error("Precondition failed: {0}")]
    PreconditionFailed(Box<str>),

+    #[error("Resource temporarily unavailable: {0}")]
+    ResourceUnavailable(Cow<'static, str>),
+
    #[error("Shutting down")]
    ShuttingDown,

@@ -59,6 +63,10 @@ impl ApiError {
                "Shutting down".to_string(),
                StatusCode::SERVICE_UNAVAILABLE,
            ),
+            ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status(
+                err.to_string(),
+                StatusCode::SERVICE_UNAVAILABLE,
+            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
@@ -108,10 +116,12 @@ pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {

 pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors
-    if let ApiError::InternalServerError(_) = api_error {
-        error!("Error processing HTTP request: {api_error:?}");
-    } else {
-        error!("Error processing HTTP request: {api_error:#}");
+
+    match api_error {
+        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
+        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
+        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
+        _ => error!("Error processing HTTP request: {api_error:#}"),
    }

    api_error.into_response()
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -1,4 +1,3 @@
-use std::ffi::OsStr;
 use std::{fmt, str::FromStr};

 use anyhow::Context;
@@ -215,12 +214,11 @@ pub struct TimelineId(Id);

 id_newtype!(TimelineId);

-impl TryFrom<Option<&OsStr>> for TimelineId {
+impl TryFrom<Option<&str>> for TimelineId {
    type Error = anyhow::Error;

-    fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
+    fn try_from(value: Option<&str>) -> Result<Self, Self::Error> {
        value
-            .and_then(OsStr::to_str)
            .unwrap_or_default()
            .parse::<TimelineId>()
            .with_context(|| format!("Could not parse timeline id from {:?}", value))
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -11,10 +11,10 @@ use std::{
    io::{Read, Write},
    ops::Deref,
    os::unix::prelude::AsRawFd,
-    path::{Path, PathBuf},
 };

 use anyhow::Context;
+use camino::{Utf8Path, Utf8PathBuf};
 use nix::{errno::Errno::EAGAIN, fcntl};

 use crate::crashsafe;
@@ -23,7 +23,7 @@ use crate::crashsafe;
 /// Returned by [`create_exclusive`].
 #[must_use]
 pub struct UnwrittenLockFile {
-    path: PathBuf,
+    path: Utf8PathBuf,
    file: fs::File,
 }

@@ -60,7 +60,7 @@ impl UnwrittenLockFile {
 ///
 /// It is not an error if the file already exists.
 /// It is an error if the file is already locked.
-pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFile> {
+pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
    let lock_file = fs::OpenOptions::new()
        .create(true) // O_CREAT
        .write(true)
@@ -101,7 +101,7 @@ pub enum LockFileRead {
 /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
 /// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
 /// Check the [`LockFileRead`] variants for details.
-pub fn read_and_hold_lock_file(path: &Path) -> anyhow::Result<LockFileRead> {
+pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
    let res = fs::OpenOptions::new().read(true).open(path);
    let mut lock_file = match res {
        Ok(f) => f,
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -228,6 +228,12 @@ impl SecretString {
    }
 }

+impl From<String> for SecretString {
+    fn from(s: String) -> Self {
+        Self(s)
+    }
+}
+
 impl std::fmt::Debug for SecretString {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "[SECRET]")
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,9 +1,9 @@
 #![warn(missing_docs)]

+use camino::Utf8Path;
 use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
-use std::path::Path;
 use std::str::FromStr;
 use std::sync::atomic::{AtomicU64, Ordering};

@@ -44,11 +44,9 @@ impl Lsn {
    /// Parse an LSN from a filename in the form `0000000000000000`
    pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
    where
-        F: AsRef<Path>,
+        F: AsRef<Utf8Path>,
    {
-        let filename: &Path = filename.as_ref();
-        let filename = filename.to_str().ok_or(LsnParseError)?;
-        Lsn::from_hex(filename)
+        Lsn::from_hex(filename.as_ref().as_str())
    }

    /// Parse an LSN from a string in the form `0000000000000000`
--- a/libs/utils/src/pid_file.rs
+++ b/libs/utils/src/pid_file.rs
@@ -49,9 +49,10 @@
 //! At this point, `B` and `C` are running, which is hazardous.
 //! Morale of the story: don't unlink pidfiles, ever.

-use std::{ops::Deref, path::Path};
+use std::ops::Deref;

 use anyhow::Context;
+use camino::Utf8Path;
 use nix::unistd::Pid;

 use crate::lock_file::{self, LockFileRead};
@@ -84,7 +85,7 @@ impl Deref for PidFileGuard {
 /// The claim ends as soon as the returned guard object is dropped.
 /// To maintain the claim for the remaining lifetime of the current process,
 /// use [`std::mem::forget`] or similar.
-pub fn claim_for_current_process(path: &Path) -> anyhow::Result<PidFileGuard> {
+pub fn claim_for_current_process(path: &Utf8Path) -> anyhow::Result<PidFileGuard> {
    let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
    // if any of the next steps fail, we drop the file descriptor and thereby release the lock
    let guard = unwritten_lock_file
@@ -132,7 +133,7 @@ pub enum PidFileRead {
 ///
 /// On success, this function returns a [`PidFileRead`].
 /// Check its docs for a description of the meaning of its different variants.
-pub fn read(pidfile: &Path) -> anyhow::Result<PidFileRead> {
+pub fn read(pidfile: &Utf8Path) -> anyhow::Result<PidFileRead> {
    let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?;
    let ret = match res {
        LockFileRead::NotExist => PidFileRead::NotExist,
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -58,7 +58,7 @@ where
 // to get that.
 impl<T: Ord> PartialOrd for Waiter<T> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        other.wake_num.partial_cmp(&self.wake_num)
+        Some(self.cmp(other))
    }
 }

--- a/libs/vm_monitor/README.md
+++ b/libs/vm_monitor/README.md
@@ -27,8 +27,8 @@ and old one if it exists.
 * the filecache: a struct that allows communication with the Postgres file cache.
 On startup, we connect to the filecache and hold on to the connection for the
 entire monitor lifetime.
-* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
-listening for `memory.high` events and setting its `memory.{high,max}` values.
+* the cgroup watcher: the `CgroupWatcher` polls the `neon-postgres` cgroup's memory
+usage and sends rolling aggregates to the runner.
 * the runner: the runner marries the filecache and cgroup watcher together,
 communicating with the agent throught the `Dispatcher`, and then calling filecache
 and cgroup watcher functions as needed to upscale and downscale
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -1,161 +1,38 @@
-use std::{
-    fmt::{Debug, Display},
-    fs,
-    pin::pin,
-    sync::atomic::{AtomicU64, Ordering},
-};
+use std::fmt::{self, Debug, Formatter};
+use std::time::{Duration, Instant};

-use anyhow::{anyhow, bail, Context};
+use anyhow::{anyhow, Context};
 use cgroups_rs::{
-    freezer::FreezerController,
-    hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
+    hierarchies::{self, is_cgroup2_unified_mode},
    memory::MemController,
-    MaxValue,
-    Subsystem::{Freezer, Mem},
+    Subsystem,
 };
-use inotify::{EventStream, Inotify, WatchMask};
-use tokio::sync::mpsc::{self, error::TryRecvError};
-use tokio::time::{Duration, Instant};
-use tokio_stream::{Stream, StreamExt};
+use tokio::sync::watch;
 use tracing::{info, warn};

-use crate::protocol::Resources;
-use crate::MiB;
-
-/// Monotonically increasing counter of the number of memory.high events
-/// the cgroup has experienced.
-///
-/// We use this to determine if a modification to the `memory.events` file actually
-/// changed the `high` field. If not, we don't care about the change. When we
-/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
-/// to see if it changed since last time.
-pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
-
-/// Monotonically increasing counter that gives each cgroup event a unique id.
-///
-/// This allows us to answer questions like "did this upscale arrive before this
-/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
-/// with a sequence number. As such, prefer to used the `Sequenced` type rather
-/// than this static directly.
-static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
-
-/// A memory event type reported in memory.events.
-#[derive(Debug, Eq, PartialEq, Copy, Clone)]
-pub enum MemoryEvent {
-    Low,
-    High,
-    Max,
-    Oom,
-    OomKill,
-    OomGroupKill,
-}
-
-impl MemoryEvent {
-    fn as_str(&self) -> &str {
-        match self {
-            MemoryEvent::Low => "low",
-            MemoryEvent::High => "high",
-            MemoryEvent::Max => "max",
-            MemoryEvent::Oom => "oom",
-            MemoryEvent::OomKill => "oom_kill",
-            MemoryEvent::OomGroupKill => "oom_group_kill",
-        }
-    }
-}
-
-impl Display for MemoryEvent {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.write_str(self.as_str())
-    }
-}
-
 /// Configuration for a `CgroupWatcher`
 #[derive(Debug, Clone)]
 pub struct Config {
-    // The target difference between the total memory reserved for the cgroup
-    // and the value of the cgroup's memory.high.
-    //
-    // In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
-    // use (equal to system memory, minus whatever's taken out for the file cache).
-    oom_buffer_bytes: u64,
+    /// Interval at which we should be fetching memory statistics
+    memory_poll_interval: Duration,

-    // The amount of memory, in bytes, below a proposed new value for
-    // memory.high that the cgroup's memory usage must be for us to downscale
-    //
-    // In other words, we can downscale only when:
-    //
-    //   memory.current + memory_high_buffer_bytes < (proposed) memory.high
-    //
-    // TODO: there's some minor issues with this approach -- in particular, that we might have
-    // memory in use by the kernel's page cache that we're actually ok with getting rid of.
-    pub(crate) memory_high_buffer_bytes: u64,
-
-    // The maximum duration, in milliseconds, that we're allowed to pause
-    // the cgroup for while waiting for the autoscaler-agent to upscale us
-    max_upscale_wait: Duration,
-
-    // The required minimum time, in milliseconds, that we must wait before re-freezing
-    // the cgroup while waiting for the autoscaler-agent to upscale us.
-    do_not_freeze_more_often_than: Duration,
-
-    // The amount of memory, in bytes, that we should periodically increase memory.high
-    // by while waiting for the autoscaler-agent to upscale us.
-    //
-    // This exists to avoid the excessive throttling that happens when a cgroup is above its
-    // memory.high for too long. See more here:
-    // https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
-    memory_high_increase_by_bytes: u64,
-
-    // The period, in milliseconds, at which we should repeatedly increase the value
-    // of the cgroup's memory.high while we're waiting on upscaling and memory.high
-    // is still being hit.
-    //
-    // Technically speaking, this actually serves as a rate limit to moderate responding to
-    // memory.high events, but these are roughly equivalent if the process is still allocating
-    // memory.
-    memory_high_increase_every: Duration,
-}
-
-impl Config {
-    /// Calculate the new value for the cgroups memory.high based on system memory
-    pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
-        total_system_mem.saturating_sub(self.oom_buffer_bytes)
-    }
+    /// The number of samples used in constructing aggregated memory statistics
+    memory_history_len: usize,
+    /// The number of most recent samples that will be periodically logged.
+    ///
+    /// Each sample is logged exactly once. Increasing this value means that recent samples will be
+    /// logged less frequently, and vice versa.
+    ///
+    /// For simplicity, this value must be greater than or equal to `memory_history_len`.
+    memory_history_log_interval: usize,
 }

 impl Default for Config {
    fn default() -> Self {
        Self {
-            oom_buffer_bytes: 100 * MiB,
-            memory_high_buffer_bytes: 100 * MiB,
-            // while waiting for upscale, don't freeze for more than 20ms every 1s
-            max_upscale_wait: Duration::from_millis(20),
-            do_not_freeze_more_often_than: Duration::from_millis(1000),
-            // while waiting for upscale, increase memory.high by 10MiB every 25ms
-            memory_high_increase_by_bytes: 10 * MiB,
-            memory_high_increase_every: Duration::from_millis(25),
-        }
-    }
-}
-
-/// Used to represent data that is associated with a certain point in time, such
-/// as an upscale request or memory.high event.
-///
-/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
-/// a unique sequence number. Sequence numbers are monotonically increasing,
-/// allowing us to answer questions like "did this upscale happen after this
-/// memory.high event?" by comparing the sequence numbers of the two events.
-#[derive(Debug, Clone)]
-pub struct Sequenced<T> {
-    seqnum: u64,
-    data: T,
-}
-
-impl<T> Sequenced<T> {
-    pub fn new(data: T) -> Self {
-        Self {
-            seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
-            data,
+            memory_poll_interval: Duration::from_millis(100),
+            memory_history_len: 5, // use 500ms of history for decision-making
+            memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy)
        }
    }
 }
@@ -170,74 +47,14 @@ impl<T> Sequenced<T> {
 pub struct CgroupWatcher {
    pub config: Config,

-    /// The sequence number of the last upscale.
-    ///
-    /// If we receive a memory.high event that has a _lower_ sequence number than
-    /// `last_upscale_seqnum`, then we know it occured before the upscale, and we
-    /// can safely ignore it.
-    ///
-    /// Note: Like the `events` field, this doesn't _need_ interior mutability but we
-    /// use it anyways so that methods take `&self`, not `&mut self`.
-    last_upscale_seqnum: AtomicU64,
-
-    /// A channel on which we send messages to request upscale from the dispatcher.
-    upscale_requester: mpsc::Sender<()>,
-
    /// The actual cgroup we are watching and managing.
    cgroup: cgroups_rs::Cgroup,
 }

-/// Read memory.events for the desired event type.
-///
-/// `path` specifies the path to the desired `memory.events` file.
-/// For more info, see the `memory.events` section of the [kernel docs]
-/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
-fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
-    let contents = fs::read_to_string(path)
-        .with_context(|| format!("failed to read memory.events from {path}"))?;
-
-    // Then contents of the file look like:
-    // low 42
-    // high 101
-    // ...
-    contents
-        .lines()
-        .filter_map(|s| s.split_once(' '))
-        .find(|(e, _)| *e == event.as_str())
-        .ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
-        .and_then(|(_, count)| {
-            count
-                .parse::<u64>()
-                .with_context(|| format!("failed to parse memory.{event} as u64"))
-        })
-}
-
-/// Create an event stream that produces events whenever the file at the provided
-/// path is modified.
-fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
-    info!("creating file watcher for {path}");
-    let inotify = Inotify::init().context("failed to initialize file watcher")?;
-    inotify
-        .watches()
-        .add(path, WatchMask::MODIFY)
-        .with_context(|| format!("failed to start watching {path}"))?;
-    inotify
-        // The inotify docs use [0u8; 1024] so we'll just copy them. We only need
-        // to store one event at a time - if the event gets written over, that's
-        // ok. We still see that there is an event. For more information, see:
-        // https://man7.org/linux/man-pages/man7/inotify.7.html
-        .into_event_stream([0u8; 1024])
-        .context("failed to start inotify event stream")
-}
-
 impl CgroupWatcher {
    /// Create a new `CgroupWatcher`.
    #[tracing::instrument(skip_all, fields(%name))]
-    pub fn new(
-        name: String,
-        // A channel on which to send upscale requests
-        upscale_requester: mpsc::Sender<()>,
-    ) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
+    pub fn new(name: String) -> anyhow::Result<Self> {
        // TODO: clarify exactly why we need v2
        // Make sure cgroups v2 (aka unified) are supported
        if !is_cgroup2_unified_mode() {
@@ -245,410 +62,203 @@ impl CgroupWatcher {
        }
        let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);

-        // Start monitoring the cgroup for memory events. In general, for
-        // cgroups v2 (aka unified), metrics are reported in files like
-        // > `/sys/fs/cgroup/{name}/{metric}`
-        // We are looking for `memory.high` events, which are stored in the
-        // file `memory.events`. For more info, see the `memory.events` section
-        // of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
-        let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
-        let memory_events = create_file_watcher(&path)
-            .with_context(|| format!("failed to create event watcher for {path}"))?
-            // This would be nice with with .inspect_err followed by .ok
-            .filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
-                Ok(high) => Some(high),
-                Err(error) => {
-                    // TODO: Might want to just panic here
-                    warn!(?error, "failed to read high events count from {}", &path);
-                    None
-                }
-            })
-            // Only report the event if the memory.high count increased
-            .filter_map(|high| {
-                if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
-                    Some(high)
-                } else {
-                    None
-                }
-            })
-            .map(Sequenced::new);
-
-        let initial_count = get_event_count(
-            &format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
-            MemoryEvent::High,
-        )?;
-
-        info!(initial_count, "initial memory.high event count");
-
-        // Hard update `MEMORY_EVENT_COUNT` since there could have been processes
-        // running in the cgroup before that caused it to be non-zero.
-        MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
-
-        Ok((
-            Self {
-                cgroup,
-                upscale_requester,
-                last_upscale_seqnum: AtomicU64::new(0),
-                config: Default::default(),
-            },
-            memory_events,
-        ))
+        Ok(Self {
+            cgroup,
+            config: Default::default(),
+        })
    }

    /// The entrypoint for the `CgroupWatcher`.
    #[tracing::instrument(skip_all)]
-    pub async fn watch<E>(
+    pub async fn watch(
        &self,
-        // These are ~dependency injected~ (fancy, I know) because this function
-        // should never return.
-        // -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
-        // -> therefore: if we want to stick it in an Arc so many threads can access
-        //    it, methods can never take mutable access.
-        //     - note: we use the Arc strategy so that a) we can call this function
-        //             right here and b) the runner can call the set/get_memory methods
-        // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
-        //    we just pass them in here instead of holding them in fields, as that
-        //    would require this method to take &mut self.
-        mut upscales: mpsc::Receiver<Sequenced<Resources>>,
-        events: E,
-    ) -> anyhow::Result<()>
-    where
-        E: Stream<Item = Sequenced<u64>>,
-    {
-        let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
-        let mut last_memory_high_increase_at: Option<Instant> = None;
-        let mut events = pin!(events);
-
-        // Are we waiting to be upscaled? Could be true if we request upscale due
-        // to a memory.high event and it does not arrive in time.
-        let mut waiting_on_upscale = false;
-
-        loop {
-            tokio::select! {
-                upscale = upscales.recv() => {
-                    let Sequenced { seqnum, data } = upscale
-                        .context("failed to listen on upscale notification channel")?;
-                    waiting_on_upscale = false;
-                    last_memory_high_increase_at = None;
-                    self.last_upscale_seqnum.store(seqnum, Ordering::Release);
-                    info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
-                }
-                event = events.next() => {
-                    let Some(Sequenced { seqnum, .. }) = event else {
-                        bail!("failed to listen for memory.high events")
-                    };
-                    // The memory.high came before our last upscale, so we consider
-                    // it resolved
-                    if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
-                        info!(
-                            "received memory.high event, but it came before our last upscale -> ignoring it"
-                        );
-                        continue;
-                    }
-
-                    // The memory.high came after our latest upscale. We don't
-                    // want to do anything yet, so peek the next event in hopes
-                    // that it's an upscale.
-                    if let Some(upscale_num) = self
-                        .upscaled(&mut upscales)
-                        .context("failed to check if we were upscaled")?
-                    {
-                        if upscale_num > seqnum {
-                            info!(
-                                "received memory.high event, but it came before our last upscale -> ignoring it"
-                            );
-                            continue;
-                        }
-                    }
-
-                    // If it's been long enough since we last froze, freeze the
-                    // cgroup and request upscale
-                    if wait_to_freeze.is_elapsed() {
-                        info!("received memory.high event -> requesting upscale");
-                        waiting_on_upscale = self
-                            .handle_memory_high_event(&mut upscales)
-                            .await
-                            .context("failed to handle upscale")?;
-                        wait_to_freeze
-                            .as_mut()
-                            .reset(Instant::now() + self.config.do_not_freeze_more_often_than);
-                        continue;
-                    }
-
-                    // Ok, we can't freeze, just request upscale
-                    if !waiting_on_upscale {
-                        info!("received memory.high event, but too soon to refreeze -> requesting upscale");
-
-                        // Make check to make sure we haven't been upscaled in the
-                        // meantine (can happen if the agent independently decides
-                        // to upscale us again)
-                        if self
-                            .upscaled(&mut upscales)
-                            .context("failed to check if we were upscaled")?
-                            .is_some()
-                        {
-                            info!("no need to request upscaling because we got upscaled");
-                            continue;
-                        }
-                        self.upscale_requester
-                            .send(())
-                            .await
-                            .context("failed to request upscale")?;
-                        waiting_on_upscale = true;
-                        continue;
-                    }
-
-                    // Shoot, we can't freeze or and we're still waiting on upscale,
-                    // increase memory.high to reduce throttling
-                    let can_increase_memory_high = match last_memory_high_increase_at {
-                        None => true,
-                        Some(t) => t.elapsed() > self.config.memory_high_increase_every,
-                    };
-                    if can_increase_memory_high {
-                        info!(
-                            "received memory.high event, \
-                            but too soon to refreeze and already requested upscale \
-                            -> increasing memory.high"
-                        );
-
-                        // Make check to make sure we haven't been upscaled in the
-                        // meantine (can happen if the agent independently decides
-                        // to upscale us again)
-                        if self
-                            .upscaled(&mut upscales)
-                            .context("failed to check if we were upscaled")?
-                            .is_some()
-                        {
-                            info!("no need to increase memory.high because got upscaled");
-                            continue;
-                        }
-
-                        // Request upscale anyways (the agent will handle deduplicating
-                        // requests)
-                        self.upscale_requester
-                            .send(())
-                            .await
-                            .context("failed to request upscale")?;
-
-                        let memory_high =
-                            self.get_memory_high_bytes().context("failed to get memory.high")?;
-                        let new_high = memory_high + self.config.memory_high_increase_by_bytes;
-                        info!(
-                            current_high_bytes = memory_high,
-                            new_high_bytes = new_high,
-                            "updating memory.high"
-                        );
-                        self.set_memory_high_bytes(new_high)
-                            .context("failed to set memory.high")?;
-                        last_memory_high_increase_at = Some(Instant::now());
-                        continue;
-                    }
-
-                    info!("received memory.high event, but can't do anything");
-                }
-            };
-        }
-    }
-
-    /// Handle a `memory.high`, returning whether we are still waiting on upscale
-    /// by the time the function returns.
-    ///
-    /// The general plan for handling a `memory.high` event is as follows:
-    /// 1. Freeze the cgroup
-    /// 2. Start a timer for `self.config.max_upscale_wait`
-    /// 3. Request upscale
-    /// 4. After the timer elapses or we receive upscale, thaw the cgroup.
-    /// 5. Return whether or not we are still waiting for upscale. If we are,
-    ///    we'll increase the cgroups memory.high to avoid getting oom killed
-    #[tracing::instrument(skip_all)]
-    async fn handle_memory_high_event(
-        &self,
-        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
-    ) -> anyhow::Result<bool> {
-        // Immediately freeze the cgroup before doing anything else.
-        info!("received memory.high event -> freezing cgroup");
-        self.freeze().context("failed to freeze cgroup")?;
-
-        // We'll use this for logging durations
-        let start_time = Instant::now();
-
-        // Await the upscale until we have to unfreeze
-        let timed =
-            tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
-
-        // Request the upscale
-        info!(
-            wait = ?self.config.max_upscale_wait,
-            "sending request for immediate upscaling",
-        );
-        self.upscale_requester
-            .send(())
-            .await
-            .context("failed to request upscale")?;
-
-        let waiting_on_upscale = match timed.await {
-            Ok(Ok(())) => {
-                info!(elapsed = ?start_time.elapsed(), "received upscale in time");
-                false
-            }
-            // **important**: unfreeze the cgroup before ?-reporting the error
-            Ok(Err(e)) => {
-                info!("error waiting for upscale -> thawing cgroup");
-                self.thaw()
-                    .context("failed to thaw cgroup after errored waiting for upscale")?;
-                Err(e.context("failed to await upscale"))?
-            }
-            Err(_) => {
-                info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
-                true
-            }
-        };
-
-        info!("thawing cgroup");
-        self.thaw().context("failed to thaw cgroup")?;
-
-        Ok(waiting_on_upscale)
-    }
-
-    /// Checks whether we were just upscaled, returning the upscale's sequence
-    /// number if so.
-    #[tracing::instrument(skip_all)]
-    fn upscaled(
-        &self,
-        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
-    ) -> anyhow::Result<Option<u64>> {
-        let Sequenced { seqnum, data } = match upscales.try_recv() {
-            Ok(upscale) => upscale,
-            Err(TryRecvError::Empty) => return Ok(None),
-            Err(TryRecvError::Disconnected) => {
-                bail!("upscale notification channel was disconnected")
-            }
-        };
-
-        // Make sure to update the last upscale sequence number
-        self.last_upscale_seqnum.store(seqnum, Ordering::Release);
-        info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
-        Ok(Some(seqnum))
-    }
-
-    /// Await an upscale event, discarding any `memory.high` events received in
-    /// the process.
-    ///
-    /// This is used in `handle_memory_high_event`, where we need to listen
-    /// for upscales in particular so we know if we can thaw the cgroup early.
-    #[tracing::instrument(skip_all)]
-    async fn await_upscale(
-        &self,
-        upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
+        updates: watch::Sender<(Instant, MemoryHistory)>,
    ) -> anyhow::Result<()> {
-        let Sequenced { seqnum, .. } = upscales
-            .recv()
-            .await
-            .context("error listening for upscales")?;
+        // this requirement makes the code a bit easier to work with; see the config for more.
+        assert!(self.config.memory_history_len <= self.config.memory_history_log_interval);

-        self.last_upscale_seqnum.store(seqnum, Ordering::Release);
-        Ok(())
-    }
+        let mut ticker = tokio::time::interval(self.config.memory_poll_interval);
+        ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+        // ticker.reset_immediately(); // FIXME: enable this once updating to tokio >= 1.30.0

-    /// Get the cgroup's name.
-    pub fn path(&self) -> &str {
-        self.cgroup.path()
-    }
-}
+        let mem_controller = self.memory()?;

-// Methods for manipulating the actual cgroup
-impl CgroupWatcher {
-    /// Get a handle on the freezer subsystem.
-    fn freezer(&self) -> anyhow::Result<&FreezerController> {
-        if let Some(Freezer(freezer)) = self
-            .cgroup
-            .subsystems()
-            .iter()
-            .find(|sub| matches!(sub, Freezer(_)))
-        {
-            Ok(freezer)
-        } else {
-            anyhow::bail!("could not find freezer subsystem")
+        // buffer for samples that will be logged. once full, it remains so.
+        let history_log_len = self.config.memory_history_log_interval;
+        let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len];
+
+        for t in 0_u64.. {
+            ticker.tick().await;
+
+            let now = Instant::now();
+            let mem = Self::memory_usage(mem_controller);
+
+            let i = t as usize % history_log_len;
+            history_log_buf[i] = mem;
+
+            // We're taking *at most* memory_history_len values; we may be bounded by the total
+            // number of samples that have come in so far.
+            let samples_count = (t + 1).min(self.config.memory_history_len as u64) as usize;
+            // NB: in `ring_buf_recent_values_iter`, `i` is *inclusive*, which matches the fact
+            // that we just inserted a value there, so the end of the iterator will *include* the
+            // value at i, rather than stopping just short of it.
+            let samples = ring_buf_recent_values_iter(&history_log_buf, i, samples_count);
+
+            let summary = MemoryHistory {
+                avg_non_reclaimable: samples.map(|h| h.non_reclaimable).sum::<u64>()
+                    / samples_count as u64,
+                samples_count,
+                samples_span: self.config.memory_poll_interval * (samples_count - 1) as u32,
+            };
+
+            // Log the current history if it's time to do so. Because `history_log_buf` has length
+            // equal to the logging interval, we can just log the entire buffer every time we set
+            // the last entry, which also means that for this log line, we can ignore that it's a
+            // ring buffer (because all the entries are in order of increasing time).
+            if i == history_log_len - 1 {
+                info!(
+                    history = ?MemoryStatus::debug_slice(&history_log_buf),
+                    summary = ?summary,
+                    "Recent cgroup memory statistics history"
+                );
+            }
+
+            updates
+                .send((now, summary))
+                .context("failed to send MemoryHistory")?;
        }
-    }

-    /// Attempt to freeze the cgroup.
-    pub fn freeze(&self) -> anyhow::Result<()> {
-        self.freezer()
-            .context("failed to get freezer subsystem")?
-            .freeze()
-            .context("failed to freeze")
-    }
-
-    /// Attempt to thaw the cgroup.
-    pub fn thaw(&self) -> anyhow::Result<()> {
-        self.freezer()
-            .context("failed to get freezer subsystem")?
-            .thaw()
-            .context("failed to thaw")
+        unreachable!()
    }

    /// Get a handle on the memory subsystem.
-    ///
-    /// Note: this method does not require `self.memory_update_lock` because
-    /// getting a handle to the subsystem does not access any of the files we
-    /// care about, such as memory.high and memory.events
    fn memory(&self) -> anyhow::Result<&MemController> {
-        if let Some(Mem(memory)) = self
-            .cgroup
+        self.cgroup
            .subsystems()
            .iter()
-            .find(|sub| matches!(sub, Mem(_)))
-        {
-            Ok(memory)
-        } else {
-            anyhow::bail!("could not find memory subsystem")
-        }
-    }
-
-    /// Get cgroup current memory usage.
-    pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
-        Ok(self
-            .memory()
-            .context("failed to get memory subsystem")?
-            .memory_stat()
-            .usage_in_bytes)
-    }
-
-    /// Set cgroup memory.high threshold.
-    pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
-        self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
-    }
-
-    /// Set the cgroup's memory.high to 'max', disabling it.
-    pub fn unset_memory_high(&self) -> anyhow::Result<()> {
-        self.set_memory_high_internal(MaxValue::Max)
-    }
-
-    fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
-        self.memory()
-            .context("failed to get memory subsystem")?
-            .set_mem(cgroups_rs::memory::SetMemory {
-                low: None,
-                high: Some(value),
-                min: None,
-                max: None,
+            .find_map(|sub| match sub {
+                Subsystem::Mem(c) => Some(c),
+                _ => None,
            })
-            .map_err(anyhow::Error::from)
+            .ok_or_else(|| anyhow!("could not find memory subsystem"))
    }

-    /// Get memory.high threshold.
-    pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
-        let high = self
-            .memory()
-            .context("failed to get memory subsystem while getting memory statistics")?
-            .get_mem()
-            .map(|mem| mem.high)
-            .context("failed to get memory statistics from subsystem")?;
-        match high {
-            Some(MaxValue::Max) => Ok(i64::MAX as u64),
-            Some(MaxValue::Value(high)) => Ok(high as u64),
-            None => anyhow::bail!("failed to read memory.high from memory subsystem"),
+    /// Given a handle on the memory subsystem, returns the current memory information
+    fn memory_usage(mem_controller: &MemController) -> MemoryStatus {
+        let stat = mem_controller.memory_stat().stat;
+        MemoryStatus {
+            non_reclaimable: stat.active_anon + stat.inactive_anon,
        }
    }
 }
+
+// Helper function for `CgroupWatcher::watch`
+fn ring_buf_recent_values_iter<T>(
+    buf: &[T],
+    last_value_idx: usize,
+    count: usize,
+) -> impl '_ + Iterator<Item = &T> {
+    // Assertion carried over from `CgroupWatcher::watch`, to make the logic in this function
+    // easier (we only have to add `buf.len()` once, rather than a dynamic number of times).
+    assert!(count <= buf.len());
+
+    buf.iter()
+        // 'cycle' because the values could wrap around
+        .cycle()
+        // with 'cycle', this skip is more like 'offset', and functionally this is
+        // offsettting by 'last_value_idx - count (mod buf.len())', but we have to be
+        // careful to avoid underflow, so we pre-add buf.len().
+        // The '+ 1' is because `last_value_idx` is inclusive, rather than exclusive.
+        .skip((buf.len() + last_value_idx + 1 - count) % buf.len())
+        .take(count)
+}
+
+/// Summary of recent memory usage
+#[derive(Debug, Copy, Clone)]
+pub struct MemoryHistory {
+    /// Rolling average of non-reclaimable memory usage samples over the last `history_period`
+    pub avg_non_reclaimable: u64,
+
+    /// The number of samples used to construct this summary
+    pub samples_count: usize,
+    /// Total timespan between the first and last sample used for this summary
+    pub samples_span: Duration,
+}
+
+#[derive(Debug, Copy, Clone)]
+pub struct MemoryStatus {
+    non_reclaimable: u64,
+}
+
+impl MemoryStatus {
+    fn zeroed() -> Self {
+        MemoryStatus { non_reclaimable: 0 }
+    }
+
+    fn debug_slice(slice: &[Self]) -> impl '_ + Debug {
+        struct DS<'a>(&'a [MemoryStatus]);
+
+        impl<'a> Debug for DS<'a> {
+            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+                f.debug_struct("[MemoryStatus]")
+                    .field(
+                        "non_reclaimable[..]",
+                        &Fields(self.0, |stat: &MemoryStatus| {
+                            BytesToGB(stat.non_reclaimable)
+                        }),
+                    )
+                    .finish()
+            }
+        }
+
+        struct Fields<'a, F>(&'a [MemoryStatus], F);
+
+        impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> {
+            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+                f.debug_list().entries(self.0.iter().map(&self.1)).finish()
+            }
+        }
+
+        struct BytesToGB(u64);
+
+        impl Debug for BytesToGB {
+            fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+                f.write_fmt(format_args!(
+                    "{:.3}Gi",
+                    self.0 as f64 / (1_u64 << 30) as f64
+                ))
+            }
+        }
+
+        DS(slice)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn ring_buf_iter() {
+        let buf = vec![0_i32, 1, 2, 3, 4, 5, 6, 7, 8, 9];
+
+        let values = |offset, count| {
+            super::ring_buf_recent_values_iter(&buf, offset, count)
+                .copied()
+                .collect::<Vec<i32>>()
+        };
+
+        // Boundary conditions: start, end, and entire thing:
+        assert_eq!(values(0, 1), [0]);
+        assert_eq!(values(3, 4), [0, 1, 2, 3]);
+        assert_eq!(values(9, 4), [6, 7, 8, 9]);
+        assert_eq!(values(9, 10), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
+
+        // "normal" operation: no wraparound
+        assert_eq!(values(7, 4), [4, 5, 6, 7]);
+
+        // wraparound:
+        assert_eq!(values(0, 4), [7, 8, 9, 0]);
+        assert_eq!(values(1, 4), [8, 9, 0, 1]);
+        assert_eq!(values(2, 4), [9, 0, 1, 2]);
+        assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]);
+    }
+}
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -12,12 +12,10 @@ use futures::{
    stream::{SplitSink, SplitStream},
    SinkExt, StreamExt,
 };
-use tokio::sync::mpsc;
 use tracing::info;

-use crate::cgroup::Sequenced;
 use crate::protocol::{
-    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
+    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION,
    PROTOCOL_MIN_VERSION,
 };

@@ -36,13 +34,6 @@ pub struct Dispatcher {
    /// We send messages to the agent through `sink`
    sink: SplitSink<WebSocket, Message>,

-    /// Used to notify the cgroup when we are upscaled.
-    pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
-
-    /// When the cgroup requests upscale it will send on this channel. In response
-    /// we send an `UpscaleRequst` to the agent.
-    pub(crate) request_upscale_events: mpsc::Receiver<()>,
-
    /// The protocol version we have agreed to use with the agent. This is negotiated
    /// during the creation of the dispatcher, and should be the highest shared protocol
    /// version.
@@ -61,11 +52,7 @@ impl Dispatcher {
    /// 1. Wait for the agent to sent the range of protocols it supports.
    /// 2. Send a protocol version that works for us as well, or an error if there
    ///    is no compatible version.
-    pub async fn new(
-        stream: WebSocket,
-        notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
-        request_upscale_events: mpsc::Receiver<()>,
-    ) -> anyhow::Result<Self> {
+    pub async fn new(stream: WebSocket) -> anyhow::Result<Self> {
        let (mut sink, mut source) = stream.split();

        // Figure out the highest protocol version we both support
@@ -119,22 +106,10 @@ impl Dispatcher {
        Ok(Self {
            sink,
            source,
-            notify_upscale_events,
-            request_upscale_events,
            proto_version: highest_shared_version,
        })
    }

-    /// Notify the cgroup manager that we have received upscale and wait for
-    /// the acknowledgement.
-    #[tracing::instrument(skip_all, fields(?resources))]
-    pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
-        self.notify_upscale_events
-            .send(resources)
-            .await
-            .context("failed to send resources and oneshot sender across channel")
-    }
-
    /// Send a message to the agent.
    ///
    /// Although this function is small, it has one major benefit: it is the only
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -4,19 +4,17 @@
 //! This is the "Monitor" part of the monitor binary and is the main entrypoint for
 //! all functionality.

-use std::sync::Arc;
+use std::fmt::Debug;
 use std::time::{Duration, Instant};
-use std::{fmt::Debug, mem};

 use anyhow::{bail, Context};
 use axum::extract::ws::{Message, WebSocket};
 use futures::StreamExt;
-use tokio::sync::broadcast;
-use tokio::sync::mpsc;
+use tokio::sync::{broadcast, watch};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};

-use crate::cgroup::{CgroupWatcher, Sequenced};
+use crate::cgroup::{self, CgroupWatcher};
 use crate::dispatcher::Dispatcher;
 use crate::filecache::{FileCacheConfig, FileCacheState};
 use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -28,7 +26,7 @@ use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args
 pub struct Runner {
    config: Config,
    filecache: Option<FileCacheState>,
-    cgroup: Option<Arc<CgroupWatcher>>,
+    cgroup: Option<CgroupState>,
    dispatcher: Dispatcher,

    /// We "mint" new message ids by incrementing this counter and taking the value.
@@ -45,6 +43,14 @@ pub struct Runner {
    kill: broadcast::Receiver<()>,
 }

+#[derive(Debug)]
+struct CgroupState {
+    watcher: watch::Receiver<(Instant, cgroup::MemoryHistory)>,
+    /// If [`cgroup::MemoryHistory::avg_non_reclaimable`] exceeds `threshold`, we send upscale
+    /// requests.
+    threshold: u64,
+}
+
 /// Configuration for a `Runner`
 #[derive(Debug)]
 pub struct Config {
@@ -62,16 +68,56 @@ pub struct Config {
    /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
    /// should be removed once we have a better solution there.
    sys_buffer_bytes: u64,
+
+    /// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
+    /// other words, providing a ceiling for the highest value of the threshold by enforcing that
+    /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
+    /// threshold.
+    ///
+    /// For example, a value of `0.1` means that 10% of total memory must remain after exceeding
+    /// the threshold, so the value of the cgroup threshold would always be capped at 90% of total
+    /// memory.
+    ///
+    /// The default value of `0.15` means that we *guarantee* sending upscale requests if the
+    /// cgroup is using more than 85% of total memory (even if we're *not* separately reserving
+    /// memory for the file cache).
+    cgroup_min_overhead_fraction: f64,
+
+    cgroup_downscale_threshold_buffer_bytes: u64,
 }

 impl Default for Config {
    fn default() -> Self {
        Self {
            sys_buffer_bytes: 100 * MiB,
+            cgroup_min_overhead_fraction: 0.15,
+            cgroup_downscale_threshold_buffer_bytes: 100 * MiB,
        }
    }
 }

+impl Config {
+    fn cgroup_threshold(&self, total_mem: u64, file_cache_disk_size: u64) -> u64 {
+        // If the file cache is in tmpfs, then it will count towards shmem usage of the cgroup,
+        // and thus be non-reclaimable, so we should allow for additional memory usage.
+        //
+        // If the file cache sits on disk, our desired stable system state is for it to be fully
+        // page cached (its contents should only be paged to/from disk in situations where we can't
+        // upscale fast enough). Page-cached memory is reclaimable, so we need to lower the
+        // threshold for non-reclaimable memory so we scale up *before* the kernel starts paging
+        // out the file cache.
+        let memory_remaining_for_cgroup = total_mem.saturating_sub(file_cache_disk_size);
+
+        // Even if we're not separately making room for the file cache (if it's in tmpfs), we still
+        // want our threshold to be met gracefully instead of letting postgres get OOM-killed.
+        // So we guarantee that there's at least `cgroup_min_overhead_fraction` of total memory
+        // remaining above the threshold.
+        let max_threshold = (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64;
+
+        memory_remaining_for_cgroup.min(max_threshold)
+    }
+}
+
 impl Runner {
    /// Create a new monitor.
    #[tracing::instrument(skip_all, fields(?config, ?args))]
@@ -87,12 +133,7 @@ impl Runner {
            "invalid monitor Config: sys_buffer_bytes cannot be 0"
        );

-        // *NOTE*: the dispatcher and cgroup manager talk through these channels
-        // so make sure they each get the correct half, nothing is droppped, etc.
-        let (notified_send, notified_recv) = mpsc::channel(1);
-        let (requesting_send, requesting_recv) = mpsc::channel(1);
-
-        let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
+        let dispatcher = Dispatcher::new(ws)
            .await
            .context("error creating new dispatcher")?;

@@ -106,54 +147,10 @@ impl Runner {
            kill,
        };

-        // If we have both the cgroup and file cache integrations enabled, it's possible for
-        // temporary failures to result in cgroup throttling (from memory.high), that in turn makes
-        // it near-impossible to connect to the file cache (because it times out). Unfortunately,
-        // we *do* still want to determine the file cache size before setting the cgroup's
-        // memory.high, so it's not as simple as just swapping the order.
-        //
-        // Instead, the resolution here is that on vm-monitor startup (note: happens on each
-        // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
-        // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
-        // of a hacky solution, but helps with reliability.
-        if let Some(name) = &args.cgroup {
-            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
-            // now, and then set limits later.
-            info!("initializing cgroup");
-
-            let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
-                .context("failed to create cgroup manager")?;
-
-            info!("temporarily unsetting memory.high");
-
-            // Temporarily un-set cgroup memory.high; see above.
-            cgroup
-                .unset_memory_high()
-                .context("failed to unset memory.high")?;
-
-            let cgroup = Arc::new(cgroup);
-
-            let cgroup_clone = Arc::clone(&cgroup);
-            spawn_with_cancel(
-                token.clone(),
-                |_| error!("cgroup watcher terminated"),
-                async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
-            );
-
-            state.cgroup = Some(cgroup);
-        } else {
-            // *NOTE*: We need to forget the sender so that its drop impl does not get ran.
-            // This allows us to poll it in `Monitor::run` regardless of whether we
-            // are managing a cgroup or not. If we don't forget it, all receives will
-            // immediately return an error because the sender is droped and it will
-            // claim all select! statements, effectively turning `Monitor::run` into
-            // `loop { fail to receive }`.
-            mem::forget(requesting_send);
-        }
-
-        let mut file_cache_reserved_bytes = 0;
        let mem = get_total_system_memory();

+        let mut file_cache_disk_size = 0;
+
        // We need to process file cache initialization before cgroup initialization, so that the memory
        // allocated to the file cache is appropriately taken into account when we decide the cgroup's
        // memory limits.
@@ -164,7 +161,7 @@ impl Runner {
                false => FileCacheConfig::default_in_memory(),
            };

-            let mut file_cache = FileCacheState::new(connstr, config, token)
+            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
                .context("failed to create file cache")?;

@@ -189,23 +186,40 @@ impl Runner {
            if actual_size != new_size {
                info!("file cache size actually got set to {actual_size}")
            }
-            // Mark the resources given to the file cache as reserved, but only if it's in memory.
-            if !args.file_cache_on_disk {
-                file_cache_reserved_bytes = actual_size;
+
+            if args.file_cache_on_disk {
+                file_cache_disk_size = actual_size;
            }

            state.filecache = Some(file_cache);
        }

-        if let Some(cgroup) = &state.cgroup {
-            let available = mem - file_cache_reserved_bytes;
-            let value = cgroup.config.calculate_memory_high_value(available);
+        if let Some(name) = &args.cgroup {
+            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
+            // now, and then set limits later.
+            info!("initializing cgroup");

-            info!(value, "setting memory.high");
+            let cgroup =
+                CgroupWatcher::new(name.clone()).context("failed to create cgroup manager")?;

-            cgroup
-                .set_memory_high_bytes(value)
-                .context("failed to set cgroup memory.high")?;
+            let init_value = cgroup::MemoryHistory {
+                avg_non_reclaimable: 0,
+                samples_count: 0,
+                samples_span: Duration::ZERO,
+            };
+            let (hist_tx, hist_rx) = watch::channel((Instant::now(), init_value));
+
+            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
+                cgroup.watch(hist_tx).await
+            });
+
+            let threshold = state.config.cgroup_threshold(mem, file_cache_disk_size);
+            info!(threshold, "set initial cgroup threshold",);
+
+            state.cgroup = Some(CgroupState {
+                watcher: hist_rx,
+                threshold,
+            });
        }

        Ok(state)
@@ -225,28 +239,51 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
-        let expected_file_cache_mem_usage = self
+        let (expected_file_cache_size, expected_file_cache_disk_size) = self
            .filecache
            .as_ref()
-            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
-            .unwrap_or(0);
-        let mut new_cgroup_mem_high = 0;
+            .map(|file_cache| {
+                let size = file_cache.config.calculate_cache_size(usable_system_memory);
+                match file_cache.config.in_memory {
+                    true => (size, 0),
+                    false => (size, size),
+                }
+            })
+            .unwrap_or((0, 0));
        if let Some(cgroup) = &self.cgroup {
-            new_cgroup_mem_high = cgroup
+            let (last_time, last_history) = *cgroup.watcher.borrow();
+
+            // NB: The ordering of these conditions is intentional. During startup, we should deny
+            // downscaling until we have enough information to determine that it's safe to do so
+            // (i.e. enough samples have come in). But if it's been a while and we *still* haven't
+            // received any information, we should *fail* instead of just denying downscaling.
+            //
+            // `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()`
+            // serves double-duty: it trips if we haven't received *any* metrics for long enough,
+            // OR if we haven't received metrics *recently enough*.
+            //
+            // TODO: make the duration here configurable.
+            if last_time.elapsed() > Duration::from_secs(5) {
+                bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information");
+            } else if last_history.samples_count <= 1 {
+                let status = "haven't received enough cgroup memory stats yet";
+                info!(status, "discontinuing downscale");
+                return Ok((false, status.to_owned()));
+            }
+
+            let new_threshold = self
                .config
-                .calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
+                .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);

-            let current = cgroup
-                .current_memory_usage()
-                .context("failed to fetch cgroup memory")?;
+            let current = last_history.avg_non_reclaimable;

-            if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
+            if new_threshold < current + self.config.cgroup_downscale_threshold_buffer_bytes {
                let status = format!(
-                    "{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
-                    "calculated memory.high too low",
-                    bytes_to_mebibytes(new_cgroup_mem_high),
+                    "{}: {} MiB (new threshold) < {} (current usage) + {} (downscale buffer)",
+                    "calculated memory threshold too low",
+                    bytes_to_mebibytes(new_threshold),
                    bytes_to_mebibytes(current),
-                    bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
+                    bytes_to_mebibytes(self.config.cgroup_downscale_threshold_buffer_bytes)
                );

                info!(status, "discontinuing downscale");
@@ -257,14 +294,14 @@ impl Runner {

        // The downscaling has been approved. Downscale the file cache, then the cgroup.
        let mut status = vec![];
-        let mut file_cache_mem_usage = 0;
+        let mut file_cache_disk_size = 0;
        if let Some(file_cache) = &mut self.filecache {
            let actual_usage = file_cache
-                .set_file_cache_size(expected_file_cache_mem_usage)
+                .set_file_cache_size(expected_file_cache_size)
                .await
                .context("failed to set file cache size")?;
-            if file_cache.config.in_memory {
-                file_cache_mem_usage = actual_usage;
+            if !file_cache.config.in_memory {
+                file_cache_disk_size = actual_usage;
            }
            let message = format!(
                "set file cache size to {} MiB (in memory = {})",
@@ -275,24 +312,18 @@ impl Runner {
            status.push(message);
        }

-        if let Some(cgroup) = &self.cgroup {
-            let available_memory = usable_system_memory - file_cache_mem_usage;
-
-            if file_cache_mem_usage != expected_file_cache_mem_usage {
-                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
-            }
-
-            // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here
-            // since it is properly initialized in the previous cgroup if let block
-            cgroup
-                .set_memory_high_bytes(new_cgroup_mem_high)
-                .context("failed to set cgroup memory.high")?;
+        if let Some(cgroup) = &mut self.cgroup {
+            let new_threshold = self
+                .config
+                .cgroup_threshold(usable_system_memory, file_cache_disk_size);

            let message = format!(
-                "set cgroup memory.high to {} MiB, of new max {} MiB",
-                bytes_to_mebibytes(new_cgroup_mem_high),
-                bytes_to_mebibytes(available_memory)
+                "set cgroup memory threshold from {} MiB to {} MiB, of new total {} MiB",
+                bytes_to_mebibytes(cgroup.threshold),
+                bytes_to_mebibytes(new_threshold),
+                bytes_to_mebibytes(usable_system_memory)
            );
+            cgroup.threshold = new_threshold;
            info!("downscale: {message}");
            status.push(message);
        }
@@ -313,8 +344,7 @@ impl Runner {
        let new_mem = resources.mem;
        let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);

-        // Get the file cache's expected contribution to the memory usage
-        let mut file_cache_mem_usage = 0;
+        let mut file_cache_disk_size = 0;
        if let Some(file_cache) = &mut self.filecache {
            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
            info!(
@@ -327,8 +357,8 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            if file_cache.config.in_memory {
-                file_cache_mem_usage = actual_usage;
+            if !file_cache.config.in_memory {
+                file_cache_disk_size = actual_usage;
            }

            if actual_usage != expected_usage {
@@ -340,18 +370,18 @@ impl Runner {
            }
        }

-        if let Some(cgroup) = &self.cgroup {
-            let available_memory = usable_system_memory - file_cache_mem_usage;
-            let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
+        if let Some(cgroup) = &mut self.cgroup {
+            let new_threshold = self
+                .config
+                .cgroup_threshold(usable_system_memory, file_cache_disk_size);
+
            info!(
-                target = bytes_to_mebibytes(new_cgroup_mem_high),
-                total = bytes_to_mebibytes(new_mem),
-                name = cgroup.path(),
-                "updating cgroup memory.high",
+                "set cgroup memory threshold from {} MiB to {} MiB of new total {} MiB",
+                bytes_to_mebibytes(cgroup.threshold),
+                bytes_to_mebibytes(new_threshold),
+                bytes_to_mebibytes(usable_system_memory)
            );
-            cgroup
-                .set_memory_high_bytes(new_cgroup_mem_high)
-                .context("failed to set cgroup memory.high")?;
+            cgroup.threshold = new_threshold;
        }

        Ok(())
@@ -369,10 +399,6 @@ impl Runner {
                self.handle_upscale(granted)
                    .await
                    .context("failed to handle upscale")?;
-                self.dispatcher
-                    .notify_upscale(Sequenced::new(granted))
-                    .await
-                    .context("failed to notify notify cgroup of upscale")?;
                Ok(Some(OutboundMsg::new(
                    OutboundMsgKind::UpscaleConfirmation {},
                    id,
@@ -416,33 +442,53 @@ impl Runner {
                        Err(e) => bail!("failed to receive kill signal: {e}")
                    }
                }
-                // we need to propagate an upscale request
-                request = self.dispatcher.request_upscale_events.recv() => {
-                    if request.is_none() {
-                        bail!("failed to listen for upscale event from cgroup")
+
+                // New memory stats from the cgroup, *may* need to request upscaling, if we've
+                // exceeded the threshold
+                result = self.cgroup.as_mut().unwrap().watcher.changed(), if self.cgroup.is_some() => {
+                    result.context("failed to receive from cgroup memory stats watcher")?;
+
+                    let cgroup = self.cgroup.as_ref().unwrap();
+
+                    let (_time, cgroup_mem_stat) = *cgroup.watcher.borrow();
+
+                    // If we haven't exceeded the threshold, then we're all ok
+                    if cgroup_mem_stat.avg_non_reclaimable < cgroup.threshold {
+                        continue;
                    }

-                    // If it's been less than 1 second since the last time we requested upscaling,
-                    // ignore the event, to avoid spamming the agent (otherwise, this can happen
-                    // ~1k times per second).
+                    // Otherwise, we generally want upscaling. But, if it's been less than 1 second
+                    // since the last time we requested upscaling, ignore the event, to avoid
+                    // spamming the agent.
                    if let Some(t) = self.last_upscale_request_at {
                        let elapsed = t.elapsed();
                        if elapsed < Duration::from_secs(1) {
-                            info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
+                            info!(
+                                elapsed_millis = elapsed.as_millis(),
+                                avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
+                                threshold = bytes_to_mebibytes(cgroup.threshold),
+                                "cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
+                            );
                            continue;
                        }
                    }

                    self.last_upscale_request_at = Some(Instant::now());

-                    info!("cgroup asking for upscale; forwarding request");
+                    info!(
+                        avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
+                        threshold = bytes_to_mebibytes(cgroup.threshold),
+                        "cgroup memory stats are high enough to upscale, requesting upscale",
+                    );
+
                    self.counter += 2; // Increment, preserving parity (i.e. keep the
                                       // counter odd). See the field comment for more.
                    self.dispatcher
                        .send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
                        .await
                        .context("failed to send message")?;
-                }
+                },
+
                // there is a message from the agent
                msg = self.dispatcher.source.next() => {
                    if let Some(msg) = msg {
@@ -470,11 +516,14 @@ impl Runner {
                                    Ok(Some(out)) => out,
                                    Ok(None) => continue,
                                    Err(e) => {
-                                        let error = e.to_string();
-                                        warn!(?error, "error handling message");
+                                        // use {:#} for our logging because the display impl only
+                                        // gives the outermost cause, and the debug impl
+                                        // pretty-prints the error, whereas {:#} contains all the
+                                        // causes, but is compact (no newlines).
+                                        warn!(error = format!("{e:#}"), "error handling message");
                                        OutboundMsg::new(
                                            OutboundMsgKind::InternalError {
-                                                error
+                                                error: e.to_string(),
                                            },
                                            message.id
                                        )
--- a/libs/walproposer/Cargo.toml
+++ b/libs/walproposer/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "walproposer"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+utils.workspace = true
+postgres_ffi.workspace = true
+
+workspace_hack.workspace = true
+
+[build-dependencies]
+anyhow.workspace = true
+bindgen.workspace = true
--- a/libs/walproposer/bindgen_deps.h
+++ b/libs/walproposer/bindgen_deps.h
@@ -0,0 +1 @@
+#include "walproposer.h"
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -0,0 +1,113 @@
+use std::{env, path::PathBuf, process::Command};
+
+use anyhow::{anyhow, Context};
+use bindgen::CargoCallbacks;
+
+fn main() -> anyhow::Result<()> {
+    // Tell cargo to invalidate the built crate whenever the wrapper changes
+    println!("cargo:rerun-if-changed=bindgen_deps.h");
+
+    // Finding the location of built libraries and Postgres C headers:
+    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
+    // - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
+    let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
+        postgres_install_dir.into()
+    } else {
+        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install")
+    };
+
+    let pg_install_abs = std::fs::canonicalize(pg_install_dir)?;
+    let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib");
+    let walproposer_lib_search_str = walproposer_lib_dir
+        .to_str()
+        .ok_or(anyhow!("Bad non-UTF path"))?;
+
+    let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon");
+    let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
+    let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
+
+    println!("cargo:rustc-link-lib=static=pgport");
+    println!("cargo:rustc-link-lib=static=pgcommon");
+    println!("cargo:rustc-link-lib=static=walproposer");
+    println!("cargo:rustc-link-search={walproposer_lib_search_str}");
+
+    let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
+    let inc_server_path: String = if pg_config_bin.exists() {
+        let output = Command::new(pg_config_bin)
+            .arg("--includedir-server")
+            .output()
+            .context("failed to execute `pg_config --includedir-server`")?;
+
+        if !output.status.success() {
+            panic!("`pg_config --includedir-server` failed")
+        }
+
+        String::from_utf8(output.stdout)
+            .context("pg_config output is not UTF-8")?
+            .trim_end()
+            .into()
+    } else {
+        let server_path = pg_install_abs
+            .join("v16")
+            .join("include")
+            .join("postgresql")
+            .join("server")
+            .into_os_string();
+        server_path
+            .into_string()
+            .map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
+    };
+
+    // The bindgen::Builder is the main entry point
+    // to bindgen, and lets you build up options for
+    // the resulting bindings.
+    let bindings = bindgen::Builder::default()
+        // The input header we would like to generate
+        // bindings for.
+        .header("bindgen_deps.h")
+        // Tell cargo to invalidate the built crate whenever any of the
+        // included header files changed.
+        .parse_callbacks(Box::new(CargoCallbacks))
+        .allowlist_type("WalProposer")
+        .allowlist_type("WalProposerConfig")
+        .allowlist_type("walproposer_api")
+        .allowlist_function("WalProposerCreate")
+        .allowlist_function("WalProposerStart")
+        .allowlist_function("WalProposerBroadcast")
+        .allowlist_function("WalProposerPoll")
+        .allowlist_function("WalProposerFree")
+        .allowlist_var("DEBUG5")
+        .allowlist_var("DEBUG4")
+        .allowlist_var("DEBUG3")
+        .allowlist_var("DEBUG2")
+        .allowlist_var("DEBUG1")
+        .allowlist_var("LOG")
+        .allowlist_var("INFO")
+        .allowlist_var("NOTICE")
+        .allowlist_var("WARNING")
+        .allowlist_var("ERROR")
+        .allowlist_var("FATAL")
+        .allowlist_var("PANIC")
+        .allowlist_var("WPEVENT")
+        .allowlist_var("WL_LATCH_SET")
+        .allowlist_var("WL_SOCKET_READABLE")
+        .allowlist_var("WL_SOCKET_WRITEABLE")
+        .allowlist_var("WL_TIMEOUT")
+        .allowlist_var("WL_SOCKET_CLOSED")
+        .allowlist_var("WL_SOCKET_MASK")
+        .clang_arg("-DWALPROPOSER_LIB")
+        .clang_arg(format!("-I{pgxn_neon}"))
+        .clang_arg(format!("-I{inc_server_path}"))
+        // Finish the builder and generate the bindings.
+        .generate()
+        // Unwrap the Result and panic on failure.
+        .expect("Unable to generate bindings");
+
+    // Write the bindings to the $OUT_DIR/bindings.rs file.
+    let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
+    bindings
+        .write_to_file(out_path)
+        .expect("Couldn't write bindings!");
+
+    Ok(())
+}
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -0,0 +1,455 @@
+#![allow(dead_code)]
+
+use std::ffi::CStr;
+use std::ffi::CString;
+
+use crate::bindings::uint32;
+use crate::bindings::walproposer_api;
+use crate::bindings::PGAsyncReadResult;
+use crate::bindings::PGAsyncWriteResult;
+use crate::bindings::Safekeeper;
+use crate::bindings::Size;
+use crate::bindings::StringInfoData;
+use crate::bindings::TimeLineID;
+use crate::bindings::TimestampTz;
+use crate::bindings::WalProposer;
+use crate::bindings::WalProposerConnStatusType;
+use crate::bindings::WalProposerConnectPollStatusType;
+use crate::bindings::WalProposerExecStatusType;
+use crate::bindings::WalproposerShmemState;
+use crate::bindings::XLogRecPtr;
+use crate::walproposer::ApiImpl;
+use crate::walproposer::WaitResult;
+
+extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_shmem_state()
+    }
+}
+
+extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).start_streaming(startpos)
+    }
+}
+
+extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_flush_rec_ptr()
+    }
+}
+
+extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_current_timestamp()
+    }
+}
+
+extern "C" fn conn_error_message(sk: *mut Safekeeper) -> *mut ::std::os::raw::c_char {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let msg = (*api).conn_error_message(&mut (*sk));
+        let msg = CString::new(msg).unwrap();
+        // TODO: fix leaking error message
+        msg.into_raw()
+    }
+}
+
+extern "C" fn conn_status(sk: *mut Safekeeper) -> WalProposerConnStatusType {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_status(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_connect_start(sk: *mut Safekeeper) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_connect_start(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_connect_poll(sk: *mut Safekeeper) -> WalProposerConnectPollStatusType {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_connect_poll(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_send_query(sk: *mut Safekeeper, query: *mut ::std::os::raw::c_char) -> bool {
+    let query = unsafe { CStr::from_ptr(query) };
+    let query = query.to_str().unwrap();
+
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_send_query(&mut (*sk), query)
+    }
+}
+
+extern "C" fn conn_get_query_result(sk: *mut Safekeeper) -> WalProposerExecStatusType {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_get_query_result(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_flush(sk: *mut Safekeeper) -> ::std::os::raw::c_int {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_flush(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_finish(sk: *mut Safekeeper) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_finish(&mut (*sk))
+    }
+}
+
+extern "C" fn conn_async_read(
+    sk: *mut Safekeeper,
+    buf: *mut *mut ::std::os::raw::c_char,
+    amount: *mut ::std::os::raw::c_int,
+) -> PGAsyncReadResult {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let (res, result) = (*api).conn_async_read(&mut (*sk));
+
+        // This function has guarantee that returned buf will be valid until
+        // the next call. So we can store a Vec in each Safekeeper and reuse
+        // it on the next call.
+        let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
+
+        inbuf.clear();
+        inbuf.extend_from_slice(res);
+
+        // Put a Vec back to sk->inbuf and return data ptr.
+        *buf = store_vec_u8(&mut (*sk).inbuf, inbuf);
+        *amount = res.len() as i32;
+
+        result
+    }
+}
+
+extern "C" fn conn_async_write(
+    sk: *mut Safekeeper,
+    buf: *const ::std::os::raw::c_void,
+    size: usize,
+) -> PGAsyncWriteResult {
+    unsafe {
+        let buf = std::slice::from_raw_parts(buf as *const u8, size);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_async_write(&mut (*sk), buf)
+    }
+}
+
+extern "C" fn conn_blocking_write(
+    sk: *mut Safekeeper,
+    buf: *const ::std::os::raw::c_void,
+    size: usize,
+) -> bool {
+    unsafe {
+        let buf = std::slice::from_raw_parts(buf as *const u8, size);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).conn_blocking_write(&mut (*sk), buf)
+    }
+}
+
+extern "C" fn recovery_download(
+    sk: *mut Safekeeper,
+    _timeline: TimeLineID,
+    startpos: XLogRecPtr,
+    endpos: XLogRecPtr,
+) -> bool {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).recovery_download(&mut (*sk), startpos, endpos)
+    }
+}
+
+extern "C" fn wal_read(
+    sk: *mut Safekeeper,
+    buf: *mut ::std::os::raw::c_char,
+    startptr: XLogRecPtr,
+    count: Size,
+) {
+    unsafe {
+        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).wal_read(&mut (*sk), buf, startptr)
+    }
+}
+
+extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).wal_reader_allocate(&mut (*sk));
+    }
+}
+
+extern "C" fn free_event_set(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).free_event_set(&mut (*wp));
+    }
+}
+
+extern "C" fn init_event_set(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).init_event_set(&mut (*wp));
+    }
+}
+
+extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).update_event_set(&mut (*sk), events);
+    }
+}
+
+extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
+    unsafe {
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).add_safekeeper_event_set(&mut (*sk), events);
+    }
+}
+
+extern "C" fn wait_event_set(
+    wp: *mut WalProposer,
+    timeout: ::std::os::raw::c_long,
+    event_sk: *mut *mut Safekeeper,
+    events: *mut uint32,
+) -> ::std::os::raw::c_int {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let result = (*api).wait_event_set(&mut (*wp), timeout);
+        match result {
+            WaitResult::Latch => {
+                *event_sk = std::ptr::null_mut();
+                *events = crate::bindings::WL_LATCH_SET;
+                1
+            }
+            WaitResult::Timeout => {
+                *event_sk = std::ptr::null_mut();
+                *events = crate::bindings::WL_TIMEOUT;
+                0
+            }
+            WaitResult::Network(sk, event_mask) => {
+                *event_sk = sk;
+                *events = event_mask;
+                1
+            }
+        }
+    }
+}
+
+extern "C" fn strong_random(
+    wp: *mut WalProposer,
+    buf: *mut ::std::os::raw::c_void,
+    len: usize,
+) -> bool {
+    unsafe {
+        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, len);
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).strong_random(buf)
+    }
+}
+
+extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).get_redo_start_lsn()
+    }
+}
+
+extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).finish_sync_safekeepers(lsn)
+    }
+}
+
+extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
+    }
+}
+
+extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).confirm_wal_streamed(&mut (*wp), lsn)
+    }
+}
+
+extern "C" fn log_internal(
+    wp: *mut WalProposer,
+    level: ::std::os::raw::c_int,
+    line: *const ::std::os::raw::c_char,
+) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        let line = CStr::from_ptr(line);
+        let line = line.to_str().unwrap();
+        (*api).log_internal(&mut (*wp), Level::from(level as u32), line)
+    }
+}
+
+extern "C" fn after_election(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).after_election(&mut (*wp))
+    }
+}
+
+#[derive(Debug)]
+pub enum Level {
+    Debug5,
+    Debug4,
+    Debug3,
+    Debug2,
+    Debug1,
+    Log,
+    Info,
+    Notice,
+    Warning,
+    Error,
+    Fatal,
+    Panic,
+    WPEvent,
+}
+
+impl Level {
+    pub fn from(elevel: u32) -> Level {
+        use crate::bindings::*;
+
+        match elevel {
+            DEBUG5 => Level::Debug5,
+            DEBUG4 => Level::Debug4,
+            DEBUG3 => Level::Debug3,
+            DEBUG2 => Level::Debug2,
+            DEBUG1 => Level::Debug1,
+            LOG => Level::Log,
+            INFO => Level::Info,
+            NOTICE => Level::Notice,
+            WARNING => Level::Warning,
+            ERROR => Level::Error,
+            FATAL => Level::Fatal,
+            PANIC => Level::Panic,
+            WPEVENT => Level::WPEvent,
+            _ => panic!("unknown log level {}", elevel),
+        }
+    }
+}
+
+pub(crate) fn create_api() -> walproposer_api {
+    walproposer_api {
+        get_shmem_state: Some(get_shmem_state),
+        start_streaming: Some(start_streaming),
+        get_flush_rec_ptr: Some(get_flush_rec_ptr),
+        get_current_timestamp: Some(get_current_timestamp),
+        conn_error_message: Some(conn_error_message),
+        conn_status: Some(conn_status),
+        conn_connect_start: Some(conn_connect_start),
+        conn_connect_poll: Some(conn_connect_poll),
+        conn_send_query: Some(conn_send_query),
+        conn_get_query_result: Some(conn_get_query_result),
+        conn_flush: Some(conn_flush),
+        conn_finish: Some(conn_finish),
+        conn_async_read: Some(conn_async_read),
+        conn_async_write: Some(conn_async_write),
+        conn_blocking_write: Some(conn_blocking_write),
+        recovery_download: Some(recovery_download),
+        wal_read: Some(wal_read),
+        wal_reader_allocate: Some(wal_reader_allocate),
+        free_event_set: Some(free_event_set),
+        init_event_set: Some(init_event_set),
+        update_event_set: Some(update_event_set),
+        add_safekeeper_event_set: Some(add_safekeeper_event_set),
+        wait_event_set: Some(wait_event_set),
+        strong_random: Some(strong_random),
+        get_redo_start_lsn: Some(get_redo_start_lsn),
+        finish_sync_safekeepers: Some(finish_sync_safekeepers),
+        process_safekeeper_feedback: Some(process_safekeeper_feedback),
+        confirm_wal_streamed: Some(confirm_wal_streamed),
+        log_internal: Some(log_internal),
+        after_election: Some(after_election),
+    }
+}
+
+impl std::fmt::Display for Level {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
+/// Take ownership of `Vec<u8>` from StringInfoData.
+pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
+    if pg.data.is_null() {
+        return None;
+    }
+
+    let ptr = pg.data as *mut u8;
+    let length = pg.len as usize;
+    let capacity = pg.maxlen as usize;
+
+    pg.data = std::ptr::null_mut();
+    pg.len = 0;
+    pg.maxlen = 0;
+
+    unsafe { Some(Vec::from_raw_parts(ptr, length, capacity)) }
+}
+
+/// Store `Vec<u8>` in StringInfoData.
+fn store_vec_u8(pg: &mut StringInfoData, vec: Vec<u8>) -> *mut ::std::os::raw::c_char {
+    let ptr = vec.as_ptr() as *mut ::std::os::raw::c_char;
+    let length = vec.len();
+    let capacity = vec.capacity();
+
+    assert!(pg.data.is_null());
+
+    pg.data = ptr;
+    pg.len = length as i32;
+    pg.maxlen = capacity as i32;
+
+    std::mem::forget(vec);
+
+    ptr
+}
--- a/libs/walproposer/src/lib.rs
+++ b/libs/walproposer/src/lib.rs
@@ -0,0 +1,14 @@
+pub mod bindings {
+    #![allow(non_upper_case_globals)]
+    #![allow(non_camel_case_types)]
+    #![allow(non_snake_case)]
+    // bindgen creates some unsafe code with no doc comments.
+    #![allow(clippy::missing_safety_doc)]
+    // noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
+    #![allow(clippy::useless_transmute)]
+
+    include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
+}
+
+pub mod api_bindings;
+pub mod walproposer;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -0,0 +1,485 @@
+use std::ffi::CString;
+
+use postgres_ffi::WAL_SEGMENT_SIZE;
+use utils::id::TenantTimelineId;
+
+use crate::{
+    api_bindings::{create_api, take_vec_u8, Level},
+    bindings::{
+        Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
+        WalProposerStart,
+    },
+};
+
+/// Rust high-level wrapper for C walproposer API. Many methods are not required
+/// for simple cases, hence todo!() in default implementations.
+///
+/// Refer to `pgxn/neon/walproposer.h` for documentation.
+pub trait ApiImpl {
+    fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState {
+        todo!()
+    }
+
+    fn start_streaming(&self, _startpos: u64) {
+        todo!()
+    }
+
+    fn get_flush_rec_ptr(&self) -> u64 {
+        todo!()
+    }
+
+    fn get_current_timestamp(&self) -> i64 {
+        todo!()
+    }
+
+    fn conn_error_message(&self, _sk: &mut Safekeeper) -> String {
+        todo!()
+    }
+
+    fn conn_status(&self, _sk: &mut Safekeeper) -> crate::bindings::WalProposerConnStatusType {
+        todo!()
+    }
+
+    fn conn_connect_start(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    fn conn_connect_poll(
+        &self,
+        _sk: &mut Safekeeper,
+    ) -> crate::bindings::WalProposerConnectPollStatusType {
+        todo!()
+    }
+
+    fn conn_send_query(&self, _sk: &mut Safekeeper, _query: &str) -> bool {
+        todo!()
+    }
+
+    fn conn_get_query_result(
+        &self,
+        _sk: &mut Safekeeper,
+    ) -> crate::bindings::WalProposerExecStatusType {
+        todo!()
+    }
+
+    fn conn_flush(&self, _sk: &mut Safekeeper) -> i32 {
+        todo!()
+    }
+
+    fn conn_finish(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) {
+        todo!()
+    }
+
+    fn conn_async_write(
+        &self,
+        _sk: &mut Safekeeper,
+        _buf: &[u8],
+    ) -> crate::bindings::PGAsyncWriteResult {
+        todo!()
+    }
+
+    fn conn_blocking_write(&self, _sk: &mut Safekeeper, _buf: &[u8]) -> bool {
+        todo!()
+    }
+
+    fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
+        todo!()
+    }
+
+    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
+        todo!()
+    }
+
+    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
+        todo!()
+    }
+
+    fn free_event_set(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+
+    fn init_event_set(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+
+    fn update_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
+        todo!()
+    }
+
+    fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
+        todo!()
+    }
+
+    fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
+        todo!()
+    }
+
+    fn strong_random(&self, _buf: &mut [u8]) -> bool {
+        todo!()
+    }
+
+    fn get_redo_start_lsn(&self) -> u64 {
+        todo!()
+    }
+
+    fn finish_sync_safekeepers(&self, _lsn: u64) {
+        todo!()
+    }
+
+    fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
+        todo!()
+    }
+
+    fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
+        todo!()
+    }
+
+    fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
+        todo!()
+    }
+
+    fn after_election(&self, _wp: &mut WalProposer) {
+        todo!()
+    }
+}
+
+pub enum WaitResult {
+    Latch,
+    Timeout,
+    Network(*mut Safekeeper, u32),
+}
+
+pub struct Config {
+    /// Tenant and timeline id
+    pub ttid: TenantTimelineId,
+    /// List of safekeepers in format `host:port`
+    pub safekeepers_list: Vec<String>,
+    /// Safekeeper reconnect timeout in milliseconds
+    pub safekeeper_reconnect_timeout: i32,
+    /// Safekeeper connection timeout in milliseconds
+    pub safekeeper_connection_timeout: i32,
+    /// walproposer mode, finish when all safekeepers are synced or subscribe
+    /// to WAL streaming
+    pub sync_safekeepers: bool,
+}
+
+/// WalProposer main struct. C methods are reexported as Rust functions.
+pub struct Wrapper {
+    wp: *mut WalProposer,
+    _safekeepers_list_vec: Vec<u8>,
+}
+
+impl Wrapper {
+    pub fn new(api: Box<dyn ApiImpl>, config: Config) -> Wrapper {
+        let neon_tenant = CString::new(config.ttid.tenant_id.to_string())
+            .unwrap()
+            .into_raw();
+        let neon_timeline = CString::new(config.ttid.timeline_id.to_string())
+            .unwrap()
+            .into_raw();
+
+        let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(","))
+            .unwrap()
+            .into_bytes_with_nul();
+        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
+        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
+
+        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
+
+        let c_config = WalProposerConfig {
+            neon_tenant,
+            neon_timeline,
+            safekeepers_list,
+            safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
+            safekeeper_connection_timeout: config.safekeeper_connection_timeout,
+            wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
+            syncSafekeepers: config.sync_safekeepers,
+            systemId: 0,
+            pgTimeline: 1,
+            callback_data,
+        };
+        let c_config = Box::into_raw(Box::new(c_config));
+
+        let api = create_api();
+        let wp = unsafe { WalProposerCreate(c_config, api) };
+        Wrapper {
+            wp,
+            _safekeepers_list_vec: safekeepers_list_vec,
+        }
+    }
+
+    pub fn start(&self) {
+        unsafe { WalProposerStart(self.wp) }
+    }
+}
+
+impl Drop for Wrapper {
+    fn drop(&mut self) {
+        unsafe {
+            let config = (*self.wp).config;
+            drop(Box::from_raw(
+                (*config).callback_data as *mut Box<dyn ApiImpl>,
+            ));
+            drop(CString::from_raw((*config).neon_tenant));
+            drop(CString::from_raw((*config).neon_timeline));
+            drop(Box::from_raw(config));
+
+            for i in 0..(*self.wp).n_safekeepers {
+                let sk = &mut (*self.wp).safekeeper[i as usize];
+                take_vec_u8(&mut sk.inbuf);
+            }
+
+            WalProposerFree(self.wp);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{
+        cell::Cell,
+        sync::{atomic::AtomicUsize, mpsc::sync_channel},
+    };
+
+    use utils::id::TenantTimelineId;
+
+    use crate::{api_bindings::Level, walproposer::Wrapper};
+
+    use super::ApiImpl;
+
+    #[derive(Clone, Copy, Debug)]
+    struct WaitEventsData {
+        sk: *mut crate::bindings::Safekeeper,
+        event_mask: u32,
+    }
+
+    struct MockImpl {
+        // data to return from wait_event_set
+        wait_events: Cell<WaitEventsData>,
+        // walproposer->safekeeper messages
+        expected_messages: Vec<Vec<u8>>,
+        expected_ptr: AtomicUsize,
+        // safekeeper->walproposer messages
+        safekeeper_replies: Vec<Vec<u8>>,
+        replies_ptr: AtomicUsize,
+        // channel to send LSN to the main thread
+        sync_channel: std::sync::mpsc::SyncSender<u64>,
+    }
+
+    impl MockImpl {
+        fn check_walproposer_msg(&self, msg: &[u8]) {
+            let ptr = self
+                .expected_ptr
+                .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
+
+            if ptr >= self.expected_messages.len() {
+                panic!("unexpected message from walproposer");
+            }
+
+            let expected_msg = &self.expected_messages[ptr];
+            assert_eq!(msg, expected_msg.as_slice());
+        }
+
+        fn next_safekeeper_reply(&self) -> &[u8] {
+            let ptr = self
+                .replies_ptr
+                .fetch_add(1, std::sync::atomic::Ordering::SeqCst);
+
+            if ptr >= self.safekeeper_replies.len() {
+                panic!("no more safekeeper replies");
+            }
+
+            &self.safekeeper_replies[ptr]
+        }
+    }
+
+    impl ApiImpl for MockImpl {
+        fn get_current_timestamp(&self) -> i64 {
+            println!("get_current_timestamp");
+            0
+        }
+
+        fn conn_status(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerConnStatusType {
+            println!("conn_status");
+            crate::bindings::WalProposerConnStatusType_WP_CONNECTION_OK
+        }
+
+        fn conn_connect_start(&self, _: &mut crate::bindings::Safekeeper) {
+            println!("conn_connect_start");
+        }
+
+        fn conn_connect_poll(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerConnectPollStatusType {
+            println!("conn_connect_poll");
+            crate::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK
+        }
+
+        fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool {
+            println!("conn_send_query: {}", query);
+            true
+        }
+
+        fn conn_get_query_result(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> crate::bindings::WalProposerExecStatusType {
+            println!("conn_get_query_result");
+            crate::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH
+        }
+
+        fn conn_async_read(
+            &self,
+            _: &mut crate::bindings::Safekeeper,
+        ) -> (&[u8], crate::bindings::PGAsyncReadResult) {
+            println!("conn_async_read");
+            let reply = self.next_safekeeper_reply();
+            println!("conn_async_read result: {:?}", reply);
+            (
+                reply,
+                crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS,
+            )
+        }
+
+        fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
+            println!("conn_blocking_write: {:?}", buf);
+            self.check_walproposer_msg(buf);
+            true
+        }
+
+        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
+            println!("wal_reader_allocate")
+        }
+
+        fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
+            println!("free_event_set")
+        }
+
+        fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
+            println!("init_event_set")
+        }
+
+        fn update_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
+            println!(
+                "update_event_set, sk={:?}, events_mask={:#b}",
+                sk as *mut crate::bindings::Safekeeper, event_mask
+            );
+            self.wait_events.set(WaitEventsData { sk, event_mask });
+        }
+
+        fn add_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
+            println!(
+                "add_safekeeper_event_set, sk={:?}, events_mask={:#b}",
+                sk as *mut crate::bindings::Safekeeper, event_mask
+            );
+            self.wait_events.set(WaitEventsData { sk, event_mask });
+        }
+
+        fn wait_event_set(
+            &self,
+            _: &mut crate::bindings::WalProposer,
+            timeout_millis: i64,
+        ) -> super::WaitResult {
+            let data = self.wait_events.get();
+            println!(
+                "wait_event_set, timeout_millis={}, res={:?}",
+                timeout_millis, data
+            );
+            super::WaitResult::Network(data.sk, data.event_mask)
+        }
+
+        fn strong_random(&self, buf: &mut [u8]) -> bool {
+            println!("strong_random");
+            buf.fill(0);
+            true
+        }
+
+        fn finish_sync_safekeepers(&self, lsn: u64) {
+            self.sync_channel.send(lsn).unwrap();
+            panic!("sync safekeepers finished at lsn={}", lsn);
+        }
+
+        fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
+            println!("walprop_log[{}] {}", level, msg);
+        }
+
+        fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
+            println!("after_election");
+        }
+    }
+
+    /// Test that walproposer can successfully connect to safekeeper and finish
+    /// sync_safekeepers. API is mocked in MockImpl.
+    ///
+    /// Run this test with valgrind to detect leaks:
+    /// `valgrind --leak-check=full target/debug/deps/walproposer-<build>`
+    #[test]
+    fn test_simple_sync_safekeepers() -> anyhow::Result<()> {
+        let ttid = TenantTimelineId::new(
+            "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
+            "9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
+        );
+
+        let (sender, receiver) = sync_channel(1);
+
+        let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
+            wait_events: Cell::new(WaitEventsData {
+                sk: std::ptr::null_mut(),
+                event_mask: 0,
+            }),
+            expected_messages: vec![
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
+                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
+                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
+                ],
+                // VoteRequest(VoteRequest { term: 3 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0,
+                ],
+            ],
+            expected_ptr: AtomicUsize::new(0),
+            safekeeper_replies: vec![
+                // Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
+                vec![
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                ],
+                // VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
+                vec![
+                    118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
+                    5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
+                    0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
+                ],
+            ],
+            replies_ptr: AtomicUsize::new(0),
+            sync_channel: sender,
+        });
+        let config = crate::walproposer::Config {
+            ttid,
+            safekeepers_list: vec!["localhost:5000".to_string()],
+            safekeeper_reconnect_timeout: 1000,
+            safekeeper_connection_timeout: 10000,
+            sync_safekeepers: true,
+        };
+
+        let wp = Wrapper::new(my_impl, config);
+
+        // walproposer will panic when it finishes sync_safekeepers
+        std::panic::catch_unwind(|| wp.start()).unwrap_err();
+        // validate the resulting LSN
+        assert_eq!(receiver.recv()?, 1337);
+        Ok(())
+        // drop() will free up resources here
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -17,6 +17,8 @@ async-stream.workspace = true
 async-trait.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
+camino.workspace = true
+camino-tempfile.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
 close_fds.workspace = true
@@ -80,7 +82,6 @@ enum-map.workspace = true
 enumset.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
-tempfile.workspace = true

 [dev-dependencies]
 criterion.workspace = true
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -11,10 +11,7 @@ use std::sync::{Arc, Barrier};

 use bytes::{Buf, Bytes};
 use pageserver::{
-    config::PageServerConf,
-    repository::Key,
-    walrecord::NeonWalRecord,
-    walredo::{PostgresRedoManager, WalRedoError},
+    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
 use utils::{id::TenantId, lsn::Lsn};

@@ -25,7 +22,7 @@ fn redo_scenarios(c: &mut Criterion) {
    // input to the stderr.
    // utils::logging::init(utils::logging::LogFormat::Plain).unwrap();

-    let repo_dir = tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
+    let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();

    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
    let conf = Box::leak(Box::new(conf));
@@ -35,9 +32,15 @@ fn redo_scenarios(c: &mut Criterion) {

    let manager = Arc::new(manager);

-    tracing::info!("executing first");
-    short().execute(&manager).unwrap();
-    tracing::info!("first executed");
+    {
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+        tracing::info!("executing first");
+        short().execute(rt.handle(), &manager).unwrap();
+        tracing::info!("first executed");
+    }

    let thread_counts = [1, 2, 4, 8, 16];

@@ -80,9 +83,14 @@ fn add_multithreaded_walredo_requesters(
    assert_ne!(threads, 0);

    if threads == 1 {
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .unwrap();
+        let handle = rt.handle();
        b.iter_batched_ref(
            || Some(input_factory()),
-            |input| execute_all(input.take(), manager),
+            |input| execute_all(input.take(), handle, manager),
            criterion::BatchSize::PerIteration,
        );
    } else {
@@ -98,19 +106,26 @@ fn add_multithreaded_walredo_requesters(
                    let manager = manager.clone();
                    let barrier = barrier.clone();
                    let work_rx = work_rx.clone();
-                    move || loop {
-                        // queue up and wait if we want to go another round
-                        if work_rx.lock().unwrap().recv().is_err() {
-                            break;
+                    move || {
+                        let rt = tokio::runtime::Builder::new_current_thread()
+                            .enable_all()
+                            .build()
+                            .unwrap();
+                        let handle = rt.handle();
+                        loop {
+                            // queue up and wait if we want to go another round
+                            if work_rx.lock().unwrap().recv().is_err() {
+                                break;
+                            }
+
+                            let input = Some(input_factory());
+
+                            barrier.wait();
+
+                            execute_all(input, handle, &manager).unwrap();
+
+                            barrier.wait();
                        }
-
-                        let input = Some(input_factory());
-
-                        barrier.wait();
-
-                        execute_all(input, &manager).unwrap();
-
-                        barrier.wait();
                    }
                })
            })
@@ -152,15 +167,19 @@ impl Drop for JoinOnDrop {
    }
 }

-fn execute_all<I>(input: I, manager: &PostgresRedoManager) -> Result<(), WalRedoError>
+fn execute_all<I>(
+    input: I,
+    handle: &tokio::runtime::Handle,
+    manager: &PostgresRedoManager,
+) -> anyhow::Result<()>
 where
    I: IntoIterator<Item = Request>,
 {
    // just fire all requests as fast as possible
    input.into_iter().try_for_each(|req| {
-        let page = req.execute(manager)?;
+        let page = req.execute(handle, manager)?;
        assert_eq!(page.remaining(), 8192);
-        Ok::<_, WalRedoError>(())
+        anyhow::Ok(())
    })
 }

@@ -473,9 +492,11 @@ struct Request {
 }

 impl Request {
-    fn execute(self, manager: &PostgresRedoManager) -> Result<Bytes, WalRedoError> {
-        use pageserver::walredo::WalRedoManager;
-
+    fn execute(
+        self,
+        rt: &tokio::runtime::Handle,
+        manager: &PostgresRedoManager,
+    ) -> anyhow::Result<Bytes> {
        let Request {
            key,
            lsn,
@@ -484,6 +505,6 @@ impl Request {
            pg_version,
        } = self;

-        manager.request_redo(key, lsn, base_img, records, pg_version)
+        rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
    }
 }
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -9,6 +9,7 @@ license.workspace = true
 [dependencies]
 anyhow.workspace = true
 bytes.workspace = true
+camino.workspace = true
 clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
 pageserver = { path = ".." }
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -3,13 +3,14 @@
 //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.

 use anyhow::Result;
+use camino::{Utf8Path, Utf8PathBuf};
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
-use std::{fs, path::Path, str};
+use std::{fs, str};

 use pageserver::page_cache::PAGE_SZ;
 use pageserver::repository::{Key, KEY_SIZE};
@@ -98,7 +99,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 }

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
-async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
+async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
    let file = FileBlockReader::new(VirtualFile::open(path).await?);
    let summary_blk = file.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -167,7 +168,9 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
                    parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if layer_file.is_delta {
-                        layer_file.holes = get_holes(&layer.path(), max_holes, &ctx).await?;
+                        let layer_path =
+                            Utf8PathBuf::from_path_buf(layer.path()).expect("non-Unicode path");
+                        layer_file.holes = get_holes(&layer_path, max_holes, &ctx).await?;
                        n_deltas += 1;
                    }
                    layers.push(layer_file);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -1,6 +1,7 @@
 use std::path::{Path, PathBuf};

 use anyhow::Result;
+use camino::Utf8Path;
 use clap::Subcommand;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
@@ -47,7 +48,7 @@ pub(crate) enum LayerCmd {
 }

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
-    let path = path.as_ref();
+    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
    virtual_file::init(10);
    page_cache::init(100);
    let file = FileBlockReader::new(VirtualFile::open(path).await?);
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -8,6 +8,7 @@ mod draw_timeline_dir;
 mod layer_map_analyzer;
 mod layers;

+use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
 use layers::LayerCmd;
 use pageserver::{
@@ -18,7 +19,6 @@ use pageserver::{
    virtual_file,
 };
 use postgres_ffi::ControlFileData;
-use std::path::{Path, PathBuf};
 use utils::{lsn::Lsn, project_git_version};

 project_git_version!(GIT_VERSION);
@@ -49,7 +49,7 @@ enum Commands {
 #[derive(Parser)]
 struct MetadataCmd {
    /// Input metadata file path
-    metadata_path: PathBuf,
+    metadata_path: Utf8PathBuf,
    /// Replace disk consistent Lsn
    disk_consistent_lsn: Option<Lsn>,
    /// Replace previous record Lsn
@@ -61,13 +61,13 @@ struct MetadataCmd {
 #[derive(Parser)]
 struct PrintLayerFileCmd {
    /// Pageserver data path
-    path: PathBuf,
+    path: Utf8PathBuf,
 }

 #[derive(Parser)]
 struct AnalyzeLayerMapCmd {
    /// Pageserver data path
-    path: PathBuf,
+    path: Utf8PathBuf,
    /// Max holes
    max_holes: Option<usize>,
 }
@@ -102,7 +102,7 @@ async fn main() -> anyhow::Result<()> {
    Ok(())
 }

-fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
+fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
    let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
    println!("{control_file:?}");
    let control_file_initdb = Lsn(control_file.checkPoint);
@@ -114,7 +114,7 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
    Ok(())
 }

-async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
+async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
    virtual_file::init(10);
    page_cache::init(100);
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,6 +13,7 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, BytesMut};
 use fail::fail_point;
+use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
 use tokio::io;
@@ -180,6 +181,7 @@ where
            }
        }

+        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
        for ((spcnode, dbnode), has_relmap_file) in
            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
@@ -213,6 +215,34 @@ where
                    self.add_rel(rel, rel).await?;
                }
            }
+
+            for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
+                if path.starts_with("pg_replslot") {
+                    let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
+                    let restart_lsn = Lsn(u64::from_le_bytes(
+                        content[offs..offs + 8].try_into().unwrap(),
+                    ));
+                    info!("Replication slot {} restart LSN={}", path, restart_lsn);
+                    min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
+                }
+                let header = new_tar_header(&path, content.len() as u64)?;
+                self.ar
+                    .append(&header, &*content)
+                    .await
+                    .context("could not add aux file to basebackup tarball")?;
+            }
+        }
+        if min_restart_lsn != Lsn::MAX {
+            info!(
+                "Min restart LSN for logical replication is {}",
+                min_restart_lsn
+            );
+            let data = min_restart_lsn.0.to_le_bytes();
+            let header = new_tar_header("restart.lsn", data.len() as u64)?;
+            self.ar
+                .append(&header, &data[..])
+                .await
+                .context("could not add restart.lsn file to basebackup tarball")?;
        }
        for xid in self
            .timeline
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -2,9 +2,11 @@

 use std::env::{var, VarError};
 use std::sync::Arc;
-use std::{env, ops::ControlFlow, path::Path, str::FromStr};
+use std::time::Duration;
+use std::{env, ops::ControlFlow, str::FromStr};

 use anyhow::{anyhow, Context};
+use camino::Utf8Path;
 use clap::{Arg, ArgAction, Command};

 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
@@ -65,21 +67,17 @@ fn main() -> anyhow::Result<()> {

    let workdir = arg_matches
        .get_one::<String>("workdir")
-        .map(Path::new)
-        .unwrap_or_else(|| Path::new(".neon"));
+        .map(Utf8Path::new)
+        .unwrap_or_else(|| Utf8Path::new(".neon"));
    let workdir = workdir
-        .canonicalize()
-        .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
+        .canonicalize_utf8()
+        .with_context(|| format!("Error opening workdir '{workdir}'"))?;

    let cfg_file_path = workdir.join("pageserver.toml");

    // Set CWD to workdir for non-daemon modes
-    env::set_current_dir(&workdir).with_context(|| {
-        format!(
-            "Failed to set application's current dir to '{}'",
-            workdir.display()
-        )
-    })?;
+    env::set_current_dir(&workdir)
+        .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?;

    let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
        ControlFlow::Continue(conf) => conf,
@@ -115,12 +113,8 @@ fn main() -> anyhow::Result<()> {

    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
-        utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
-            format!(
-                "Failed to create tenants root dir at '{}'",
-                tenants_path.display()
-            )
-        })?;
+        utils::crashsafe::create_dir_all(conf.tenants_path())
+            .with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
    }

    // Initialize up failpoints support
@@ -137,9 +131,9 @@ fn main() -> anyhow::Result<()> {
 }

 fn initialize_config(
-    cfg_file_path: &Path,
+    cfg_file_path: &Utf8Path,
    arg_matches: clap::ArgMatches,
-    workdir: &Path,
+    workdir: &Utf8Path,
 ) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
    let init = arg_matches.get_flag("init");
    let update_config = init || arg_matches.get_flag("update-config");
@@ -147,33 +141,22 @@ fn initialize_config(
    let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
        if init {
            anyhow::bail!(
-                "Config file '{}' already exists, cannot init it, use --update-config to update it",
-                cfg_file_path.display()
+                "Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it",
            );
        }
        // Supplement the CLI arguments with the config file
-        let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| {
-            format!(
-                "Failed to read pageserver config at '{}'",
-                cfg_file_path.display()
-            )
-        })?;
+        let cfg_file_contents = std::fs::read_to_string(cfg_file_path)
+            .with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?;
        (
            cfg_file_contents
                .parse::<toml_edit::Document>()
                .with_context(|| {
-                    format!(
-                        "Failed to parse '{}' as pageserver config",
-                        cfg_file_path.display()
-                    )
+                    format!("Failed to parse '{cfg_file_path}' as pageserver config")
                })?,
            true,
        )
    } else if cfg_file_path.exists() {
-        anyhow::bail!(
-            "Config file '{}' exists but is not a regular file",
-            cfg_file_path.display()
-        );
+        anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file");
    } else {
        // We're initializing the tenant, so there's no config file yet
        (
@@ -192,7 +175,7 @@ fn initialize_config(

            for (key, item) in doc.iter() {
                if config_file_exists && update_config && key == "id" && toml.contains_key(key) {
-                    anyhow::bail!("Pageserver config file exists at '{}' and has node id already, it cannot be overridden", cfg_file_path.display());
+                    anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden");
                }
                toml.insert(key, item.clone());
            }
@@ -204,18 +187,11 @@ fn initialize_config(
        .context("Failed to parse pageserver configuration")?;

    if update_config {
-        info!("Writing pageserver config to '{}'", cfg_file_path.display());
+        info!("Writing pageserver config to '{cfg_file_path}'");

-        std::fs::write(cfg_file_path, toml.to_string()).with_context(|| {
-            format!(
-                "Failed to write pageserver config to '{}'",
-                cfg_file_path.display()
-            )
-        })?;
-        info!(
-            "Config successfully written to '{}'",
-            cfg_file_path.display()
-        )
+        std::fs::write(cfg_file_path, toml.to_string())
+            .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
+        info!("Config successfully written to '{cfg_file_path}'")
    }

    Ok(if init {
@@ -225,6 +201,51 @@ fn initialize_config(
    })
 }

+struct WaitForPhaseResult<F: std::future::Future + Unpin> {
+    timeout_remaining: Duration,
+    skipped: Option<F>,
+}
+
+/// During startup, we apply a timeout to our waits for readiness, to avoid
+/// stalling the whole service if one Tenant experiences some problem.  Each
+/// phase may consume some of the timeout: this function returns the updated
+/// timeout for use in the next call.
+async fn wait_for_phase<F>(phase: &str, mut fut: F, timeout: Duration) -> WaitForPhaseResult<F>
+where
+    F: std::future::Future + Unpin,
+{
+    let initial_t = Instant::now();
+    let skipped = match tokio::time::timeout(timeout, &mut fut).await {
+        Ok(_) => None,
+        Err(_) => {
+            tracing::info!(
+                timeout_millis = timeout.as_millis(),
+                %phase,
+                "Startup phase timed out, proceeding anyway"
+            );
+            Some(fut)
+        }
+    };
+
+    WaitForPhaseResult {
+        timeout_remaining: timeout
+            .checked_sub(Instant::now().duration_since(initial_t))
+            .unwrap_or(Duration::ZERO),
+        skipped,
+    }
+}
+
+fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
+    let elapsed = started_at.elapsed();
+    let secs = elapsed.as_secs_f64();
+    STARTUP_DURATION.with_label_values(&[phase]).set(secs);
+
+    info!(
+        elapsed_ms = elapsed.as_millis(),
+        "{human_phase} ({secs:.3}s since start)"
+    )
+}
+
 fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
@@ -232,16 +253,6 @@ fn start_pageserver(
    // Monotonic time for later calculating startup duration
    let started_startup_at = Instant::now();

-    let startup_checkpoint = move |phase: &str, human_phase: &str| {
-        let elapsed = started_startup_at.elapsed();
-        let secs = elapsed.as_secs_f64();
-        STARTUP_DURATION.with_label_values(&[phase]).set(secs);
-        info!(
-            elapsed_ms = elapsed.as_millis(),
-            "{human_phase} ({secs:.3}s since start)"
-        )
-    };
-
    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
@@ -366,7 +377,7 @@ fn start_pageserver(

    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
-    startup_checkpoint("initial", "Starting loading tenants");
+    startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
    STARTUP_IS_LOADING.set(1);

    // Startup staging or optimizing:
@@ -380,6 +391,7 @@ fn start_pageserver(
    // consumer side) will be dropped once we can start the background jobs. Currently it is behind
    // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
    // (background_task_maximum_delay).
+    let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
    let (init_done_tx, init_done_rx) = utils::completion::channel();

    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
@@ -387,7 +399,8 @@ fn start_pageserver(
    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();

    let order = pageserver::InitializationOrder {
-        initial_tenant_load: Some(init_done_tx),
+        initial_tenant_load_remote: Some(init_done_tx),
+        initial_tenant_load: Some(init_remote_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
@@ -411,55 +424,93 @@ fn start_pageserver(
        let shutdown_pageserver = shutdown_pageserver.clone();
        let drive_init = async move {
            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
-            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before initial load completed")
+            });

-            init_done_rx.wait().await;
-            startup_checkpoint("initial_tenant_load", "Initial load completed");
-            STARTUP_IS_LOADING.set(0);
+            let timeout = conf.background_task_maximum_delay;
+
+            let init_remote_done = std::pin::pin!(async {
+                init_remote_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_tenant_load_remote",
+                    "Remote part of initial load completed",
+                );
+            });
+
+            let WaitForPhaseResult {
+                timeout_remaining: timeout,
+                skipped: init_remote_skipped,
+            } = wait_for_phase("initial_tenant_load_remote", init_remote_done, timeout).await;
+
+            let init_load_done = std::pin::pin!(async {
+                init_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_tenant_load",
+                    "Initial load completed",
+                );
+                STARTUP_IS_LOADING.set(0);
+            });
+
+            let WaitForPhaseResult {
+                timeout_remaining: timeout,
+                skipped: init_load_skipped,
+            } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;

            // initial logical sizes can now start, as they were waiting on init_done_rx.

            scopeguard::ScopeGuard::into_inner(guard);

-            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before initial logical sizes completed")
+            });

-            let timeout = conf.background_task_maximum_delay;
+            let logical_sizes_done = std::pin::pin!(async {
+                init_logical_size_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_logical_sizes",
+                    "Initial logical sizes completed",
+                );
+            });

-            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
-
-            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
-                Ok(_) => {
-                    startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
-                    None
-                }
-                Err(_) => {
-                    tracing::info!(
-                        timeout_millis = timeout.as_millis(),
-                        "Initial logical size timeout elapsed; starting background jobs"
-                    );
-                    Some(init_sizes_done)
-                }
-            };
+            let WaitForPhaseResult {
+                timeout_remaining: _,
+                skipped: logical_sizes_skipped,
+            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;

            scopeguard::ScopeGuard::into_inner(guard);

-            // allow background jobs to start
+            // allow background jobs to start: we either completed prior stages, or they reached timeout
+            // and were skipped.  It is important that we do not let them block background jobs indefinitely,
+            // because things like consumption metrics for billing are blocked by this barrier.
            drop(background_jobs_can_start);
-            startup_checkpoint("background_jobs_can_start", "Starting background jobs");
-
-            if let Some(init_sizes_done) = init_sizes_done {
-                // ending up here is not a bug; at the latest logical sizes will be queried by
-                // consumption metrics.
-                let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
-                init_sizes_done.await;
-
-                scopeguard::ScopeGuard::into_inner(guard);
-
-                startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
+            startup_checkpoint(
+                started_startup_at,
+                "background_jobs_can_start",
+                "Starting background jobs",
+            );

+            // We are done. If we skipped any phases due to timeout, run them to completion here so that
+            // they will eventually update their startup_checkpoint, and so that we do not declare the
+            // 'complete' stage until all the other stages are really done.
+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before waiting for skipped phases done")
+            });
+            if let Some(f) = init_remote_skipped {
+                f.await;
            }
+            if let Some(f) = init_load_skipped {
+                f.await;
+            }
+            if let Some(f) = logical_sizes_skipped {
+                f.await;
+            }
+            scopeguard::ScopeGuard::into_inner(guard);

-            startup_checkpoint("complete", "Startup complete");
+            startup_checkpoint(started_startup_at, "complete", "Startup complete");
        };

        async move {
@@ -599,6 +650,7 @@ fn start_pageserver(
                    pageserver_listener,
                    conf.pg_auth_type,
                    libpq_ctx,
+                    task_mgr::shutdown_token(),
                )
                .await
            },
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -16,13 +16,13 @@ use utils::logging::SecretString;
 use once_cell::sync::OnceCell;
 use reqwest::Url;
 use std::num::NonZeroUsize;
-use std::path::{Path, PathBuf};
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use toml_edit;
 use toml_edit::{Document, Item};

+use camino::{Utf8Path, Utf8PathBuf};
 use postgres_backend::AuthType;
 use utils::{
    id::{NodeId, TenantId, TimelineId},
@@ -37,8 +37,8 @@ use crate::tenant::{
    TIMELINES_SEGMENT_NAME,
 };
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
-    TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
+    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 pub mod defaults {
@@ -153,9 +153,9 @@ pub struct PageServerConf {
    // that during unit testing, because the current directory is global
    // to the process but different unit tests work on different
    // repositories.
-    pub workdir: PathBuf,
+    pub workdir: Utf8PathBuf,

-    pub pg_distrib_dir: PathBuf,
+    pub pg_distrib_dir: Utf8PathBuf,

    // Authentication
    /// authentication method for the HTTP mgmt API
@@ -164,7 +164,7 @@ pub struct PageServerConf {
    pub pg_auth_type: AuthType,
    /// Path to a file containing public key for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
-    pub auth_validation_public_key_path: Option<PathBuf>,
+    pub auth_validation_public_key_path: Option<Utf8PathBuf>,

    pub remote_storage_config: Option<RemoteStorageConfig>,

@@ -211,6 +211,10 @@ pub struct PageServerConf {

    /// JWT token for use with the control plane API.
    pub control_plane_api_token: Option<SecretString>,
+
+    /// If true, pageserver will make best-effort to operate without a control plane: only
+    /// for use in major incidents.
+    pub control_plane_emergency_mode: bool,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -253,15 +257,15 @@ struct PageServerConfigBuilder {
    page_cache_size: BuilderValue<usize>,
    max_file_descriptors: BuilderValue<usize>,

-    workdir: BuilderValue<PathBuf>,
+    workdir: BuilderValue<Utf8PathBuf>,

-    pg_distrib_dir: BuilderValue<PathBuf>,
+    pg_distrib_dir: BuilderValue<Utf8PathBuf>,

    http_auth_type: BuilderValue<AuthType>,
    pg_auth_type: BuilderValue<AuthType>,

    //
-    auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
+    auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,

    id: BuilderValue<NodeId>,
@@ -288,6 +292,7 @@ struct PageServerConfigBuilder {

    control_plane_api: BuilderValue<Option<Url>>,
    control_plane_api_token: BuilderValue<Option<SecretString>>,
+    control_plane_emergency_mode: BuilderValue<bool>,
 }

 impl Default for PageServerConfigBuilder {
@@ -305,10 +310,12 @@ impl Default for PageServerConfigBuilder {
            superuser: Set(DEFAULT_SUPERUSER.to_string()),
            page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE),
            max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS),
-            workdir: Set(PathBuf::new()),
-            pg_distrib_dir: Set(env::current_dir()
-                .expect("cannot access current directory")
-                .join("pg_install")),
+            workdir: Set(Utf8PathBuf::new()),
+            pg_distrib_dir: Set(Utf8PathBuf::from_path_buf(
+                env::current_dir().expect("cannot access current directory"),
+            )
+            .expect("non-Unicode path")
+            .join("pg_install")),
            http_auth_type: Set(AuthType::Trust),
            pg_auth_type: Set(AuthType::Trust),
            auth_validation_public_key_path: Set(None),
@@ -353,6 +360,7 @@ impl Default for PageServerConfigBuilder {

            control_plane_api: Set(None),
            control_plane_api_token: Set(None),
+            control_plane_emergency_mode: Set(false),
        }
    }
 }
@@ -390,11 +398,11 @@ impl PageServerConfigBuilder {
        self.max_file_descriptors = BuilderValue::Set(max_file_descriptors)
    }

-    pub fn workdir(&mut self, workdir: PathBuf) {
+    pub fn workdir(&mut self, workdir: Utf8PathBuf) {
        self.workdir = BuilderValue::Set(workdir)
    }

-    pub fn pg_distrib_dir(&mut self, pg_distrib_dir: PathBuf) {
+    pub fn pg_distrib_dir(&mut self, pg_distrib_dir: Utf8PathBuf) {
        self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir)
    }

@@ -408,7 +416,7 @@ impl PageServerConfigBuilder {

    pub fn auth_validation_public_key_path(
        &mut self,
-        auth_validation_public_key_path: Option<PathBuf>,
+        auth_validation_public_key_path: Option<Utf8PathBuf>,
    ) {
        self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path)
    }
@@ -485,6 +493,14 @@ impl PageServerConfigBuilder {
        self.control_plane_api = BuilderValue::Set(api)
    }

+    pub fn control_plane_api_token(&mut self, token: Option<SecretString>) {
+        self.control_plane_api_token = BuilderValue::Set(token)
+    }
+
+    pub fn control_plane_emergency_mode(&mut self, enabled: bool) {
+        self.control_plane_emergency_mode = BuilderValue::Set(enabled)
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -576,6 +592,9 @@ impl PageServerConfigBuilder {
            control_plane_api_token: self
                .control_plane_api_token
                .ok_or(anyhow!("missing control_plane_api_token"))?,
+            control_plane_emergency_mode: self
+                .control_plane_emergency_mode
+                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
        })
    }
 }
@@ -585,15 +604,15 @@ impl PageServerConf {
    // Repository paths, relative to workdir.
    //

-    pub fn tenants_path(&self) -> PathBuf {
+    pub fn tenants_path(&self) -> Utf8PathBuf {
        self.workdir.join(TENANTS_SEGMENT_NAME)
    }

-    pub fn deletion_prefix(&self) -> PathBuf {
+    pub fn deletion_prefix(&self) -> Utf8PathBuf {
        self.workdir.join("deletion")
    }

-    pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
+    pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
        // Encode a version in the filename, so that if we ever switch away from JSON we can
        // increment this.
        const VERSION: u8 = 1;
@@ -602,7 +621,7 @@ impl PageServerConf {
            .join(format!("{sequence:016x}-{VERSION:02x}.list"))
    }

-    pub fn deletion_header_path(&self) -> PathBuf {
+    pub fn deletion_header_path(&self) -> Utf8PathBuf {
        // Encode a version in the filename, so that if we ever switch away from JSON we can
        // increment this.
        const VERSION: u8 = 1;
@@ -610,30 +629,38 @@ impl PageServerConf {
        self.deletion_prefix().join(format!("header-{VERSION:02x}"))
    }

-    pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
+    pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenants_path().join(tenant_id.to_string())
    }

-    pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
+    pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id)
            .join(TENANT_ATTACHING_MARKER_FILENAME)
    }

-    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
+    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
    }

    /// Points to a place in pageserver's local directory,
    /// where certain tenant's tenantconf file should be located.
-    pub fn tenant_config_path(&self, tenant_id: &TenantId) -> PathBuf {
+    ///
+    /// Legacy: superseded by tenant_location_config_path.  Eventually
+    /// remove this function.
+    pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
    }

-    pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf {
+    pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
+        self.tenant_path(tenant_id)
+            .join(TENANT_LOCATION_CONFIG_NAME)
+    }
+
+    pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
    }

-    pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
+    pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
        self.timelines_path(tenant_id).join(timeline_id.to_string())
    }

@@ -641,7 +668,7 @@ impl PageServerConf {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-    ) -> PathBuf {
+    ) -> Utf8PathBuf {
        path_with_suffix_extension(
            self.timeline_path(&tenant_id, &timeline_id),
            TIMELINE_UNINIT_MARK_SUFFIX,
@@ -652,19 +679,19 @@ impl PageServerConf {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-    ) -> PathBuf {
+    ) -> Utf8PathBuf {
        path_with_suffix_extension(
            self.timeline_path(&tenant_id, &timeline_id),
            TIMELINE_DELETE_MARK_SUFFIX,
        )
    }

-    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
+    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id)
            .join(TENANT_DELETED_MARKER_FILE_NAME)
    }

-    pub fn traces_path(&self) -> PathBuf {
+    pub fn traces_path(&self) -> Utf8PathBuf {
        self.workdir.join("traces")
    }

@@ -673,7 +700,7 @@ impl PageServerConf {
        tenant_id: &TenantId,
        timeline_id: &TimelineId,
        connection_id: &ConnectionId,
-    ) -> PathBuf {
+    ) -> Utf8PathBuf {
        self.traces_path()
            .join(tenant_id.to_string())
            .join(timeline_id.to_string())
@@ -682,20 +709,20 @@ impl PageServerConf {

    /// Points to a place in pageserver's local directory,
    /// where certain timeline's metadata file should be located.
-    pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
+    pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
        self.timeline_path(tenant_id, timeline_id)
            .join(METADATA_FILE_NAME)
    }

    /// Turns storage remote path of a file into its local path.
-    pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
+    pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
        remote_path.with_base(&self.workdir)
    }

    //
    // Postgres distribution paths
    //
-    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
        let path = self.pg_distrib_dir.clone();

        #[allow(clippy::manual_range_patterns)]
@@ -705,10 +732,10 @@ impl PageServerConf {
        }
    }

-    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
    }
-    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
    }

@@ -716,7 +743,7 @@ impl PageServerConf {
    /// validating the input and failing on errors.
    ///
    /// This leaves any options not present in the file in the built-in defaults.
-    pub fn parse_and_validate(toml: &Document, workdir: &Path) -> anyhow::Result<Self> {
+    pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result<Self> {
        let mut builder = PageServerConfigBuilder::default();
        builder.workdir(workdir.to_owned());

@@ -735,10 +762,10 @@ impl PageServerConf {
                    builder.max_file_descriptors(parse_toml_u64(key, item)? as usize)
                }
                "pg_distrib_dir" => {
-                    builder.pg_distrib_dir(PathBuf::from(parse_toml_string(key, item)?))
+                    builder.pg_distrib_dir(Utf8PathBuf::from(parse_toml_string(key, item)?))
                }
                "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
-                    PathBuf::from(parse_toml_string(key, item)?),
+                    Utf8PathBuf::from(parse_toml_string(key, item)?),
                )),
                "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
                "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
@@ -785,6 +812,18 @@ impl PageServerConf {
                        builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
                    }
                },
+                "control_plane_api_token" => {
+                    let parsed = parse_toml_string(key, item)?;
+                    if parsed.is_empty() {
+                        builder.control_plane_api_token(None)
+                    } else {
+                        builder.control_plane_api_token(Some(parsed.into()))
+                    }
+                },
+                "control_plane_emergency_mode" => {
+                    builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
+
+                },
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -798,8 +837,7 @@ impl PageServerConf {
            ensure!(
                auth_validation_public_key_path.exists(),
                format!(
-                    "Can't find auth_validation_public_key at '{}'",
-                    auth_validation_public_key_path.display()
+                    "Can't find auth_validation_public_key at '{auth_validation_public_key_path}'",
                )
            );
        }
@@ -915,12 +953,12 @@ impl PageServerConf {
    }

    #[cfg(test)]
-    pub fn test_repo_dir(test_name: &str) -> PathBuf {
-        PathBuf::from(format!("../tmp_check/test_{test_name}"))
+    pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
+        Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
    }

-    pub fn dummy_conf(repo_dir: PathBuf) -> Self {
-        let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
+    pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
+        let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");

        PageServerConf {
            id: NodeId(0),
@@ -955,6 +993,7 @@ impl PageServerConf {
            background_task_maximum_delay: Duration::ZERO,
            control_plane_api: None,
            control_plane_api_token: None,
+            control_plane_emergency_mode: false,
        }
    }
 }
@@ -1087,8 +1126,8 @@ mod tests {
        num::{NonZeroU32, NonZeroUsize},
    };

+    use camino_tempfile::{tempdir, Utf8TempDir};
    use remote_storage::{RemoteStorageKind, S3Config};
-    use tempfile::{tempdir, TempDir};
    use utils::serde_percent::Percent;

    use super::*;
@@ -1127,8 +1166,7 @@ background_task_maximum_delay = '334 s'
        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
        // we have to create dummy values to overcome the validation errors
        let config_string = format!(
-            "pg_distrib_dir='{}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
-            pg_distrib_dir.display()
+            "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
        );
        let toml = config_string.parse()?;

@@ -1179,7 +1217,8 @@ background_task_maximum_delay = '334 s'
                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                )?,
                control_plane_api: None,
-                control_plane_api_token: None
+                control_plane_api_token: None,
+                control_plane_emergency_mode: false
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1194,8 +1233,7 @@ background_task_maximum_delay = '334 s'
        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;

        let config_string = format!(
-            "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoint = '{broker_endpoint}'",
-            pg_distrib_dir.display()
+            "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",
        );
        let toml = config_string.parse()?;

@@ -1236,7 +1274,8 @@ background_task_maximum_delay = '334 s'
                ondemand_download_behavior_treat_error_as_warn: false,
                background_task_maximum_delay: Duration::from_secs(334),
                control_plane_api: None,
-                control_plane_api_token: None
+                control_plane_api_token: None,
+                control_plane_emergency_mode: false
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1255,23 +1294,18 @@ background_task_maximum_delay = '334 s'
        let identical_toml_declarations = &[
            format!(
                r#"[remote_storage]
-local_path = '{}'"#,
-                local_storage_path.display()
-            ),
-            format!(
-                "remote_storage={{local_path='{}'}}",
-                local_storage_path.display()
+local_path = '{local_storage_path}'"#,
            ),
+            format!("remote_storage={{local_path='{local_storage_path}'}}"),
        ];

        for remote_storage_config_str in identical_toml_declarations {
            let config_string = format!(
                r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{}'
+pg_distrib_dir='{pg_distrib_dir}'
 broker_endpoint = '{broker_endpoint}'

 {remote_storage_config_str}"#,
-                pg_distrib_dir.display(),
            );

            let toml = config_string.parse()?;
@@ -1334,11 +1368,10 @@ concurrency_limit = {s3_concurrency_limit}"#
        for remote_storage_config_str in identical_toml_declarations {
            let config_string = format!(
                r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{}'
+pg_distrib_dir='{pg_distrib_dir}'
 broker_endpoint = '{broker_endpoint}'

 {remote_storage_config_str}"#,
-                pg_distrib_dir.display(),
            );

            let toml = config_string.parse()?;
@@ -1380,12 +1413,11 @@ broker_endpoint = '{broker_endpoint}'

        let config_string = format!(
            r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{}'
+pg_distrib_dir='{pg_distrib_dir}'
 broker_endpoint = '{broker_endpoint}'

 [tenant_config]
 trace_read_requests = {trace_read_requests}"#,
-            pg_distrib_dir.display(),
        );

        let toml = config_string.parse()?;
@@ -1405,7 +1437,7 @@ trace_read_requests = {trace_read_requests}"#,
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;

        let pageserver_conf_toml = format!(
-            r#"pg_distrib_dir = "{}"
+            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
 id = 222
@@ -1423,7 +1455,6 @@ kind = "LayerAccessThreshold"
 period = "20m"
 threshold = "20m"
 "#,
-            pg_distrib_dir.display(),
        );
        let toml: Document = pageserver_conf_toml.parse()?;
        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
@@ -1464,7 +1495,7 @@ threshold = "20m"
        Ok(())
    }

-    fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
+    fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
        let tempdir_path = tempdir.path();

        let workdir = tempdir_path.join("workdir");
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -2,14 +2,16 @@
 //! and push them to a HTTP endpoint.
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
+use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
 use reqwest::Url;
 use std::collections::HashMap;
-use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
+use tokio::time::Instant;
 use tracing::*;
 use utils::id::NodeId;

@@ -41,7 +43,7 @@ pub async fn collect_metrics(
    _cached_metric_collection_interval: Duration,
    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
-    local_disk_storage: PathBuf,
+    local_disk_storage: Utf8PathBuf,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
    if _cached_metric_collection_interval != Duration::ZERO {
@@ -68,7 +70,7 @@ pub async fn collect_metrics(
        },
    );

-    let path: Arc<PathBuf> = Arc::new(local_disk_storage);
+    let path: Arc<Utf8PathBuf> = Arc::new(local_disk_storage);

    let cancel = task_mgr::shutdown_token();

@@ -87,22 +89,12 @@ pub async fn collect_metrics(

    let node_id = node_id.to_string();

-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(metric_collection_interval);
-
    loop {
-        let tick_at = tokio::select! {
-            _ = cancel.cancelled() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();

        // these are point in time, with variable "now"
        let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;

-        if metrics.is_empty() {
-            continue;
-        }
-
        let metrics = Arc::new(metrics);

        // why not race cancellation here? because we are one of the last tasks, and if we are
@@ -141,10 +133,19 @@ pub async fn collect_metrics(
        let (_, _) = tokio::join!(flush, upload);

        crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
            metric_collection_interval,
-            "consumption_metrics_collect_metrics",
+            BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
        );
+
+        let res = tokio::time::timeout_at(
+            started_at + metric_collection_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(());
+        }
    }
 }

@@ -153,7 +154,7 @@ pub async fn collect_metrics(
 ///
 /// Cancellation safe.
 async fn restore_and_reschedule(
-    path: &Arc<PathBuf>,
+    path: &Arc<Utf8PathBuf>,
    metric_collection_interval: Duration,
 ) -> Cache {
    let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(path.clone()).await {
@@ -243,16 +244,14 @@ async fn calculate_synthetic_size_worker(
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
+    scopeguard::defer! {
+        info!("calculate_synthetic_size_worker stopped");
+    };

-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

    loop {
-        let tick_at = tokio::select! {
-            _ = task_mgr::shutdown_watcher() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();

        let tenants = match mgr::list_tenants().await {
            Ok(tenants) => tenants,
@@ -268,6 +267,11 @@ async fn calculate_synthetic_size_worker(
            }

            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
+                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
+                // We can put in some prioritization for consumption metrics.
+                // Same for the loop that fetches computed metrics.
+                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
+                // which turns out is really handy to understand the system.
                if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                }
@@ -275,9 +279,18 @@ async fn calculate_synthetic_size_worker(
        }

        crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
            synthetic_size_calculation_interval,
-            "consumption_metrics_synthetic_size_worker",
+            BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
        );
+
+        let res = tokio::time::timeout_at(
+            started_at + synthetic_size_calculation_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(());
+        }
    }
 }
--- a/pageserver/src/consumption_metrics/disk_cache.rs
+++ b/pageserver/src/consumption_metrics/disk_cache.rs
@@ -1,10 +1,12 @@
 use anyhow::Context;
-use std::path::PathBuf;
+use camino::{Utf8Path, Utf8PathBuf};
 use std::sync::Arc;

 use super::RawMetric;

-pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result<Vec<RawMetric>> {
+pub(super) async fn read_metrics_from_disk(
+    path: Arc<Utf8PathBuf>,
+) -> anyhow::Result<Vec<RawMetric>> {
    // do not add context to each error, callsite will log with full path
    let span = tracing::Span::current();
    tokio::task::spawn_blocking(move || {
@@ -25,10 +27,10 @@ pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result
    .and_then(|x| x)
 }

-fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<()> {
+fn scan_and_delete_with_same_prefix(path: &Utf8Path) -> std::io::Result<()> {
    let it = std::fs::read_dir(path.parent().expect("caller checked"))?;

-    let prefix = path.file_name().expect("caller checked").to_string_lossy();
+    let prefix = path.file_name().expect("caller checked").to_string();

    for entry in it {
        let entry = entry?;
@@ -62,7 +64,7 @@ fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<(

 pub(super) async fn flush_metrics_to_disk(
    current_metrics: &Arc<Vec<RawMetric>>,
-    path: &Arc<PathBuf>,
+    path: &Arc<Utf8PathBuf>,
 ) -> anyhow::Result<()> {
    use std::io::Write;

@@ -81,7 +83,7 @@ pub(super) async fn flush_metrics_to_disk(

            let parent = path.parent().expect("existence checked");
            let file_name = path.file_name().expect("existence checked");
-            let mut tempfile = tempfile::Builder::new()
+            let mut tempfile = camino_tempfile::Builder::new()
                .prefix(file_name)
                .suffix(".tmp")
                .tempfile_in(parent)?;
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -133,6 +133,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
            node_id: self.node_id,
        };

+        fail::fail_point!("control-plane-client-re-attach");
+
        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
        tracing::info!(
            "Received re-attach response with {} tenants",
@@ -168,6 +170,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                .collect(),
        };

+        fail::fail_point!("control-plane-client-validate");
+
        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

        Ok(response
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -3,7 +3,6 @@ mod list_writer;
 mod validator;

 use std::collections::HashMap;
-use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::Duration;

@@ -13,6 +12,7 @@ use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
 use crate::virtual_file::VirtualFile;
 use anyhow::Context;
+use camino::Utf8PathBuf;
 use hex::FromHex;
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
@@ -40,7 +40,6 @@ use validator::ValidatorQueueMessage;

 use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};

-// TODO: adminstrative "panic button" config property to disable all deletions
 // TODO: configurable for how long to wait before executing deletions

 /// We aggregate object deletions from many tenants in one place, for several reasons:
@@ -154,7 +153,7 @@ impl FlushOp {

 #[derive(Clone, Debug)]
 pub struct DeletionQueueClient {
-    tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+    tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
    executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,

    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
@@ -186,7 +185,7 @@ where
    V: Serialize,
    I: AsRef<[u8]>,
 {
-    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
+    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));

    transformed
        .collect::<HashMap<String, &V>>()
@@ -213,7 +212,7 @@ where

 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
-const TEMP_SUFFIX: &str = ".tmp";
+const TEMP_SUFFIX: &str = "tmp";

 #[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
@@ -325,10 +324,7 @@ impl DeletionList {
            return false;
        }

-        let timeline_entry = tenant_entry
-            .timelines
-            .entry(*timeline)
-            .or_insert_with(Vec::new);
+        let timeline_entry = tenant_entry.timelines.entry(*timeline).or_default();

        let timeline_remote_path = remote_timeline_path(tenant, timeline);

@@ -336,7 +332,6 @@ impl DeletionList {
        timeline_entry.extend(objects.drain(..).map(|p| {
            p.strip_prefix(&timeline_remote_path)
                .expect("Timeline paths always start with the timeline prefix")
-                .to_string_lossy()
                .to_string()
        }));
        true
@@ -350,7 +345,7 @@ impl DeletionList {
                result.extend(
                    timeline_layers
                        .into_iter()
-                        .map(|l| timeline_remote_path.join(&PathBuf::from(l))),
+                        .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
                );
            }
        }
@@ -421,7 +416,7 @@ pub enum DeletionQueueError {
 impl DeletionQueueClient {
    pub(crate) fn broken() -> Self {
        // Channels whose receivers are immediately dropped.
-        let (tx, _rx) = tokio::sync::mpsc::channel(1);
+        let (tx, _rx) = tokio::sync::mpsc::unbounded_channel();
        let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
        Self {
            tx,
@@ -433,12 +428,12 @@ impl DeletionQueueClient {
    /// This is cancel-safe.  If you drop the future before it completes, the message
    /// is not pushed, although in the context of the deletion queue it doesn't matter: once
    /// we decide to do a deletion the decision is always final.
-    async fn do_push<T>(
+    fn do_push<T>(
        &self,
-        queue: &tokio::sync::mpsc::Sender<T>,
+        queue: &tokio::sync::mpsc::UnboundedSender<T>,
        msg: T,
    ) -> Result<(), DeletionQueueError> {
-        match queue.send(msg).await {
+        match queue.send(msg) {
            Ok(_) => Ok(()),
            Err(e) => {
                // This shouldn't happen, we should shut down all tenants before
@@ -450,7 +445,7 @@ impl DeletionQueueClient {
        }
    }

-    pub(crate) async fn recover(
+    pub(crate) fn recover(
        &self,
        attached_tenants: HashMap<TenantId, Generation>,
    ) -> Result<(), DeletionQueueError> {
@@ -458,7 +453,6 @@ impl DeletionQueueClient {
            &self.tx,
            ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
        )
-        .await
    }

    /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
@@ -531,6 +525,21 @@ impl DeletionQueueClient {
            return self.flush_immediate().await;
        }

+        self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
+    }
+
+    /// When a Tenant has a generation, push_layers is always synchronous because
+    /// the ListValidator channel is an unbounded channel.
+    ///
+    /// This can be merged into push_layers when we remove the Generation-less mode
+    /// support (`<https://github.com/neondatabase/neon/issues/5395>`)
+    pub(crate) fn push_layers_sync(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        current_generation: Generation,
+        layers: Vec<(LayerFileName, Generation)>,
+    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
            .inc_by(layers.len() as u64);
@@ -544,17 +553,16 @@ impl DeletionQueueClient {
                objects: Vec::new(),
            }),
        )
-        .await
    }

    /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
    async fn do_flush<T>(
        &self,
-        queue: &tokio::sync::mpsc::Sender<T>,
+        queue: &tokio::sync::mpsc::UnboundedSender<T>,
        msg: T,
        rx: tokio::sync::oneshot::Receiver<()>,
    ) -> Result<(), DeletionQueueError> {
-        self.do_push(queue, msg).await?;
+        self.do_push(queue, msg)?;
        if rx.await.is_err() {
            // This shouldn't happen if tenants are shut down before deletion queue.  If we
            // encounter a bug like this, then a flusher will incorrectly believe it has flushed
@@ -575,6 +583,18 @@ impl DeletionQueueClient {
            .await
    }

+    /// Issue a flush without waiting for it to complete.  This is useful on advisory flushes where
+    /// the caller wants to avoid the risk of waiting for lots of enqueued work, such as on tenant
+    /// detach where flushing is nice but not necessary.
+    ///
+    /// This function provides no guarantees of work being done.
+    pub fn flush_advisory(&self) {
+        let (flush_op, _) = FlushOp::new();
+
+        // Transmit the flush message, ignoring any result (such as a closed channel during shutdown).
+        drop(self.tx.send(ListWriterQueueMessage::FlushExecute(flush_op)));
+    }
+
    // Wait until all previous deletions are executed
    pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
        debug!("flush_execute: flushing to deletion lists...");
@@ -591,9 +611,7 @@ impl DeletionQueueClient {
        // Flush any immediate-mode deletions (the above backend flush will only flush
        // the executor if deletions had flowed through the backend)
        debug!("flush_execute: flushing execution...");
-        let (flush_op, rx) = FlushOp::new();
-        self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx)
-            .await?;
+        self.flush_immediate().await?;
        debug!("flush_execute: finished flushing execution...");
        Ok(())
    }
@@ -648,8 +666,10 @@ impl DeletionQueue {
    where
        C: ControlPlaneGenerationsApi + Send + Sync,
    {
-        // Deep channel: it consumes deletions from all timelines and we do not want to block them
-        let (tx, rx) = tokio::sync::mpsc::channel(16384);
+        // Unbounded channel: enables non-async functions to submit deletions.  The actual length is
+        // constrained by how promptly the ListWriter wakes up and drains it, which should be frequent
+        // enough to avoid this taking pathologically large amount of memory.
+        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();

        // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
        let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
@@ -727,12 +747,9 @@ impl DeletionQueue {

 #[cfg(test)]
 mod test {
+    use camino::Utf8Path;
    use hex_literal::hex;
-    use std::{
-        io::ErrorKind,
-        path::{Path, PathBuf},
-        time::Duration,
-    };
+    use std::{io::ErrorKind, time::Duration};
    use tracing::info;

    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -764,7 +781,7 @@ mod test {

    struct TestSetup {
        harness: TenantHarness,
-        remote_fs_dir: PathBuf,
+        remote_fs_dir: Utf8PathBuf,
        storage: GenericRemoteStorage,
        mock_control_plane: MockControlPlane,
        deletion_queue: DeletionQueue,
@@ -873,7 +890,7 @@ mod test {
        // Set up a GenericRemoteStorage targetting a directory
        let remote_fs_dir = harness.conf.workdir.join("remote_fs");
        std::fs::create_dir_all(remote_fs_dir)?;
-        let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
+        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
            max_concurrent_syncs: std::num::NonZeroUsize::new(
                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
@@ -909,7 +926,7 @@ mod test {
    }

    // TODO: put this in a common location so that we can share with remote_timeline_client's tests
-    fn assert_remote_files(expected: &[&str], remote_path: &Path) {
+    fn assert_remote_files(expected: &[&str], remote_path: &Utf8Path) {
        let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
        expected.sort();

@@ -926,10 +943,7 @@ mod test {
                        unreachable!();
                    }
                } else {
-                    panic!(
-                        "Unexpected error listing {}: {e}",
-                        remote_path.to_string_lossy()
-                    );
+                    panic!("Unexpected error listing {remote_path}: {e}");
                }
            }
        };
@@ -944,7 +958,7 @@ mod test {
        assert_eq!(expected, found);
    }

-    fn assert_local_files(expected: &[&str], directory: &Path) {
+    fn assert_local_files(expected: &[&str], directory: &Utf8Path) {
        let dir = match std::fs::read_dir(directory) {
            Ok(d) => d,
            Err(_) => {
@@ -968,7 +982,7 @@ mod test {
        // Basic test that the deletion queue processes the deletions we pass into it
        let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let tenant_id = ctx.harness.tenant_id;
@@ -1036,7 +1050,7 @@ mod test {
    async fn deletion_queue_validation() -> anyhow::Result<()> {
        let ctx = setup("deletion_queue_validation").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

        // Generation that the control plane thinks is current
        let latest_generation = Generation::new(0xdeadbeef);
@@ -1093,7 +1107,7 @@ mod test {
        // Basic test that the deletion queue processes the deletions we pass into it
        let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

        let tenant_id = ctx.harness.tenant_id;

@@ -1156,9 +1170,7 @@ mod test {
        drop(client);
        ctx.restart().await;
        let client = ctx.deletion_queue.new_client();
-        client
-            .recover(HashMap::from([(tenant_id, now_generation)]))
-            .await?;
+        client.recover(HashMap::from([(tenant_id, now_generation)]))?;

        info!("Flush-executing");
        client.flush_execute().await?;
@@ -1184,7 +1196,7 @@ pub(crate) mod mock {
    };

    pub struct ConsumerState {
-        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
        executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
    }

@@ -1261,7 +1273,7 @@ pub(crate) mod mock {
    }

    pub struct MockDeletionQueue {
-        tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+        tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
        executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
        executed: Arc<AtomicUsize>,
        remote_storage: Option<GenericRemoteStorage>,
@@ -1271,7 +1283,7 @@ pub(crate) mod mock {

    impl MockDeletionQueue {
        pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
-            let (tx, rx) = tokio::sync::mpsc::channel(16384);
+            let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
            let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);

            let executed = Arc::new(AtomicUsize::new(0));
@@ -1286,10 +1298,6 @@ pub(crate) mod mock {
            }
        }

-        pub fn get_executed(&self) -> usize {
-            self.executed.load(Ordering::Relaxed)
-        }
-
        #[allow(clippy::await_holding_lock)]
        pub async fn pump(&self) {
            if let Some(remote_storage) = &self.remote_storage {
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -13,6 +13,7 @@ use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 use tracing::warn;
+use utils::backoff;

 use crate::metrics;

@@ -63,7 +64,19 @@ impl Deleter {
            Err(anyhow::anyhow!("failpoint hit"))
        });

-        self.remote_storage.delete_objects(&self.accumulator).await
+        // A backoff::retry is used here for two reasons:
+        // - To provide a backoff rather than busy-polling the API on errors
+        // - To absorb transient 429/503 conditions without hitting our error
+        //   logging path for issues deleting objects.
+        backoff::retry(
+            || async { self.remote_storage.delete_objects(&self.accumulator).await },
+            |_| false,
+            3,
+            10,
+            "executing deletion batch",
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")),
+        )
+        .await
    }

    /// Block until everything in accumulator has been executed
@@ -88,7 +101,10 @@ impl Deleter {
                    self.accumulator.clear();
                }
                Err(e) => {
-                    warn!("DeleteObjects request failed: {e:#}, will retry");
+                    if self.cancel.is_cancelled() {
+                        return Err(DeletionQueueError::ShuttingDown);
+                    }
+                    warn!("DeleteObjects request failed: {e:#}, will continue trying");
                    metrics::DELETION_QUEUE
                        .remote_errors
                        .with_label_values(&["execute"])
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -85,7 +85,7 @@ pub(super) struct ListWriter {
    conf: &'static PageServerConf,

    // Incoming frontend requests to delete some keys
-    rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+    rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,

    // Outbound requests to the backend to execute deletion lists we have composed.
    tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
@@ -111,7 +111,7 @@ impl ListWriter {

    pub(super) fn new(
        conf: &'static PageServerConf,
-        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
        tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
        cancel: CancellationToken,
    ) -> Self {
@@ -180,8 +180,7 @@ impl ListWriter {
                    Ok(h) => Ok(Some(h.validated_sequence)),
                    Err(e) => {
                        warn!(
-                            "Failed to deserialize deletion header, ignoring {}: {e:#}",
-                            header_path.display()
+                            "Failed to deserialize deletion header, ignoring {header_path}: {e:#}",
                        );
                        // This should never happen unless we make a mistake with our serialization.
                        // Ignoring a deletion header is not consequential for correctnes because all deletions
@@ -193,10 +192,7 @@ impl ListWriter {
            }
            Err(e) => {
                if e.kind() == std::io::ErrorKind::NotFound {
-                    debug!(
-                        "Deletion header {} not found, first start?",
-                        header_path.display()
-                    );
+                    debug!("Deletion header {header_path} not found, first start?");
                    Ok(None)
                } else {
                    Err(anyhow::anyhow!(e))
@@ -223,10 +219,7 @@ impl ListWriter {
        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
            Ok(d) => d,
            Err(e) => {
-                warn!(
-                    "Failed to open deletion list directory {}: {e:#}",
-                    deletion_directory.display(),
-                );
+                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");

                // Give up: if we can't read the deletion list directory, we probably can't
                // write lists into it later, so the queue won't work.
@@ -237,27 +230,26 @@ impl ListWriter {
        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();

+        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
        while let Some(dentry) = dir.next_entry().await? {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();

-            if Some(file_name.as_os_str()) == header_path.file_name() {
+            if file_name == header_path.file_name().unwrap_or("") {
                // Don't try and parse the header's name like a list
                continue;
            }

-            if dentry_str.ends_with(TEMP_SUFFIX) {
+            if dentry_str.ends_with(&temp_extension) {
                info!("Cleaning up temporary file {dentry_str}");
-                let absolute_path = deletion_directory.join(dentry.file_name());
+                let absolute_path =
+                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
                    // Non-fatal error: we will just leave the file behind but not
                    // try and load it.
-                    warn!(
-                        "Failed to clean up temporary file {}: {e:#}",
-                        absolute_path.display()
-                    );
+                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
                }

                continue;
@@ -360,7 +352,7 @@ impl ListWriter {
        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
            tracing::error!(
                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
-                self.conf.deletion_prefix().display()
+                self.conf.deletion_prefix(),
            );
            metrics::DELETION_QUEUE.unexpected_errors.inc();
            return;
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -15,10 +15,10 @@
 //! Deletions are passed onward to the Deleter.

 use std::collections::HashMap;
-use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::Duration;

+use camino::Utf8PathBuf;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 use tracing::info;
@@ -220,6 +220,8 @@ where
                    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
                    mutated = true;
+                } else {
+                    metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
                }
                this_list_valid
            });
@@ -282,16 +284,16 @@ where
        Ok(())
    }

-    async fn cleanup_lists(&mut self, list_paths: Vec<PathBuf>) {
+    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
-            debug!("Removing deletion list {}", list_path.display());
+            debug!("Removing deletion list {list_path}");

            if let Err(e) = tokio::fs::remove_file(&list_path).await {
                // Unexpected: we should have permissions and nothing else should
                // be touching these files.  We will leave the file behind.  Subsequent
                // pageservers will try and load it again: hopefully whatever storage
                // issue (probably permissions) has been fixed by then.
-                tracing::error!("Failed to delete {}: {e:#}", list_path.display());
+                tracing::error!("Failed to delete {list_path}: {e:#}");
                metrics::DELETION_QUEUE.unexpected_errors.inc();
                break;
            }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -43,12 +43,12 @@

 use std::{
    collections::HashMap,
-    path::Path,
    sync::Arc,
    time::{Duration, SystemTime},
 };

 use anyhow::Context;
+use camino::Utf8Path;
 use remote_storage::GenericRemoteStorage;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
@@ -122,7 +122,7 @@ async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    storage: GenericRemoteStorage,
-    tenants_dir: &Path,
+    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
 ) {
    scopeguard::defer! {
@@ -184,7 +184,7 @@ async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    storage: &GenericRemoteStorage,
-    tenants_dir: &Path,
+    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
@@ -411,6 +411,11 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                                evictions_failed.file_sizes += file_size;
                                evictions_failed.count += 1;
                            }
+                            Some(Err(EvictionError::MetadataInconsistency(detail))) => {
+                                warn!(%layer, "failed to evict layer: {detail}");
+                                evictions_failed.file_sizes += file_size;
+                                evictions_failed.count += 1;
+                            }
                            None => {
                                assert!(cancel.is_cancelled());
                                return;
@@ -620,9 +625,8 @@ impl std::ops::Deref for TimelineKey {
 }

 mod filesystem_level_usage {
-    use std::path::Path;
-
    use anyhow::Context;
+    use camino::Utf8Path;

    use crate::statvfs::Statvfs;

@@ -664,7 +668,7 @@ mod filesystem_level_usage {
    }

    pub fn get<'a>(
-        tenants_dir: &Path,
+        tenants_dir: &Utf8Path,
        config: &'a DiskUsageEvictionTaskConfig,
    ) -> anyhow::Result<Usage<'a>> {
        let mock_config = {
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -93,9 +93,16 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
    delete:
      description: |
-        Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
+        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
        404 means that deletion successfully finished"
      responses:
        "400":
@@ -134,6 +141,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -178,6 +192,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/timeline/{timeline_id}:
    parameters:
@@ -226,6 +247,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
    delete:
      description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
      responses:
@@ -265,7 +293,74 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/PreconditionFailedError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"

+  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    get:
+      description: Get timestamp for a given LSN
+      parameters:
+        - name: lsn
+          in: query
+          required: true
+          schema:
+            type: integer
+          description: A LSN to get the timestamp
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                type: string
+                format: date-time
+        "400":
+          description: Error when no tenant id found in path, no timeline id or invalid timestamp
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "404":
+          description: Timeline not found, or there is no timestamp information for the given lsn
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
        "500":
          description: Generic operation error
          content:
@@ -328,6 +423,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
    parameters:
      - name: tenant_id
@@ -375,6 +477,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/{tenant_id}/attach:
    parameters:
      - name: tenant_id
@@ -465,6 +574,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/detach:
    parameters:
@@ -518,6 +634,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/ignore:
    parameters:
@@ -560,6 +683,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/load:
    parameters:
@@ -604,6 +734,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
@@ -641,6 +778,12 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/size:
    parameters:
@@ -704,6 +847,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/timeline/:
    parameters:
@@ -780,6 +930,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/:
    get:
      description: Get tenants list
@@ -810,6 +967,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
    post:
      description: |
        Create a tenant. Returns new tenant id on success.
@@ -860,6 +1024,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/config:
    put:
@@ -905,6 +1076,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/{tenant_id}/config/:
    parameters:
      - name: tenant_id
@@ -954,6 +1132,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
 components:
  securitySchemes:
    JWT:
@@ -1220,6 +1405,13 @@ components:
      properties:
        msg:
          type: string
+    ServiceUnavailableError:
+      type: object
+      required:
+        - msg
+      properties:
+        msg:
+          type: string
    NotFoundError:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2,15 +2,19 @@
 //! Management HTTP API
 //!
 use std::collections::HashMap;
+use std::str::FromStr;
 use std::sync::Arc;

 use anyhow::{anyhow, Context, Result};
 use futures::TryFutureExt;
+use humantime::format_rfc3339;
+use hyper::header::CONTENT_TYPE;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::{
-    DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
+    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
+    TenantLoadRequest, TenantLocationConfigRequest,
 };
 use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
@@ -29,7 +33,7 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
-use crate::tenant::config::TenantConfOpt;
+use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
 };
@@ -75,7 +79,7 @@ impl State {
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
        deletion_queue_client: DeletionQueueClient,
    ) -> anyhow::Result<Self> {
-        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
+        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
            .iter()
            .map(|v| v.parse().unwrap())
            .collect::<Vec<_>>();
@@ -132,11 +136,9 @@ impl From<PageReconstructError> for ApiError {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
            PageReconstructError::AncestorStopping(_) => {
-                ApiError::InternalServerError(anyhow::Error::new(pre))
-            }
-            PageReconstructError::WalRedo(pre) => {
-                ApiError::InternalServerError(anyhow::Error::new(pre))
+                ApiError::ResourceUnavailable(format!("{pre}").into())
            }
+            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
    }
 }
@@ -145,12 +147,15 @@ impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
-                ApiError::InternalServerError(anyhow::Error::new(tmie))
+                ApiError::ResourceUnavailable(format!("{tmie}").into())
            }
            TenantMapInsertError::TenantAlreadyExists(id, state) => {
                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
            }
-            TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
+            TenantMapInsertError::TenantExistsSecondary(id) => {
+                ApiError::Conflict(format!("tenant {id} already exists as secondary"))
+            }
+            TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
        }
    }
 }
@@ -159,6 +164,9 @@ impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            TenantStateError::IsStopping(_) => {
+                ApiError::ResourceUnavailable("Tenant is stopping".into())
+            }
            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
    }
@@ -168,14 +176,17 @@ impl From<GetTenantError> for ApiError {
    fn from(tse: GetTenantError) -> ApiError {
        match tse {
            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
-            e @ GetTenantError::NotActive(_) => {
+            GetTenantError::Broken(reason) => {
+                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
+            }
+            GetTenantError::NotActive(_) => {
                // Why is this not `ApiError::NotFound`?
                // Because we must be careful to never return 404 for a tenant if it does
                // in fact exist locally. If we did, the caller could draw the conclusion
                // that it can attach the tenant to another PS and we'd be in split-brain.
                //
                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
-                ApiError::InternalServerError(anyhow::Error::new(e))
+                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
        }
    }
@@ -382,6 +393,9 @@ async fn timeline_create_handler(
                    format!("{err:#}")
                ))
            }
+            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
+                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
+            }
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
@@ -490,6 +504,33 @@ async fn get_lsn_by_timestamp_handler(
    json_response(StatusCode::OK, result)
 }

+async fn get_timestamp_of_lsn_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let lsn_str = must_get_query_param(&request, "lsn")?;
+    let lsn = Lsn::from_str(&lsn_str)
+        .with_context(|| format!("Invalid LSN: {lsn_str:?}"))
+        .map_err(ApiError::BadRequest)?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+    let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
+
+    match result {
+        Some(time) => {
+            let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
+            json_response(StatusCode::OK, time)
+        }
+        None => json_response(StatusCode::NOT_FOUND, ()),
+    }
+}
+
 async fn tenant_attach_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -558,9 +599,14 @@ async fn tenant_detach_handler(

    let state = get_state(&request);
    let conf = state.conf;
-    mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
-        .instrument(info_span!("tenant_detach", %tenant_id))
-        .await?;
+    mgr::detach_tenant(
+        conf,
+        tenant_id,
+        detach_ignored.unwrap_or(false),
+        &state.deletion_queue_client,
+    )
+    .instrument(info_span!("tenant_detach", %tenant_id))
+    .await?;

    json_response(StatusCode::OK, ())
 }
@@ -622,8 +668,9 @@ async fn tenant_list_handler(
    let response_data = mgr::list_tenants()
        .instrument(info_span!("tenant_list"))
        .await
-        .map_err(anyhow::Error::new)
-        .map_err(ApiError::InternalServerError)?
+        .map_err(|_| {
+            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
+        })?
        .iter()
        .map(|(id, state)| TenantInfo {
            id: *id,
@@ -1001,6 +1048,56 @@ async fn update_tenant_config_handler(
    json_response(StatusCode::OK, ())
 }

+async fn put_tenant_location_config_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
+    let tenant_id = request_data.tenant_id;
+    check_permission(&request, Some(tenant_id))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+    let state = get_state(&request);
+    let conf = state.conf;
+
+    // The `Detached` state is special, it doesn't upsert a tenant, it removes
+    // its local disk content and drops it from memory.
+    if let LocationConfigMode::Detached = request_data.config.mode {
+        if let Err(e) = mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
+            .instrument(info_span!("tenant_detach", %tenant_id))
+            .await
+        {
+            match e {
+                TenantStateError::NotFound(_) => {
+                    // This API is idempotent: a NotFound on a detach is fine.
+                }
+                _ => return Err(e.into()),
+            }
+        }
+        return json_response(StatusCode::OK, ());
+    }
+
+    let location_conf =
+        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
+
+    mgr::upsert_location(
+        state.conf,
+        tenant_id,
+        location_conf,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
+        &ctx,
+    )
+    .await
+    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+    // principle we might have hit something like concurrent API calls to the same tenant,
+    // which is not a 400 but a 409.
+    .map_err(ApiError::BadRequest)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
 async fn handle_tenant_break(
    r: Request<Body>,
@@ -1180,6 +1277,136 @@ async fn deletion_queue_flush(
    }
 }

+/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
+async fn getpage_at_lsn_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    struct Key(crate::repository::Key);
+
+    impl std::str::FromStr for Key {
+        type Err = anyhow::Error;
+
+        fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+            crate::repository::Key::from_hex(s).map(Key)
+        }
+    }
+
+    let key: Key = parse_query_param(&request, "key")?
+        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?;
+    let lsn: Lsn = parse_query_param(&request, "lsn")?
+        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
+
+    async {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+
+        let page = timeline.get(key.0, lsn, &ctx).await?;
+
+        Result::<_, ApiError>::Ok(
+            Response::builder()
+                .status(StatusCode::OK)
+                .header(CONTENT_TYPE, "application/octet-stream")
+                .body(hyper::Body::from(page))
+                .unwrap(),
+        )
+    }
+    .instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
+    .await
+}
+
+async fn timeline_collect_keyspace(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    struct Partitioning {
+        keys: crate::keyspace::KeySpace,
+
+        at_lsn: Lsn,
+    }
+
+    impl serde::Serialize for Partitioning {
+        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            use serde::ser::SerializeMap;
+            let mut map = serializer.serialize_map(Some(2))?;
+            map.serialize_key("keys")?;
+            map.serialize_value(&KeySpace(&self.keys))?;
+            map.serialize_key("at_lsn")?;
+            map.serialize_value(&WithDisplay(&self.at_lsn))?;
+            map.end()
+        }
+    }
+
+    struct WithDisplay<'a, T>(&'a T);
+
+    impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
+        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            serializer.collect_str(&self.0)
+        }
+    }
+
+    struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
+
+    impl<'a> serde::Serialize for KeySpace<'a> {
+        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            use serde::ser::SerializeSeq;
+            let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
+            for kr in &self.0.ranges {
+                seq.serialize_element(&KeyRange(kr))?;
+            }
+            seq.end()
+        }
+    }
+
+    struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
+
+    impl<'a> serde::Serialize for KeyRange<'a> {
+        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            use serde::ser::SerializeTuple;
+            let mut t = serializer.serialize_tuple(2)?;
+            t.serialize_element(&WithDisplay(&self.0.start))?;
+            t.serialize_element(&WithDisplay(&self.0.end))?;
+            t.end()
+        }
+    }
+
+    let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
+
+    async {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
+        let keys = timeline
+            .collect_keyspace(at_lsn, &ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+
+        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
+    }
+    .instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
+    .await
+}
+
 async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -1454,6 +1681,9 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_id/config", |r| {
            api_handler(r, get_tenant_config_handler)
        })
+        .put("/v1/tenant/:tenant_id/location_config", |r| {
+            api_handler(r, put_tenant_location_config_handler)
+        })
        .get("/v1/tenant/:tenant_id/timeline", |r| {
            api_handler(r, timeline_list_handler)
        })
@@ -1479,6 +1709,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
            |r| api_handler(r, get_lsn_by_timestamp_handler),
        )
+        .get(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
+            |r| api_handler(r, get_timestamp_of_lsn_handler),
+        )
        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
            api_handler(r, timeline_gc_handler)
        })
@@ -1524,5 +1758,12 @@ pub fn make_router(
        .post("/v1/tracing/event", |r| {
            testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
        })
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
+            testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
+        })
+        .get(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
+            |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
+        )
        .any(handler_404))
 }
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -6,6 +6,7 @@ use std::path::{Path, PathBuf};

 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
+use camino::Utf8Path;
 use futures::StreamExt;
 use tokio::io::{AsyncRead, AsyncReadExt};
 use tokio_tar::Archive;
@@ -29,7 +30,7 @@ use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE};
 use utils::lsn::Lsn;

 // Returns checkpoint LSN from controlfile
-pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
+pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
    // Read control file to extract the LSN
    let controlfile_path = path.join("global").join("pg_control");
    let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?;
@@ -46,7 +47,7 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
 /// cluster was not shut down cleanly.
 pub async fn import_timeline_from_postgres_datadir(
    tline: &Timeline,
-    pgdata_path: &Path,
+    pgdata_path: &Utf8Path,
    pgdata_lsn: Lsn,
    ctx: &RequestContext,
 ) -> Result<()> {
@@ -256,7 +257,7 @@ async fn import_slru(
 /// Scan PostgreSQL WAL files in given directory and load all records between
 /// 'startpoint' and 'endpoint' into the repository.
 async fn import_wal(
-    walpath: &Path,
+    walpath: &Utf8Path,
    tline: &Timeline,
    startpoint: Lsn,
    endpoint: Lsn,
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -25,9 +25,8 @@ pub mod walredo;

 pub mod failpoint_support;

-use std::path::Path;
-
 use crate::task_mgr::TaskKind;
+use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
 use tracing::info;

@@ -113,6 +112,10 @@ pub const METADATA_FILE_NAME: &str = "metadata";
 /// Full path: `tenants/<tenant_id>/config`.
 pub const TENANT_CONFIG_NAME: &str = "config";

+/// Per-tenant configuration file.
+/// Full path: `tenants/<tenant_id>/config`.
+pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
+
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
 pub const TEMP_FILE_SUFFIX: &str = "___temp";
@@ -132,25 +135,25 @@ pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
 /// Full path: `tenants/<tenant_id>/___ignored_tenant`.
 pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";

-pub fn is_temporary(path: &Path) -> bool {
+pub fn is_temporary(path: &Utf8Path) -> bool {
    match path.file_name() {
-        Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
+        Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
        None => false,
    }
 }

-fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
+fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
    match path.file_name() {
-        Some(name) => name.to_string_lossy().ends_with(suffix),
+        Some(name) => name.ends_with(suffix),
        None => false,
    }
 }

-pub fn is_uninit_mark(path: &Path) -> bool {
+pub fn is_uninit_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
 }

-pub fn is_delete_mark(path: &Path) -> bool {
+pub fn is_delete_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
 }

@@ -170,6 +173,9 @@ fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
 /// delaying is needed.
 #[derive(Clone)]
 pub struct InitializationOrder {
+    /// Each initial tenant load task carries this until it is done loading timelines from remote storage
+    pub initial_tenant_load_remote: Option<utils::completion::Completion>,
+
    /// Each initial tenant load task carries this until completion.
    pub initial_tenant_load: Option<utils::completion::Completion>,

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -94,15 +94,35 @@ pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
 });

 // Metrics collected on operations on the storage repository.
-pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+
+pub(crate) struct ReconstructTimeMetrics {
+    ok: Histogram,
+    err: Histogram,
+}
+
+pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
+    let inner = register_histogram_vec!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
+        &["result"],
        CRITICAL_OP_BUCKETS.into(),
    )
-    .expect("failed to define a metric")
+    .expect("failed to define a metric");
+    ReconstructTimeMetrics {
+        ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
+        err: inner.get_metric_with_label_values(&["err"]).unwrap(),
+    }
 });

+impl ReconstructTimeMetrics {
+    pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
+        match result {
+            Ok(_) => &self.ok,
+            Err(_) => &self.err,
+        }
+    }
+}
+
 pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_direct_total",
@@ -264,6 +284,46 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
    },
 });

+pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_page_cache_acquire_pinned_slot_seconds",
+        "Time spent acquiring a pinned slot in the page cache",
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_page_cache_find_victim_iters_total",
+        "Counter for the number of iterations in the find_victim loop",
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "page_cache_errors_total",
+        "Number of timeouts while acquiring a pinned slot in the page cache",
+        &["error_kind"]
+    )
+    .expect("failed to define a metric")
+});
+
+#[derive(IntoStaticStr)]
+#[strum(serialize_all = "kebab_case")]
+pub(crate) enum PageCacheErrorKind {
+    AcquirePinnedSlotTimeout,
+    EvictIterLimit,
+}
+
+pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
+    PAGE_CACHE_ERRORS
+        .get_metric_with_label_values(&[error_kind.into()])
+        .unwrap()
+        .inc();
+}
+
 pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wait_lsn_seconds",
@@ -631,10 +691,9 @@ impl StorageIoTime {
        .expect("failed to define a metric");
        let metrics = std::array::from_fn(|i| {
            let op = StorageIoOperation::from_repr(i).unwrap();
-            let metric = storage_io_histogram_vec
+            storage_io_histogram_vec
                .get_metric_with_label_values(&[op.as_str()])
-                .unwrap();
-            metric
+                .unwrap()
        });
        Self { metrics }
    }
@@ -907,6 +966,7 @@ pub(crate) struct DeletionQueueMetrics {
    pub(crate) keys_submitted: IntCounter,
    pub(crate) keys_dropped: IntCounter,
    pub(crate) keys_executed: IntCounter,
+    pub(crate) keys_validated: IntCounter,
    pub(crate) dropped_lsn_updates: IntCounter,
    pub(crate) unexpected_errors: IntCounter,
    pub(crate) remote_errors: IntCounterVec,
@@ -928,7 +988,13 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {

    keys_executed: register_int_counter!(
        "pageserver_deletion_queue_executed_total",
-        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
+        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
+    )
+    .expect("failed to define a metric"),
+
+    keys_validated: register_int_counter!(
+        "pageserver_deletion_queue_validated_total",
+        "Number of keys validated for deletion.  Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
    )
    .expect("failed to define a metric"),

@@ -1001,6 +1067,26 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("Failed to register tenant_task_events metric")
 });

+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
+    Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_background_loop_semaphore_wait_start_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls started",
+            &["task"],
+        )
+        .unwrap()
+    });
+
+pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
+    Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_background_loop_semaphore_wait_finish_count",
+            "Counter for background loop concurrency-limiting semaphore acquire calls finished",
+            &["task"],
+        )
+        .unwrap()
+    });
+
 pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_background_loop_period_overrun_count",
@@ -1816,7 +1902,6 @@ pub fn preinitialize_metrics() {
    // histograms
    [
        &READ_NUM_FS_LAYERS,
-        &RECONSTRUCT_TIME,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
        &WAL_REDO_WAIT_TIME,
@@ -1827,4 +1912,7 @@ pub fn preinitialize_metrics() {
    .for_each(|h| {
        Lazy::force(h);
    });
+
+    // Custom
+    Lazy::force(&RECONSTRUCT_TIME);
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -66,8 +66,7 @@
 //! inserted to the mapping, but you must hold the write-lock on the slot until
 //! the contents are valid. If you need to release the lock without initializing
 //! the contents, you must remove the mapping first. We make that easy for the
-//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
-//! page, the caller must explicitly call guard.mark_valid() after it has
+//! callers with PageWriteGuard: the caller must explicitly call guard.mark_valid() after it has
 //! initialized it. If the guard is dropped without calling mark_valid(), the
 //! mapping is automatically removed and the slot is marked free.
 //!
@@ -75,7 +74,11 @@
 use std::{
    collections::{hash_map::Entry, HashMap},
    convert::TryInto,
-    sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+    sync::{
+        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+        Arc, Weak,
+    },
+    time::Duration,
 };

 use anyhow::Context;
@@ -165,6 +168,8 @@ struct Slot {

 struct SlotInner {
    key: Option<CacheKey>,
+    // for `coalesce_readers_permit`
+    permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
    buf: &'static mut [u8; PAGE_SZ],
 }

@@ -207,6 +212,22 @@ impl Slot {
    }
 }

+impl SlotInner {
+    /// If there is aready a reader, drop our permit and share its permit, just like we share read access.
+    fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
+        let mut guard = self.permit.lock().unwrap();
+        if let Some(existing_permit) = guard.upgrade() {
+            drop(guard);
+            drop(permit);
+            existing_permit
+        } else {
+            let permit = Arc::new(permit);
+            *guard = Arc::downgrade(&permit);
+            permit
+        }
+    }
+}
+
 pub struct PageCache {
    /// This contains the mapping from the cache key to buffer slot that currently
    /// contains the page, if any.
@@ -224,6 +245,8 @@ pub struct PageCache {
    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,

+    pinned_slots: Arc<tokio::sync::Semaphore>,
+
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,
@@ -231,23 +254,28 @@ pub struct PageCache {
    size_metrics: &'static PageCacheSizeMetrics,
 }

+struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
+
 ///
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
 /// until the guard is dropped.
 ///
-pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
+pub struct PageReadGuard<'i> {
+    _permit: Arc<PinnedSlotsPermit>,
+    slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
+}

 impl std::ops::Deref for PageReadGuard<'_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
-        self.0.buf
+        self.slot_guard.buf
    }
 }

 impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
    fn as_ref(&self) -> &[u8; PAGE_SZ] {
-        self.0.buf
+        self.slot_guard.buf
    }
 }

@@ -257,21 +285,25 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 ///
 /// Counterintuitively, this is used even for a read, if the requested page is not
 /// currently found in the page cache. In that case, the caller of lock_for_read()
-/// is expected to fill in the page contents and call mark_valid(). Similarly
-/// lock_for_write() can return an invalid buffer that the caller is expected to
-/// to initialize.
-///
+/// is expected to fill in the page contents and call mark_valid().
 pub struct PageWriteGuard<'i> {
-    inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
+    state: PageWriteGuardState<'i>,
+}

-    // Are the page contents currently valid?
-    // Used to mark pages as invalid that are assigned but not yet filled with data.
-    valid: bool,
+enum PageWriteGuardState<'i> {
+    Invalid {
+        inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
+        _permit: PinnedSlotsPermit,
+    },
+    Downgraded,
 }

 impl std::ops::DerefMut for PageWriteGuard<'_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.inner.buf
+        match &mut self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
+            PageWriteGuardState::Downgraded => unreachable!(),
+        }
    }
 }

@@ -279,25 +311,28 @@ impl std::ops::Deref for PageWriteGuard<'_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
-        self.inner.buf
+        match &self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
+            PageWriteGuardState::Downgraded => unreachable!(),
+        }
    }
 }

-impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
-    fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
-        self.inner.buf
-    }
-}
-
-impl PageWriteGuard<'_> {
+impl<'a> PageWriteGuard<'a> {
    /// Mark that the buffer contents are now valid.
-    pub fn mark_valid(&mut self) {
-        assert!(self.inner.key.is_some());
-        assert!(
-            !self.valid,
-            "mark_valid called on a buffer that was already valid"
-        );
-        self.valid = true;
+    #[must_use]
+    pub fn mark_valid(mut self) -> PageReadGuard<'a> {
+        let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
+        match prev {
+            PageWriteGuardState::Invalid { inner, _permit } => {
+                assert!(inner.key.is_some());
+                PageReadGuard {
+                    _permit: Arc::new(_permit),
+                    slot_guard: inner.downgrade(),
+                }
+            }
+            PageWriteGuardState::Downgraded => unreachable!(),
+        }
    }
 }

@@ -308,11 +343,14 @@ impl Drop for PageWriteGuard<'_> {
    /// initializing it, remove the mapping from the page cache.
    ///
    fn drop(&mut self) {
-        assert!(self.inner.key.is_some());
-        if !self.valid {
-            let self_key = self.inner.key.as_ref().unwrap();
-            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
-            self.inner.key = None;
+        match &mut self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => {
+                assert!(inner.key.is_some());
+                let self_key = inner.key.as_ref().unwrap();
+                PAGE_CACHE.get().unwrap().remove_mapping(self_key);
+                inner.key = None;
+            }
+            PageWriteGuardState::Downgraded => {}
        }
    }
 }
@@ -323,12 +361,6 @@ pub enum ReadBufResult<'a> {
    NotFound(PageWriteGuard<'a>),
 }

-/// lock_for_write() return value
-pub enum WriteBufResult<'a> {
-    Found(PageWriteGuard<'a>),
-    NotFound(PageWriteGuard<'a>),
-}
-
 impl PageCache {
    //
    // Section 1.1: Public interface functions for looking up and memorizing materialized page
@@ -348,6 +380,10 @@ impl PageCache {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Option<(Lsn, PageReadGuard)> {
+        let Ok(permit) = self.try_get_pinned_slot_permit().await else {
+            return None;
+        };
+
        crate::metrics::PAGE_CACHE
            .for_ctx(ctx)
            .read_accesses_materialized_page
@@ -362,7 +398,10 @@ impl PageCache {
            lsn,
        };

-        if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
+        if let Some(guard) = self
+            .try_lock_for_read(&mut cache_key, &mut Some(permit))
+            .await
+        {
            if let CacheKey::MaterializedPage {
                hash_key: _,
                lsn: available_lsn,
@@ -408,20 +447,77 @@ impl PageCache {
            lsn,
        };

-        match self.lock_for_write(&cache_key).await? {
-            WriteBufResult::Found(write_guard) => {
-                // We already had it in cache. Another thread must've put it there
-                // concurrently. Check that it had the same contents that we
-                // replayed.
-                assert!(*write_guard == img);
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+        loop {
+            // First check if the key already exists in the cache.
+            if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
+                // The page was found in the mapping. Lock the slot, and re-check
+                // that it's still what we expected (because we don't released the mapping
+                // lock already, another thread could have evicted the page)
+                let slot = &self.slots[slot_idx];
+                let inner = slot.inner.write().await;
+                if inner.key.as_ref() == Some(&cache_key) {
+                    slot.inc_usage_count();
+                    debug_assert!(
+                        {
+                            let guard = inner.permit.lock().unwrap();
+                            guard.upgrade().is_none()
+                        },
+                        "we hold a write lock, so, no one else should have a permit"
+                    );
+                    debug_assert_eq!(inner.buf.len(), img.len());
+                    // We already had it in cache. Another thread must've put it there
+                    // concurrently. Check that it had the same contents that we
+                    // replayed.
+                    assert!(inner.buf == img);
+                    return Ok(());
+                }
            }
-            WriteBufResult::NotFound(mut write_guard) => {
-                write_guard.copy_from_slice(img);
-                write_guard.mark_valid();
-            }
-        }
+            debug_assert!(permit.is_some());

-        Ok(())
+            // Not found. Find a victim buffer
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;
+
+            // Insert mapping for this. At this point, we may find that another
+            // thread did the same thing concurrently. In that case, we evicted
+            // our victim buffer unnecessarily. Put it into the free list and
+            // continue with the slot that the other thread chose.
+            if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
+                // TODO: put to free list
+
+                // We now just loop back to start from beginning. This is not
+                // optimal, we'll perform the lookup in the mapping again, which
+                // is not really necessary because we already got
+                // 'existing_slot_idx'.  But this shouldn't happen often enough
+                // to matter much.
+                continue;
+            }
+
+            // Make the slot ready
+            let slot = &self.slots[slot_idx];
+            inner.key = Some(cache_key.clone());
+            slot.set_usage_count(1);
+            // Create a write guard for the slot so we go through the expected motions.
+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
+                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+            let mut write_guard = PageWriteGuard {
+                state: PageWriteGuardState::Invalid {
+                    _permit: permit.take().unwrap(),
+                    inner,
+                },
+            };
+            write_guard.copy_from_slice(img);
+            let _ = write_guard.mark_valid();
+            return Ok(());
+        }
    }

    // Section 1.2: Public interface functions for working with immutable file pages.
@@ -445,6 +541,29 @@ impl PageCache {
    // "mappings" after this section. But the routines in this section should
    // not require changes.

+    async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
+        let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
+        match tokio::time::timeout(
+            // Choose small timeout, neon_smgr does its own retries.
+            // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
+            Duration::from_secs(10),
+            Arc::clone(&self.pinned_slots).acquire_owned(),
+        )
+        .await
+        {
+            Ok(res) => Ok(PinnedSlotsPermit(
+                res.expect("this semaphore is never closed"),
+            )),
+            Err(_timeout) => {
+                timer.stop_and_discard();
+                crate::metrics::page_cache_errors_inc(
+                    crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
+                );
+                anyhow::bail!("timeout: there were page guards alive for all page cache slots")
+            }
+        }
+    }
+
    /// Look up a page in the cache.
    ///
    /// If the search criteria is not exact, *cache_key is updated with the key
@@ -454,7 +573,11 @@ impl PageCache {
    ///
    /// If no page is found, returns None and *cache_key is left unmodified.
    ///
-    async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
+    async fn try_lock_for_read(
+        &self,
+        cache_key: &mut CacheKey,
+        permit: &mut Option<PinnedSlotsPermit>,
+    ) -> Option<PageReadGuard> {
        let cache_key_orig = cache_key.clone();
        if let Some(slot_idx) = self.search_mapping(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
@@ -464,7 +587,10 @@ impl PageCache {
            let inner = slot.inner.read().await;
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
-                return Some(PageReadGuard(inner));
+                return Some(PageReadGuard {
+                    _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
+                    slot_guard: inner,
+                });
            } else {
                // search_mapping might have modified the search key; restore it.
                *cache_key = cache_key_orig;
@@ -507,6 +633,8 @@ impl PageCache {
        cache_key: &mut CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+
        let (read_access, hit) = match cache_key {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
@@ -523,17 +651,21 @@ impl PageCache {
        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
-            if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
+            if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
+                debug_assert!(permit.is_none());
                if is_first_iteration {
                    hit.inc();
                }
                return Ok(ReadBufResult::Found(read_guard));
            }
+            debug_assert!(permit.is_some());
            is_first_iteration = false;

            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) =
-                self.find_victim().context("Failed to find evict victim")?;
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;

            // Insert mapping for this. At this point, we may find that another
            // thread did the same thing concurrently. In that case, we evicted
@@ -555,70 +687,19 @@ impl PageCache {
            inner.key = Some(cache_key.clone());
            slot.set_usage_count(1);

+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
+                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+
            return Ok(ReadBufResult::NotFound(PageWriteGuard {
-                inner,
-                valid: false,
-            }));
-        }
-    }
-
-    /// Look up a page in the cache and lock it in write mode. If it's not
-    /// found, returns None.
-    ///
-    /// When locking a page for writing, the search criteria is always "exact".
-    async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
-        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
-            // The page was found in the mapping. Lock the slot, and re-check
-            // that it's still what we expected (because we don't released the mapping
-            // lock already, another thread could have evicted the page)
-            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.write().await;
-            if inner.key.as_ref() == Some(cache_key) {
-                slot.inc_usage_count();
-                return Some(PageWriteGuard { inner, valid: true });
-            }
-        }
-        None
-    }
-
-    /// Return a write-locked buffer for given block.
-    ///
-    /// Similar to lock_for_read(), but the returned buffer is write-locked and
-    /// may be modified by the caller even if it's already found in the cache.
-    async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
-        loop {
-            // First check if the key already exists in the cache.
-            if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
-                return Ok(WriteBufResult::Found(write_guard));
-            }
-
-            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) =
-                self.find_victim().context("Failed to find evict victim")?;
-
-            // Insert mapping for this. At this point, we may find that another
-            // thread did the same thing concurrently. In that case, we evicted
-            // our victim buffer unnecessarily. Put it into the free list and
-            // continue with the slot that the other thread chose.
-            if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
-                // TODO: put to free list
-
-                // We now just loop back to start from beginning. This is not
-                // optimal, we'll perform the lookup in the mapping again, which
-                // is not really necessary because we already got
-                // 'existing_slot_idx'.  But this shouldn't happen often enough
-                // to matter much.
-                continue;
-            }
-
-            // Make the slot ready
-            let slot = &self.slots[slot_idx];
-            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
-
-            return Ok(WriteBufResult::NotFound(PageWriteGuard {
-                inner,
-                valid: false,
+                state: PageWriteGuardState::Invalid {
+                    _permit: permit.take().unwrap(),
+                    inner,
+                },
            }));
        }
    }
@@ -663,7 +744,7 @@ impl PageCache {
    ///
    /// Like 'search_mapping, but performs an "exact" search. Used for
    /// allocating a new buffer.
-    fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
+    fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
        match key {
            CacheKey::MaterializedPage { hash_key, lsn } => {
                let map = self.materialized_page_map.read().unwrap();
@@ -769,7 +850,10 @@ impl PageCache {
    /// Find a slot to evict.
    ///
    /// On return, the slot is empty and write-locked.
-    fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
+    async fn find_victim(
+        &self,
+        _permit_witness: &PinnedSlotsPermit,
+    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
        let iter_limit = self.slots.len() * 10;
        let mut iters = 0;
        loop {
@@ -782,13 +866,40 @@ impl PageCache {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
                    Err(_err) => {
-                        // If we have looped through the whole buffer pool 10 times
-                        // and still haven't found a victim buffer, something's wrong.
-                        // Maybe all the buffers were in locked. That could happen in
-                        // theory, if you have more threads holding buffers locked than
-                        // there are buffers in the pool. In practice, with a reasonably
-                        // large buffer pool it really shouldn't happen.
                        if iters > iter_limit {
+                            // NB: Even with the permits, there's no hard guarantee that we will find a slot with
+                            // any particular number of iterations: other threads might race ahead and acquire and
+                            // release pins just as we're scanning the array.
+                            //
+                            // Imagine that nslots is 2, and as starting point, usage_count==1 on all
+                            // slots. There are two threads running concurrently, A and B. A has just
+                            // acquired the permit from the semaphore.
+                            //
+                            //   A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2, decrement its usage_count to zero and continue the search
+                            //   B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //
+                            // Now we're back in the starting situation that both slots have
+                            // usage_count 1, but A has now been through one iteration of the
+                            // find_victim() loop. This can repeat indefinitely and on each
+                            // iteration, A's iteration count increases by one.
+                            //
+                            // So, even though the semaphore for the permits is fair, the victim search
+                            // itself happens in parallel and is not fair.
+                            // Hence even with a permit, a task can theoretically be starved.
+                            // To avoid this, we'd need tokio to give priority to tasks that are holding
+                            // permits for longer.
+                            // Note that just yielding to tokio during iteration without such
+                            // priority boosting is likely counter-productive. We'd just give more opportunities
+                            // for B to bump usage count, further starving A.
+                            crate::metrics::page_cache_errors_inc(
+                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+                            );
                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
@@ -799,6 +910,7 @@ impl PageCache {
                    self.remove_mapping(old_key);
                    inner.key = None;
                }
+                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
                return Ok((slot_idx, inner));
            }
        }
@@ -826,7 +938,11 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
+                    inner: tokio::sync::RwLock::new(SlotInner {
+                        key: None,
+                        buf,
+                        permit: std::sync::Mutex::new(Weak::new()),
+                    }),
                    usage_count: AtomicU8::new(0),
                }
            })
@@ -838,6 +954,7 @@ impl PageCache {
            slots,
            next_evict_slot: AtomicUsize::new(0),
            size_metrics,
+            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
        }
    }
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -35,6 +35,7 @@ use std::time::Duration;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
+use tokio_util::sync::CancellationToken;
 use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
@@ -64,69 +65,6 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
-where
-    IO: AsyncRead + AsyncWrite + Unpin,
-{
-    async_stream::try_stream! {
-        loop {
-            let msg = tokio::select! {
-                biased;
-
-                _ = task_mgr::shutdown_watcher() => {
-                    // We were requested to shut down.
-                    let msg = "pageserver is shutting down";
-                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
-                    Err(QueryError::Other(anyhow::anyhow!(msg)))
-                }
-
-                msg = pgb.read_message() => { msg.map_err(QueryError::from)}
-            };
-
-            match msg {
-                Ok(Some(message)) => {
-                    let copy_data_bytes = match message {
-                        FeMessage::CopyData(bytes) => bytes,
-                        FeMessage::CopyDone => { break },
-                        FeMessage::Sync => continue,
-                        FeMessage::Terminate => {
-                            let msg = "client terminated connection with Terminate message during COPY";
-                            let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                            // error can't happen here, ErrorResponse serialization should be always ok
-                            pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                            Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                            break;
-                        }
-                        m => {
-                            let msg = format!("unexpected message {m:?}");
-                            // error can't happen here, ErrorResponse serialization should be always ok
-                            pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
-                            Err(io::Error::new(io::ErrorKind::Other, msg))?;
-                            break;
-                        }
-                    };
-
-                    yield copy_data_bytes;
-                }
-                Ok(None) => {
-                    let msg = "client closed connection during COPY";
-                    let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                    // error can't happen here, ErrorResponse serialization should be always ok
-                    pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                    pgb.flush().await?;
-                    Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                }
-                Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
-                    Err(io_error)?;
-                }
-                Err(other) => {
-                    Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
-                }
-            };
-        }
-    }
-}
-
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -184,6 +122,7 @@ pub async fn libpq_listener_main(
    listener: TcpListener,
    auth_type: AuthType,
    listener_ctx: RequestContext,
+    cancel: CancellationToken,
 ) -> anyhow::Result<()> {
    listener.set_nonblocking(true)?;
    let tokio_listener = tokio::net::TcpListener::from_std(listener)?;
@@ -192,7 +131,7 @@ pub async fn libpq_listener_main(
    while let Some(res) = tokio::select! {
        biased;

-        _ = task_mgr::shutdown_watcher() => {
+        _ = cancel.cancelled() => {
            // We were requested to shut down.
            None
        }
@@ -284,7 +223,13 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
+    let mut conn_handler = PageServerHandler::new(
+        conf,
+        broker_client,
+        auth,
+        connection_ctx,
+        task_mgr::shutdown_token(),
+    );
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
@@ -318,6 +263,10 @@ struct PageServerHandler {
    /// For each query received over the connection,
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,
+
+    /// A token that should fire when the tenant transitions from
+    /// attached state, or when the pageserver is shutting down.
+    cancel: CancellationToken,
 }

 impl PageServerHandler {
@@ -326,6 +275,7 @@ impl PageServerHandler {
        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<JwtAuth>>,
        connection_ctx: RequestContext,
+        cancel: CancellationToken,
    ) -> Self {
        PageServerHandler {
            _conf: conf,
@@ -333,6 +283,91 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
+            cancel,
+        }
+    }
+
+    /// Wrap PostgresBackend::flush to respect our CancellationToken: it is important to use
+    /// this rather than naked flush() in order to shut down promptly.  Without this, we would
+    /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
+    /// in the flush.
+    async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        tokio::select!(
+            flush_r = pgb.flush() => {
+                Ok(flush_r?)
+            },
+            _ = self.cancel.cancelled() => {
+                Err(QueryError::Shutdown)
+            }
+        )
+    }
+
+    fn copyin_stream<'a, IO>(
+        &'a self,
+        pgb: &'a mut PostgresBackend<IO>,
+    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        async_stream::try_stream! {
+            loop {
+                let msg = tokio::select! {
+                    biased;
+
+                    _ = self.cancel.cancelled() => {
+                        // We were requested to shut down.
+                        let msg = "pageserver is shutting down";
+                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
+                        Err(QueryError::Shutdown)
+                    }
+
+                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
+                };
+
+                match msg {
+                    Ok(Some(message)) => {
+                        let copy_data_bytes = match message {
+                            FeMessage::CopyData(bytes) => bytes,
+                            FeMessage::CopyDone => { break },
+                            FeMessage::Sync => continue,
+                            FeMessage::Terminate => {
+                                let msg = "client terminated connection with Terminate message during COPY";
+                                let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                                // error can't happen here, ErrorResponse serialization should be always ok
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                                break;
+                            }
+                            m => {
+                                let msg = format!("unexpected message {m:?}");
+                                // error can't happen here, ErrorResponse serialization should be always ok
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                                break;
+                            }
+                        };
+
+                        yield copy_data_bytes;
+                    }
+                    Ok(None) => {
+                        let msg = "client closed connection during COPY";
+                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                        // error can't happen here, ErrorResponse serialization should be always ok
+                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                        self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                    }
+                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
+                        Err(io_error)?;
+                    }
+                    Err(other) => {
+                        Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
+                    }
+                };
+            }
        }
    }

@@ -372,7 +407,7 @@ impl PageServerHandler {

        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

@@ -380,10 +415,10 @@ impl PageServerHandler {
            let msg = tokio::select! {
                biased;

-                _ = task_mgr::shutdown_watcher() => {
+                _ = self.cancel.cancelled() => {
                    // We were requested to shut down.
                    info!("shutdown request received in page handler");
-                    break;
+                    return Err(QueryError::Shutdown)
                }

                msg = pgb.read_message() => { msg }
@@ -412,38 +447,60 @@ impl PageServerHandler {
            // TODO: We could create a new per-request context here, with unique ID.
            // Currently we use the same per-timeline context for all requests

-            let response = match neon_fe_msg {
+            let (response, span) = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
-                    self.handle_get_rel_exists_request(&timeline, &req, &ctx)
-                        .await
+                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
+                    (
+                        self.handle_get_rel_exists_request(&timeline, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
+                        span,
+                    )
                }
                PagestreamFeMessage::Nblocks(req) => {
                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
-                    self.handle_get_nblocks_request(&timeline, &req, &ctx).await
+                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
+                    (
+                        self.handle_get_nblocks_request(&timeline, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
+                        span,
+                    )
                }
                PagestreamFeMessage::GetPage(req) => {
                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
-                    self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
-                        .await
+                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
+                    (
+                        self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
+                        span,
+                    )
                }
                PagestreamFeMessage::DbSize(req) => {
                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
-                    self.handle_db_size_request(&timeline, &req, &ctx).await
+                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
+                    (
+                        self.handle_db_size_request(&timeline, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
+                        span,
+                    )
                }
            };

            let response = response.unwrap_or_else(|e| {
                // print the all details to the log with {:#}, but for the client the
                // error message is enough
-                error!("error reading relation or page version: {:?}", e);
+                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
                PagestreamBeMessage::Error(PagestreamErrorResponse {
                    message: e.to_string(),
                })
            });

            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
-            pgb.flush().await?;
+            self.flush_cancellable(pgb).await?;
        }
        Ok(())
    }
@@ -486,9 +543,9 @@ impl PageServerHandler {
        // Import basebackup provided via CopyData
        info!("importing basebackup");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

-        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
        timeline
            .import_basebackup_from_tar(
                &mut copyin_reader,
@@ -541,8 +598,8 @@ impl PageServerHandler {
        // Import wal provided via CopyData
        info!("importing wal");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        pgb.flush().await?;
-        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
+        self.flush_cancellable(pgb).await?;
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
        info!("wal import complete");

@@ -627,7 +684,6 @@ impl PageServerHandler {
        Ok(lsn)
    }

-    #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))]
    async fn handle_get_rel_exists_request(
        &self,
        timeline: &Timeline,
@@ -648,7 +704,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))]
    async fn handle_get_nblocks_request(
        &self,
        timeline: &Timeline,
@@ -667,7 +722,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip(self, timeline, req, ctx), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
    async fn handle_db_size_request(
        &self,
        timeline: &Timeline,
@@ -689,7 +743,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
    async fn handle_get_page_at_lsn_request(
        &self,
        timeline: &Timeline,
@@ -754,7 +807,7 @@ impl PageServerHandler {

        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
        // fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -806,7 +859,7 @@ impl PageServerHandler {
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

        let basebackup_after = started
            .elapsed()
@@ -1265,7 +1318,10 @@ async fn get_active_tenant_with_timeout(
        Ok(tenant) => tenant,
        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
        Err(GetTenantError::NotActive(_)) => {
-            unreachable!("we're calling get_tenant with active=false")
+            unreachable!("we're calling get_tenant with active_only=false")
+        }
+        Err(GetTenantError::Broken(_)) => {
+            unreachable!("we're calling get_tenant with active_only=false")
        }
    };
    let wait_time = Duration::from_secs(30);
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -19,6 +19,7 @@ use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
+use std::ops::ControlFlow;
 use std::ops::Range;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
@@ -370,7 +371,6 @@ impl Timeline {
        }
    }

-    ///
    /// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any
    /// commits that committed after 'search_timestamp', at LSN 'probe_lsn'.
    ///
@@ -385,6 +385,50 @@ impl Timeline {
        found_larger: &mut bool,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
+        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
+            if timestamp >= search_timestamp {
+                *found_larger = true;
+                return ControlFlow::Break(true);
+            } else {
+                *found_smaller = true;
+            }
+            ControlFlow::Continue(())
+        })
+        .await
+    }
+
+    /// Obtain the possible timestamp range for the given lsn.
+    ///
+    /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
+    pub async fn get_timestamp_for_lsn(
+        &self,
+        probe_lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<Option<TimestampTz>, PageReconstructError> {
+        let mut max: Option<TimestampTz> = None;
+        self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
+            if let Some(max_prev) = max {
+                max = Some(max_prev.max(timestamp));
+            } else {
+                max = Some(timestamp);
+            }
+            ControlFlow::Continue(())
+        })
+        .await?;
+
+        Ok(max)
+    }
+
+    /// Runs the given function on all the timestamps for a given lsn
+    ///
+    /// The return value is either given by the closure, or set to the `Default`
+    /// impl's output.
+    async fn map_all_timestamps<T: Default>(
+        &self,
+        probe_lsn: Lsn,
+        ctx: &RequestContext,
+        mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
+    ) -> Result<T, PageReconstructError> {
        for segno in self
            .list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
            .await?
@@ -402,16 +446,14 @@ impl Timeline {
                    timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
                    let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);

-                    if timestamp >= search_timestamp {
-                        *found_larger = true;
-                        return Ok(true);
-                    } else {
-                        *found_smaller = true;
+                    match f(timestamp) {
+                        ControlFlow::Break(b) => return Ok(b),
+                        ControlFlow::Continue(()) => (),
                    }
                }
            }
        }
-        Ok(false)
+        Ok(Default::default())
    }

    /// Get a list of SLRU segments
@@ -499,6 +541,23 @@ impl Timeline {
        self.get(CHECKPOINT_KEY, lsn, ctx).await
    }

+    pub async fn list_aux_files(
+        &self,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
+        match self.get(AUX_FILES_KEY, lsn, ctx).await {
+            Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") {
+                Ok(dir) => Ok(dir.files),
+                Err(e) => Err(PageReconstructError::from(e)),
+            },
+            Err(e) => {
+                warn!("Failed to get info about AUX files: {}", e);
+                Ok(HashMap::new())
+            }
+        }
+    }
+
    /// Does the same as get_current_logical_size but counted on demand.
    /// Used to initialize the logical size tracking on startup.
    ///
@@ -616,6 +675,7 @@ impl Timeline {

        result.add_key(CONTROLFILE_KEY);
        result.add_key(CHECKPOINT_KEY);
+        result.add_key(AUX_FILES_KEY);

        Ok(result.to_keyspace())
    }
@@ -692,6 +752,12 @@ impl<'a> DatadirModification<'a> {
        })?;
        self.put(DBDIR_KEY, Value::Image(buf.into()));

+        // Create AuxFilesDirectory
+        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+            files: HashMap::new(),
+        })?;
+        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+
        let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
            xids: HashSet::new(),
        })?;
@@ -796,6 +862,12 @@ impl<'a> DatadirModification<'a> {
            // 'true', now write the updated 'dbdirs' map back.
            let buf = DbDirectory::ser(&dbdir)?;
            self.put(DBDIR_KEY, Value::Image(buf.into()));
+
+            // Create AuxFilesDirectory as well
+            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+                files: HashMap::new(),
+            })?;
+            self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
        }
        if r.is_none() {
            // Create RelDirectory
@@ -1120,6 +1192,36 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

+    pub async fn put_file(
+        &mut self,
+        path: &str,
+        content: &[u8],
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
+            Ok(buf) => AuxFilesDirectory::des(&buf)?,
+            Err(e) => {
+                warn!("Failed to get info about AUX files: {}", e);
+                AuxFilesDirectory {
+                    files: HashMap::new(),
+                }
+            }
+        };
+        let path = path.to_string();
+        if content.is_empty() {
+            dir.files.remove(&path);
+        } else {
+            dir.files.insert(path, Bytes::copy_from_slice(content));
+        }
+        self.put(
+            AUX_FILES_KEY,
+            Value::Image(Bytes::from(
+                AuxFilesDirectory::ser(&dir).context("serialize")?,
+            )),
+        );
+        Ok(())
+    }
+
    ///
    /// Flush changes accumulated so far to the underlying repository.
    ///
@@ -1255,6 +1357,11 @@ struct RelDirectory {
    rels: HashSet<(Oid, u8)>,
 }

+#[derive(Debug, Serialize, Deserialize, Default)]
+struct AuxFilesDirectory {
+    files: HashMap<String, Bytes>,
+}
+
 #[derive(Debug, Serialize, Deserialize)]
 struct RelSizeEntry {
    nblocks: u32,
@@ -1303,10 +1410,12 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 // 02 pg_twophase
 //
 // 03 misc
-//    controlfile
+//    Controlfile
 //    checkpoint
 //    pg_version
 //
+// 04 aux files
+//
 // Below is a full list of the keyspace allocation:
 //
 // DbDir:
@@ -1344,6 +1453,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 //
 // Checkpoint:
 // 03 00000000 00000000 00000000 00   00000001
+//
+// AuxFiles:
+// 03 00000000 00000000 00000000 00   00000002
+//
+
 //-- Section 01: relation data and metadata

 const DBDIR_KEY: Key = Key {
@@ -1567,6 +1681,15 @@ const CHECKPOINT_KEY: Key = Key {
    field6: 1,
 };

+const AUX_FILES_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 2,
+};
+
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -1,6 +1,6 @@
 //! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking.

-use std::path::Path;
+use camino::Utf8Path;

 pub enum Statvfs {
    Real(nix::sys::statvfs::Statvfs),
@@ -12,11 +12,13 @@ pub enum Statvfs {
 // Sincce it should only be a problem on > 2TiB disks, let's ignore
 // the problem for now and upcast to u64.
 impl Statvfs {
-    pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result<Self> {
+    pub fn get(tenants_dir: &Utf8Path, mocked: Option<&mock::Behavior>) -> nix::Result<Self> {
        if let Some(mocked) = mocked {
            Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?))
        } else {
-            Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?))
+            Ok(Statvfs::Real(nix::sys::statvfs::statvfs(
+                tenants_dir.as_std_path(),
+            )?))
        }
    }

@@ -55,8 +57,8 @@ impl Statvfs {

 pub mod mock {
    use anyhow::Context;
+    use camino::Utf8Path;
    use regex::Regex;
-    use std::path::Path;
    use tracing::log::info;

    #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -86,7 +88,7 @@ pub mod mock {
        }
    }

-    pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result<Statvfs> {
+    pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result<Statvfs> {
        info!("running mocked statvfs");

        match behavior {
@@ -119,7 +121,7 @@ pub mod mock {
        }
    }

-    fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result<u64> {
+    fn walk_dir_disk_usage(path: &Utf8Path, name_filter: Option<&Regex>) -> anyhow::Result<u64> {
        let mut total = 0;
        for entry in walkdir::WalkDir::new(path) {
            let entry = entry?;
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -238,14 +238,14 @@ mod tests {
    use rand::{Rng, SeedableRng};

    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
-        let temp_dir = tempfile::tempdir()?;
-        let path = temp_dir.path().join("file");
+        let temp_dir = camino_tempfile::tempdir()?;
+        let pathbuf = temp_dir.path().join("file");
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
        {
-            let file = VirtualFile::create(&path).await?;
+            let file = VirtualFile::create(pathbuf.as_path()).await?;
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
                let offs = wtr.write_blob(blob).await?;
@@ -258,7 +258,7 @@ mod tests {
            wtr.flush_buffer().await?;
        }

-        let file = VirtualFile::open(&path).await?;
+        let file = VirtualFile::open(pathbuf.as_path()).await?;
        let rdr = BlockReaderRef::VirtualFile(&file);
        let rdr = BlockCursor::new(rdr);
        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -186,26 +186,21 @@ impl FileBlockReader {
        ctx: &RequestContext,
    ) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
-        loop {
-            match cache
-                .read_immutable_buf(self.file_id, blknum, ctx)
-                .await
-                .map_err(|e| {
-                    std::io::Error::new(
-                        std::io::ErrorKind::Other,
-                        format!("Failed to read immutable buf: {e:#}"),
-                    )
-                })? {
-                ReadBufResult::Found(guard) => break Ok(guard.into()),
-                ReadBufResult::NotFound(mut write_guard) => {
-                    // Read the page from disk into the buffer
-                    self.fill_buffer(write_guard.deref_mut(), blknum).await?;
-                    write_guard.mark_valid();
-
-                    // Swap for read lock
-                    continue;
-                }
-            };
+        match cache
+            .read_immutable_buf(self.file_id, blknum, ctx)
+            .await
+            .map_err(|e| {
+                std::io::Error::new(
+                    std::io::ErrorKind::Other,
+                    format!("Failed to read immutable buf: {e:#}"),
+                )
+            })? {
+            ReadBufResult::Found(guard) => Ok(guard.into()),
+            ReadBufResult::NotFound(mut write_guard) => {
+                // Read the page from disk into the buffer
+                self.fill_buffer(write_guard.deref_mut(), blknum).await?;
+                Ok(write_guard.mark_valid().into())
+            }
        }
    }
 }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -13,6 +13,7 @@ use pageserver_api::models;
 use serde::{Deserialize, Serialize};
 use std::num::NonZeroU64;
 use std::time::Duration;
+use utils::generation::Generation;

 pub mod defaults {
    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
@@ -44,7 +45,211 @@ pub mod defaults {
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 }

-/// Per-tenant configuration options
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) enum AttachmentMode {
+    /// Our generation is current as far as we know, and as far as we know we are the only attached
+    /// pageserver.  This is the "normal" attachment mode.
+    Single,
+    /// Our generation number is current as far as we know, but we are advised that another
+    /// pageserver is still attached, and therefore to avoid executing deletions.   This is
+    /// the attachment mode of a pagesever that is the destination of a migration.
+    Multi,
+    /// Our generation number is superseded, or about to be superseded.  We are advised
+    /// to avoid remote storage writes if possible, and to avoid sending billing data.  This
+    /// is the attachment mode of a pageserver that is the origin of a migration.
+    Stale,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) struct AttachedLocationConfig {
+    pub(crate) generation: Generation,
+    pub(crate) attach_mode: AttachmentMode,
+    // TODO: add a flag to override AttachmentMode's policies under
+    // disk pressure (i.e. unblock uploads under disk pressure in Stale
+    // state, unblock deletions after timeout in Multi state)
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) struct SecondaryLocationConfig {
+    /// If true, keep the local cache warm by polling remote storage
+    pub(crate) warm: bool,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) enum LocationMode {
+    Attached(AttachedLocationConfig),
+    Secondary(SecondaryLocationConfig),
+}
+
+/// Per-tenant, per-pageserver configuration.  All pageservers use the same TenantConf,
+/// but have distinct LocationConf.
+#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub(crate) struct LocationConf {
+    /// The location-specific part of the configuration, describes the operating
+    /// mode of this pageserver for this tenant.
+    pub(crate) mode: LocationMode,
+    /// The pan-cluster tenant configuration, the same on all locations
+    pub(crate) tenant_conf: TenantConfOpt,
+}
+
+impl std::fmt::Debug for LocationConf {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match &self.mode {
+            LocationMode::Attached(conf) => {
+                write!(
+                    f,
+                    "Attached {:?}, gen={:?}",
+                    conf.attach_mode, conf.generation
+                )
+            }
+            LocationMode::Secondary(conf) => {
+                write!(f, "Secondary, warm={}", conf.warm)
+            }
+        }
+    }
+}
+
+impl AttachedLocationConfig {
+    /// Consult attachment mode to determine whether we are currently permitted
+    /// to delete layers.  This is only advisory, not required for data safety.
+    /// See [`AttachmentMode`] for more context.
+    pub(crate) fn may_delete_layers_hint(&self) -> bool {
+        // TODO: add an override for disk pressure in AttachedLocationConfig,
+        // and respect it here.
+        match &self.attach_mode {
+            AttachmentMode::Single => true,
+            AttachmentMode::Multi | AttachmentMode::Stale => {
+                // In Multi mode we avoid doing deletions because some other
+                // attached pageserver might get 404 while trying to read
+                // a layer we delete which is still referenced in their metadata.
+                //
+                // In Stale mode, we avoid doing deletions because we expect
+                // that they would ultimately fail validation in the deletion
+                // queue due to our stale generation.
+                false
+            }
+        }
+    }
+
+    /// Whether we are currently hinted that it is worthwhile to upload layers.
+    /// This is only advisory, not required for data safety.
+    /// See [`AttachmentMode`] for more context.
+    pub(crate) fn may_upload_layers_hint(&self) -> bool {
+        // TODO: add an override for disk pressure in AttachedLocationConfig,
+        // and respect it here.
+        match &self.attach_mode {
+            AttachmentMode::Single | AttachmentMode::Multi => true,
+            AttachmentMode::Stale => {
+                // In Stale mode, we avoid doing uploads because we expect that
+                // our replacement pageserver will already have started its own
+                // IndexPart that will never reference layers we upload: it is
+                // wasteful.
+                false
+            }
+        }
+    }
+}
+
+impl LocationConf {
+    /// For use when loading from a legacy configuration: presence of a tenant
+    /// implies it is in AttachmentMode::Single, which used to be the only
+    /// possible state.  This function should eventually be removed.
+    pub(crate) fn attached_single(tenant_conf: TenantConfOpt, generation: Generation) -> Self {
+        Self {
+            mode: LocationMode::Attached(AttachedLocationConfig {
+                generation,
+                attach_mode: AttachmentMode::Single,
+            }),
+            tenant_conf,
+        }
+    }
+
+    /// For use when attaching/re-attaching: update the generation stored in this
+    /// structure.  If we were in a secondary state, promote to attached (posession
+    /// of a fresh generation implies this).
+    pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
+        match &mut self.mode {
+            LocationMode::Attached(attach_conf) => {
+                attach_conf.generation = generation;
+            }
+            LocationMode::Secondary(_) => {
+                // We are promoted to attached by the control plane's re-attach response
+                self.mode = LocationMode::Attached(AttachedLocationConfig {
+                    generation,
+                    attach_mode: AttachmentMode::Single,
+                })
+            }
+        }
+    }
+
+    pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result<Self> {
+        let tenant_conf = TenantConfOpt::try_from(&conf.tenant_conf)?;
+
+        fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
+            conf.generation
+                .ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
+        }
+
+        let mode = match &conf.mode {
+            models::LocationConfigMode::AttachedMulti => {
+                LocationMode::Attached(AttachedLocationConfig {
+                    generation: get_generation(conf)?,
+                    attach_mode: AttachmentMode::Multi,
+                })
+            }
+            models::LocationConfigMode::AttachedSingle => {
+                LocationMode::Attached(AttachedLocationConfig {
+                    generation: get_generation(conf)?,
+                    attach_mode: AttachmentMode::Single,
+                })
+            }
+            models::LocationConfigMode::AttachedStale => {
+                LocationMode::Attached(AttachedLocationConfig {
+                    generation: get_generation(conf)?,
+                    attach_mode: AttachmentMode::Stale,
+                })
+            }
+            models::LocationConfigMode::Secondary => {
+                anyhow::ensure!(conf.generation.is_none());
+
+                let warm = conf
+                    .secondary_conf
+                    .as_ref()
+                    .map(|c| c.warm)
+                    .unwrap_or(false);
+                LocationMode::Secondary(SecondaryLocationConfig { warm })
+            }
+            models::LocationConfigMode::Detached => {
+                // Should not have been called: API code should translate this mode
+                // into a detach rather than trying to decode it as a LocationConf
+                return Err(anyhow::anyhow!("Cannot decode a Detached configuration"));
+            }
+        };
+
+        Ok(Self { mode, tenant_conf })
+    }
+}
+
+impl Default for LocationConf {
+    // TODO: this should be removed once tenant loading can guarantee that we are never
+    // loading from a directory without a configuration.
+    // => tech debt since https://github.com/neondatabase/neon/issues/1555
+    fn default() -> Self {
+        Self {
+            mode: LocationMode::Attached(AttachedLocationConfig {
+                generation: Generation::none(),
+                attach_mode: AttachmentMode::Single,
+            }),
+            tenant_conf: TenantConfOpt::default(),
+        }
+    }
+}
+
+/// A tenant's calcuated configuration, which is the result of merging a
+/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
+///
+/// For storing and transmitting individual tenant's configuration, see
+/// TenantConfOpt.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct TenantConf {
    // Flush out an inmemory layer, if it's holding WAL older than this
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -1,9 +1,7 @@
-use std::{
-    path::{Path, PathBuf},
-    sync::Arc,
-};
+use std::sync::Arc;

 use anyhow::Context;
+use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::TenantState;
 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
@@ -33,7 +31,7 @@ use super::{
 const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;

 #[derive(Debug, thiserror::Error)]
-pub enum DeleteTenantError {
+pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

@@ -62,7 +60,7 @@ fn remote_tenant_delete_mark_path(
        .context("Failed to strip workdir prefix")
        .and_then(RemotePath::new)
        .context("tenant path")?;
-    Ok(tenant_remote_path.join(Path::new("deleted")))
+    Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
 }

 async fn create_remote_delete_mark(
@@ -148,7 +146,7 @@ async fn schedule_ordered_timeline_deletions(
    Ok(already_running_deletions)
 }

-async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> {
+async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
    // Assert timelines dir is empty.
    if !fs_ext::is_directory_empty(timelines_path).await? {
        // Display first 10 items in directory
@@ -188,20 +186,18 @@ async fn cleanup_remaining_fs_traces(
    conf: &PageServerConf,
    tenant_id: &TenantId,
 ) -> Result<(), DeleteTenantError> {
-    let rm = |p: PathBuf, is_dir: bool| async move {
+    let rm = |p: Utf8PathBuf, is_dir: bool| async move {
        if is_dir {
            tokio::fs::remove_dir(&p).await
        } else {
            tokio::fs::remove_file(&p).await
        }
        .or_else(fs_ext::ignore_not_found)
-        .with_context(|| {
-            let to_display = p.display();
-            format!("failed to delete {to_display}")
-        })
+        .with_context(|| format!("failed to delete {p}"))
    };

    rm(conf.tenant_config_path(tenant_id), false).await?;
+    rm(conf.tenant_location_config_path(tenant_id), false).await?;

    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
        Err(anyhow::anyhow!(
@@ -380,7 +376,7 @@ impl DeleteTenantFlow {
        Ok(())
    }

-    pub async fn should_resume_deletion(
+    pub(crate) async fn should_resume_deletion(
        conf: &'static PageServerConf,
        remote_storage: Option<&GenericRemoteStorage>,
        tenant: &Tenant,
@@ -436,7 +432,7 @@ impl DeleteTenantFlow {
        // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
        if timelines_path.exists() {
-            tenant.load(init_order, ctx).await.context("load")?;
+            tenant.load(init_order, None, ctx).await.context("load")?;
        }

        Self::background(
@@ -462,7 +458,10 @@ impl DeleteTenantFlow {
            .await
            .expect("cant be stopping or broken");

-        tenant.attach(ctx).await.context("attach")?;
+        tenant
+            .attach(ctx, super::AttachMarkerMode::Expect)
+            .await
+            .context("attach")?;

        Self::background(
            guard,
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -6,11 +6,11 @@ use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
+use camino::Utf8PathBuf;
 use std::cmp::min;
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
-use std::path::PathBuf;
 use std::sync::atomic::AtomicU64;
 use tracing::*;
 use utils::id::{TenantId, TimelineId};
@@ -40,7 +40,9 @@ impl EphemeralFile {

        let filename = conf
            .timeline_path(&tenant_id, &timeline_id)
-            .join(PathBuf::from(format!("ephemeral-{filename_disambiguator}")));
+            .join(Utf8PathBuf::from(format!(
+                "ephemeral-{filename_disambiguator}"
+            )));

        let file = VirtualFile::open_with_options(
            &filename,
@@ -70,38 +72,32 @@ impl EphemeralFile {
        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
        if flushed_blknums.contains(&(blknum as u64)) {
            let cache = page_cache::get();
-            loop {
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
-                    .await
-                    .map_err(|e| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            // order path before error because error is anyhow::Error => might have many contexts
-                            format!(
-                                "ephemeral file: read immutable page #{}: {}: {:#}",
-                                blknum,
-                                self.file.path.display(),
-                                e,
-                            ),
-                        )
-                    })? {
-                    page_cache::ReadBufResult::Found(guard) => {
-                        return Ok(BlockLease::PageReadGuard(guard))
-                    }
-                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                        let buf: &mut [u8] = write_guard.deref_mut();
-                        debug_assert_eq!(buf.len(), PAGE_SZ);
-                        self.file
-                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
-                            .await?;
-                        write_guard.mark_valid();
-
-                        // Swap for read lock
-                        continue;
-                    }
-                };
-            }
+            match cache
+                .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
+                .await
+                .map_err(|e| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        // order path before error because error is anyhow::Error => might have many contexts
+                        format!(
+                            "ephemeral file: read immutable page #{}: {}: {:#}",
+                            blknum, self.file.path, e,
+                        ),
+                    )
+                })? {
+                page_cache::ReadBufResult::Found(guard) => {
+                    return Ok(BlockLease::PageReadGuard(guard))
+                }
+                page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                    let buf: &mut [u8] = write_guard.deref_mut();
+                    debug_assert_eq!(buf.len(), PAGE_SZ);
+                    self.file
+                        .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
+                        .await?;
+                    let read_guard = write_guard.mark_valid();
+                    return Ok(BlockLease::PageReadGuard(read_guard));
+                }
+            };
        } else {
            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
@@ -171,7 +167,7 @@ impl EphemeralFile {
                                        let buf: &mut [u8] = write_guard.deref_mut();
                                        debug_assert_eq!(buf.len(), PAGE_SZ);
                                        buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
-                                        write_guard.mark_valid();
+                                        let _ = write_guard.mark_valid();
                                        // pre-warm successful
                                    }
                                    Err(e) => {
@@ -195,7 +191,7 @@ impl EphemeralFile {
                                        "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
                                        self.blknum,
                                        e,
-                                        self.ephemeral_file.file.path.display(),
+                                        self.ephemeral_file.file.path,
                                    ),
                                ));
                            }
@@ -258,8 +254,7 @@ impl Drop for EphemeralFile {
                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
                error!(
                    "could not remove ephemeral file '{}': {}",
-                    self.file.path.display(),
-                    e
+                    self.file.path, e
                );
            }
        }
@@ -359,8 +354,7 @@ mod tests {
        }

        // Test a large blob that spans multiple pages
-        let mut large_data = Vec::new();
-        large_data.resize(20000, 0);
+        let mut large_data = vec![0; 20000];
        thread_rng().fill_bytes(&mut large_data);
        let pos_large = file.write_blob(&large_data, &ctx).await?;
        let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
--- a/Show More
+++ b/Show More