pageserver: remove unused Tenant::deletion_progress

deflake: test_live_reconfig_get_evictions_low_residence_... (#5926)
- disable extra tenant - disable compaction which could try to repartition while we assert Split from #5108.
2026-05-19 14:10:37 +00:00 · 2023-11-27 16:49:56 +00:00 · 2023-11-27 15:20:54 +02:00 · 2023-11-27 12:50:19 +00:00 · 2023-11-27 12:10:23 +00:00 · 2023-11-27 09:31:20 +00:00
216 changed files with 11202 additions and 9032 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -1,17 +1,3 @@
-# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
-# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
-# optimizations enabled by "opt-level=1" don't affect debuggability too much.
-#
-# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
-#
-[profile.dev.package."*"]
-# Set the default for dependencies in Development mode.
-opt-level = 3
-
-[profile.dev]
-# Turn on a small amount of optimization in Development mode.
-opt-level = 1
-
 [build]
 # This is only present for local builds, as it will be overridden
 # by the RUSTDOCFLAGS env var in CI.
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -22,5 +22,11 @@ platforms = [
    # "x86_64-pc-windows-msvc",
 ]

+[final-excludes]
+# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+# it is built primarily in the separate repo neondatabase/autoscaling and thus is excluded
+# from depending on workspace-hack because most of the dependencies are not used.
+workspace-members = ["vm_monitor"]
+
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -17,8 +17,9 @@ assignees: ''
 ## Implementation ideas


-## Tasks
- [ ]
+```[tasklist]
+### Tasks
+```


 ## Other related tasks and Epics
--- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md
+++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
@@ -3,7 +3,7 @@
 **NB: this PR must be merged only by 'Create a merge commit'!**

 ### Checklist when preparing for release
- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow)
+- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b)
 - [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers?
 - [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?

--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,5 +1,7 @@
 self-hosted-runner:
  labels:
+    - arm64
+    - dev
    - gen3
    - large
    - small
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -172,10 +172,10 @@ jobs:
      # https://github.com/EmbarkStudios/cargo-deny
      - name: Check rust licenses/bans/advisories/sources
        if: ${{ !cancelled() }}
-        run: cargo deny check
+        run: cargo deny check --hide-inclusion-graph

  build-neon:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -187,6 +187,7 @@ jobs:
    env:
      BUILD_TYPE: ${{ matrix.build_type }}
      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}

    steps:
      - name: Fix git ownership
@@ -585,10 +586,13 @@ jobs:
        id: upload-coverage-report-new
        env:
          BUCKET: neon-github-public-dev
+          # A differential coverage report is available only for PRs.
+          # (i.e. for pushes into main/release branches we have a regular coverage report)
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
        run: |
-          BASELINE="$(git merge-base HEAD origin/main)"
          CURRENT="${COMMIT_SHA}"
+          BASELINE="$(git merge-base $BASE_SHA $CURRENT)"

          cp /tmp/coverage/report/lcov.info ./${CURRENT}.info

@@ -723,6 +727,7 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -847,7 +852,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.18.5
+      VM_BUILDER_VERSION: v0.19.0

    steps:
      - name: Checkout
@@ -869,8 +874,7 @@ jobs:
      - name: Build vm image
        run: |
          ./vm-builder \
-            -enable-file-cache \
-            -cgroup-uid=postgres \
+            -spec=vm-image-spec.yaml \
            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -21,7 +21,10 @@ env:

 jobs:
  check-macos-build:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    timeout-minutes: 90
    runs-on: macos-latest

@@ -112,8 +115,182 @@ jobs:
      - name: Check that no warnings are produced
        run: ./run_clippy.sh

+  check-linux-arm-build:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    env:
+      # Use release build only, to have less debug info around
+      # Hence keeping target/ (and general cache size) smaller
+      BUILD_TYPE: release
+      CARGO_FEATURES: --features testing
+      CARGO_FLAGS: --locked --release
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      - name: Set env variables
+        run: |
+          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+
+      - name: Run cargo test
+        run: |
+          cargo test $CARGO_FLAGS $CARGO_FEATURES
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
+  check-codestyle-rust-arm:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      # Some of our rust modules use FFI and need those to be checked
+      - name: Get postgres headers
+        run: make postgres-headers -j$(nproc)
+
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+      - name: Run cargo clippy (debug)
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
+
+      - name: Check documentation generation
+        run: cargo doc --workspace --no-deps --document-private-items
+        env:
+            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
+
+      # Use `${{ !cancelled() }}` to run quick tests after the longer clippy run
+      - name: Check formatting
+        if: ${{ !cancelled() }}
+        run: cargo fmt --all -- --check
+
+      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
+      - name: Check rust dependencies
+        if: ${{ !cancelled() }}
+        run: |
+          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
+          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
+
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() }}
+        run: cargo deny check
+
  gather-rust-build-stats:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,7 +2,7 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 7 * * 5'
+    - cron: '0 6 * * 1'
  workflow_dispatch:

 jobs:
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -9,6 +9,24 @@ refactoring, additional comments, and so forth. Let's try to raise the
 bar, and clean things up as we go. Try to leave code in a better shape
 than it was before.

+## Pre-commit hook
+
+We have a sample pre-commit hook in `pre-commit.py`.
+To set it up, run:
+
+```bash
+ln -s ../../pre-commit.py .git/hooks/pre-commit
+```
+
+This will run the following checks on staged files before each commit:
+- `rustfmt`
+- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
+
+There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
+and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
+
+If you want to skip the hook, run `git commit` with `--no-verify` option.
+
 ## Submitting changes

 1. Get at least one +1 on your PR before you push.
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,7 +4,6 @@ members = [
    "compute_tools",
    "control_plane",
    "pageserver",
-    "pageserver/compaction",
    "pageserver/ctl",
    "proxy",
    "safekeeper",
@@ -37,7 +36,8 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
-async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
+arc-swap = "1.6"
+async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 azure_core = "0.16"
 azure_identity = "0.16"
 azure_storage = "0.16"
@@ -48,6 +48,7 @@ async-trait = "0.1"
 aws-config = { version = "0.56", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "0.29"
 aws-smithy-http = "0.56"
+aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
 aws-credential-types = "0.56"
 aws-types = "0.56"
 axum = { version = "0.6.20", features = ["ws"] }
@@ -66,7 +67,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
-dashmap = "5.5.0"
+dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -82,7 +83,7 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
-http-types = "2"
+http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
@@ -125,6 +126,7 @@ sentry = { version = "0.31", default-features = false, features = ["backtrace",
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "2.0"
+serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
@@ -134,6 +136,7 @@ strum_macros = "0.24"
 svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
+task-local-extensions = "0.1.4"
 test-context = "0.1"
 thiserror = "1.0"
 tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
@@ -162,11 +165,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -176,7 +179,6 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
-pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
@@ -204,7 +206,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ################# Binary contents sections

--- a/5
+++ b/5
@@ -27,6 +27,7 @@ RUN set -e \
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
+ARG BUILD_TAG

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -78,9 +79,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/4
+++ b/4
@@ -72,6 +72,10 @@ neon: postgres-headers walproposer-lib
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	+@echo "Configuring Postgres $* build"
+	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
+		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
+		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
+		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
 	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -38,3 +38,4 @@ toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.12.4"
+bytes = "1.0"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -31,7 +31,7 @@
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
 //!             -b /usr/local/bin/postgres \
-//!             -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
+//!             -r http://pg-ext-s3-gateway
 //! ```
 //!
 use std::collections::HashMap;
@@ -51,7 +51,7 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
-use compute_tools::extension_server::{get_pg_version, init_remote_storage};
+use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
@@ -60,7 +60,7 @@ use compute_tools::spec::*;

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
-const BUILD_TAG_DEFAULT: &str = "5670669815";
+const BUILD_TAG_DEFAULT: &str = "latest";

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -74,10 +74,18 @@ fn main() -> Result<()> {
    let pgbin_default = String::from("postgres");
    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);

-    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
-    let ext_remote_storage = remote_ext_config.map(|x| {
-        init_remote_storage(x).expect("cannot initialize remote extension storage from config")
-    });
+    let ext_remote_storage = matches
+        .get_one::<String>("remote-ext-config")
+        // Compatibility hack: if the control plane specified any remote-ext-config
+        // use the default value for extension storage proxy gateway.
+        // Remove this once the control plane is updated to pass the gateway URL
+        .map(|conf| {
+            if conf.starts_with("http") {
+                conf.trim_end_matches('/')
+            } else {
+                "http://pg-ext-s3-gateway"
+            }
+        });

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -198,7 +206,7 @@ fn main() -> Result<()> {
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
-        ext_remote_storage,
+        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
    };
@@ -479,13 +487,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            // DEPRECATED, NO LONGER DOES ANYTHING.
-            // See https://github.com/neondatabase/cloud/issues/7516
-            Arg::new("file-cache-on-disk")
-                .long("file-cache-on-disk")
-                .action(clap::ArgAction::SetTrue),
-        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -25,7 +25,7 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, RemotePath};

 use crate::checker::create_availability_check_data;
 use crate::pg_helpers::*;
@@ -59,8 +59,8 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
-    ///  the S3 bucket that we search for extensions in
-    pub ext_remote_storage: Option<GenericRemoteStorage>,
+    /// the address of extension storage proxy gateway
+    pub ext_remote_storage: Option<String>,
    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
    pub build_tag: String,
@@ -698,6 +698,7 @@ impl ComputeNode {
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
        handle_grants(spec, &mut client, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
+        handle_extension_neon(&mut client)?;
        create_availability_check_data(&mut client)?;

        // 'Close' connection
@@ -710,8 +711,12 @@ impl ComputeNode {
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
    #[instrument(skip_all)]
-    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
-        client.simple_query("SELECT pg_reload_conf()")?;
+    fn pg_reload_conf(&self) -> Result<()> {
+        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
+        Command::new(pgctl_bin)
+            .args(["reload", "-D", &self.pgdata])
+            .output()
+            .expect("cannot run pg_ctl process");
        Ok(())
    }

@@ -724,9 +729,9 @@ impl ComputeNode {
        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
-        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
@@ -738,6 +743,7 @@ impl ComputeNode {
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
            handle_grants(&spec, &mut client, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
+            handle_extension_neon(&mut client)?;
        }

        // 'Close' connection
@@ -951,12 +957,12 @@ LIMIT 100",
        real_ext_name: String,
        ext_path: RemotePath,
    ) -> Result<u64, DownloadError> {
-        let remote_storage = self
-            .ext_remote_storage
-            .as_ref()
-            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                "Remote extensions storage is not configured",
-            )))?;
+        let ext_remote_storage =
+            self.ext_remote_storage
+                .as_ref()
+                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                    "Remote extensions storage is not configured",
+                )))?;

        let ext_archive_name = ext_path.object_name().expect("bad path");

@@ -1012,7 +1018,7 @@ LIMIT 100",
        let download_size = extension_server::download_extension(
            &real_ext_name,
            &ext_path,
-            remote_storage,
+            ext_remote_storage,
            &self.pgbin,
        )
        .await
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -71,18 +71,16 @@ More specifically, here is an example ext_index.json
    }
 }
 */
-use anyhow::Context;
 use anyhow::{self, Result};
+use anyhow::{bail, Context};
+use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
 use regex::Regex;
 use remote_storage::*;
-use serde_json;
-use std::io::Read;
-use std::num::{NonZeroU32, NonZeroUsize};
+use reqwest::StatusCode;
 use std::path::Path;
 use std::str;
 use tar::Archive;
-use tokio::io::AsyncReadExt;
 use tracing::info;
 use tracing::log::warn;
 use zstd::stream::read::Decoder;
@@ -133,67 +131,36 @@ fn parse_pg_version(human_version: &str) -> &str {
    panic!("Unsuported postgres version {human_version}");
 }

-#[cfg(test)]
-mod tests {
-    use super::parse_pg_version;
-
-    #[test]
-    fn test_parse_pg_version() {
-        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
-        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
-        assert_eq!(
-            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
-            "v15"
-        );
-
-        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
-        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
-        assert_eq!(
-            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
-            "v14"
-        );
-
-        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_parse_pg_unsupported_version() {
-        parse_pg_version("PostgreSQL 13.14");
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_parse_pg_incorrect_version_format() {
-        parse_pg_version("PostgreSQL 14");
-    }
-}
-
 // download the archive for a given extension,
 // unzip it, and place files in the appropriate locations (share/lib)
 pub async fn download_extension(
    ext_name: &str,
    ext_path: &RemotePath,
-    remote_storage: &GenericRemoteStorage,
+    ext_remote_storage: &str,
    pgbin: &str,
 ) -> Result<u64> {
    info!("Download extension {:?} from {:?}", ext_name, ext_path);
-    let mut download = remote_storage.download(ext_path).await?;
-    let mut download_buffer = Vec::new();
-    download
-        .download_stream
-        .read_to_end(&mut download_buffer)
-        .await?;
+
+    // TODO add retry logic
+    let download_buffer =
+        match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
+            Ok(buffer) => buffer,
+            Err(error_message) => {
+                return Err(anyhow::anyhow!(
+                    "error downloading extension {:?}: {:?}",
+                    ext_name,
+                    error_message
+                ));
+            }
+        };
+
    let download_size = download_buffer.len() as u64;
+    info!("Download size {:?}", download_size);
    // it's unclear whether it is more performant to decompress into memory or not
    // TODO: decompressing into memory can be avoided
-    let mut decoder = Decoder::new(download_buffer.as_slice())?;
-    let mut decompress_buffer = Vec::new();
-    decoder.read_to_end(&mut decompress_buffer)?;
-    let mut archive = Archive::new(decompress_buffer.as_slice());
+    let decoder = Decoder::new(download_buffer.as_ref())?;
+    let mut archive = Archive::new(decoder);
+
    let unzip_dest = pgbin
        .strip_suffix("/bin/postgres")
        .expect("bad pgbin")
@@ -261,29 +228,69 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
    }
 }

-// This function initializes the necessary structs to use remote storage
-pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
-    #[derive(Debug, serde::Deserialize)]
-    struct RemoteExtJson {
-        bucket: String,
-        region: String,
-        endpoint: Option<String>,
-        prefix: Option<String>,
-    }
-    let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
+// Send a request to the extension storage proxy, e.g.
+// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
+// using HTTP GET,
+// and return the response body as bytes.
+//
+async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
+    let uri = format!("{}/{}", ext_remote_storage, ext_path);

-    let config = S3Config {
-        bucket_name: remote_ext_json.bucket,
-        bucket_region: remote_ext_json.region,
-        prefix_in_bucket: remote_ext_json.prefix,
-        endpoint: remote_ext_json.endpoint,
-        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
-        max_keys_per_list_response: None,
-    };
-    let config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
-        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
-        storage: RemoteStorageKind::AwsS3(config),
-    };
-    GenericRemoteStorage::from_config(&config)
+    info!("Download extension {:?} from uri {:?}", ext_path, uri);
+
+    let resp = reqwest::get(uri).await?;
+
+    match resp.status() {
+        StatusCode::OK => match resp.bytes().await {
+            Ok(resp) => {
+                info!("Download extension {:?} completed successfully", ext_path);
+                Ok(resp)
+            }
+            Err(e) => bail!("could not deserialize remote extension response: {}", e),
+        },
+        StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"),
+        _ => bail!(
+            "unexpected remote extension response status code: {}",
+            resp.status()
+        ),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::parse_pg_version;
+
+    #[test]
+    fn test_parse_pg_version() {
+        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
+        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
+            "v15"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
+        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
+            "v14"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_unsupported_version() {
+        parse_pg_version("PostgreSQL 13.14");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_incorrect_version_format() {
+        parse_pg_version("PostgreSQL 14");
+    }
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -123,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

-        // download extension files from S3 on demand
+        // download extension files from remote extension storage on demand
        (&Method::POST, route) if route.starts_with("/extension_server/") => {
            info!("serving {:?} POST request", route);
            info!("req.uri {:?}", req.uri());
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -1,7 +1,7 @@
-//!
 //! Various tools and helpers to handle cluster / compute node (Postgres)
 //! configuration.
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -674,3 +674,30 @@ pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()>

    Ok(())
 }
+
+/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
+#[instrument(skip_all)]
+pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
+    info!("handle extension neon");
+
+    let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
+    client.simple_query(query)?;
+
+    query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
+    info!("create neon extension with query: {}", query);
+    client.simple_query(query)?;
+
+    query = "ALTER EXTENSION neon SET SCHEMA neon";
+    info!("alter neon extension schema with query: {}", query);
+    client.simple_query(query)?;
+
+    // this will be a no-op if extension is already up to date,
+    // which may happen in two cases:
+    // - extension was just installed
+    // - extension was already installed and is up to date
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension schema with query: {}", query);
+    client.simple_query(query)?;
+
+    Ok(())
+}
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -2,7 +2,6 @@ use crate::{background_process, local_env::LocalEnv};
 use anyhow::anyhow;
 use camino::Utf8PathBuf;
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use std::{path::PathBuf, process::Child};
 use utils::id::{NodeId, TenantId};

@@ -10,14 +9,13 @@ pub struct AttachmentService {
    env: LocalEnv,
    listen: String,
    path: PathBuf,
+    client: reqwest::blocking::Client,
 }

 const COMMAND: &str = "attachment_service";

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    pub node_id: Option<NodeId>,
 }
@@ -27,6 +25,16 @@ pub struct AttachHookResponse {
    pub gen: Option<u32>,
 }

+#[derive(Serialize, Deserialize)]
+pub struct InspectRequest {
+    pub tenant_id: TenantId,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct InspectResponse {
+    pub attachment: Option<(u32, NodeId)>,
+}
+
 impl AttachmentService {
    pub fn from_env(env: &LocalEnv) -> Self {
        let path = env.base_data_dir.join("attachments.json");
@@ -45,6 +53,9 @@ impl AttachmentService {
            env: env.clone(),
            path,
            listen,
+            client: reqwest::blocking::ClientBuilder::new()
+                .build()
+                .expect("Failed to construct http client"),
        }
    }

@@ -87,16 +98,13 @@ impl AttachmentService {
            .unwrap()
            .join("attach-hook")
            .unwrap();
-        let client = reqwest::blocking::ClientBuilder::new()
-            .build()
-            .expect("Failed to construct http client");

        let request = AttachHookRequest {
            tenant_id,
            node_id: Some(pageserver_id),
        };

-        let response = client.post(url).json(&request).send()?;
+        let response = self.client.post(url).json(&request).send()?;
        if response.status() != StatusCode::OK {
            return Err(anyhow!("Unexpected status {}", response.status()));
        }
@@ -104,4 +112,26 @@ impl AttachmentService {
        let response = response.json::<AttachHookResponse>()?;
        Ok(response.gen)
    }
+
+    pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
+        use hyper::StatusCode;
+
+        let url = self
+            .env
+            .control_plane_api
+            .clone()
+            .unwrap()
+            .join("inspect")
+            .unwrap();
+
+        let request = InspectRequest { tenant_id };
+
+        let response = self.client.post(url).json(&request).send()?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!("Unexpected status {}", response.status()));
+        }
+
+        let response = response.json::<InspectResponse>()?;
+        Ok(response.attachment)
+    }
 }
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -262,7 +262,7 @@ where
    P: Into<Utf8PathBuf>,
 {
    let path: Utf8PathBuf = path.into();
-    // SAFETY
+    // SAFETY:
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
    // Long answer:  https://github.com/rust-lang/rust/issues/39575
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -32,7 +32,9 @@ use pageserver_api::control_api::{
    ValidateResponseTenant,
 };

-use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
+use control_plane::attachment_service::{
+    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
+};

 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
@@ -255,12 +257,28 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    )
 }

+async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let inspect_req = json_request::<InspectRequest>(&mut req).await?;
+
+    let state = get_state(&req).inner.clone();
+    let locked = state.write().await;
+    let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
+
+    json_response(
+        StatusCode::OK,
+        InspectResponse {
+            attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
+        },
+    )
+}
+
 fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
        .post("/re-attach", |r| request_span(r, handle_re_attach))
        .post("/validate", |r| request_span(r, handle_validate))
        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
+        .post("/inspect", |r| request_span(r, handle_inspect))
 }

 #[tokio::main]
@@ -268,6 +286,7 @@ async fn main() -> anyhow::Result<()> {
    logging::init(
        LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
+        logging::Output::Stdout,
    )?;

    let args = Cli::parse();
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -11,13 +11,14 @@ use compute_api::spec::ComputeMode;
 use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::LocalEnv;
-use control_plane::pageserver::PageServerNode;
+use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
+use control_plane::tenant_migration::migrate_tenant;
 use control_plane::{broker, local_env};
 use pageserver_api::models::TimelineInfo;
 use pageserver_api::{
-    DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
-    DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
+    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
+    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
 use postgres_backend::AuthType;
 use safekeeper_api::{
@@ -46,8 +47,8 @@ const DEFAULT_PG_VERSION: &str = "15";

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";

-fn default_conf() -> String {
-    format!(
+fn default_conf(num_pageservers: u16) -> String {
+    let mut template = format!(
        r#"
 # Default built-in configuration, defined in main.rs
 control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
@@ -55,21 +56,33 @@ control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
 [broker]
 listen_addr = '{DEFAULT_BROKER_ADDR}'

-[[pageservers]]
-id = {DEFAULT_PAGESERVER_ID}
-listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
-listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
-pg_auth_type = '{trust_auth}'
-http_auth_type = '{trust_auth}'
-
 [[safekeepers]]
 id = {DEFAULT_SAFEKEEPER_ID}
 pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
 http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}

 "#,
-        trust_auth = AuthType::Trust,
-    )
+    );
+
+    for i in 0..num_pageservers {
+        let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
+        let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
+        let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
+
+        template += &format!(
+            r#"
+[[pageservers]]
+id = {pageserver_id}
+listen_pg_addr = '127.0.0.1:{pg_port}'
+listen_http_addr = '127.0.0.1:{http_port}'
+pg_auth_type = '{trust_auth}'
+http_auth_type = '{trust_auth}'
+"#,
+            trust_auth = AuthType::Trust,
+        )
+    }
+
+    template
 }

 ///
@@ -295,6 +308,9 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }

 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
+    let num_pageservers = init_match
+        .get_one::<u16>("num-pageservers")
+        .expect("num-pageservers arg has a default");
    // Create config file
    let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
        // load and parse the file
@@ -306,7 +322,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
        })?
    } else {
        // Built-in default config
-        default_conf()
+        default_conf(*num_pageservers)
    };

    let pg_version = init_match
@@ -320,6 +336,9 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
    env.init(pg_version, force)
        .context("Failed to initialize neon repository")?;

+    // Create remote storage location for default LocalFs remote storage
+    std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
+
    // Initialize pageserver, create initial tenant and timeline.
    for ps_conf in &env.pageservers {
        PageServerNode::from_env(&env, ps_conf)
@@ -433,6 +452,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
+        Some(("migrate", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+            let new_pageserver = get_pageserver(env, matches)?;
+            let new_pageserver_id = new_pageserver.conf.id;
+
+            migrate_tenant(env, tenant_id, new_pageserver)?;
+            println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
+        }
+
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
    }
@@ -459,8 +487,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

-            let timeline_info =
-                pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
+            let new_timeline_id_opt = parse_timeline_id(create_match)?;
+
+            let timeline_info = pageserver.timeline_create(
+                tenant_id,
+                new_timeline_id_opt,
+                None,
+                None,
+                Some(pg_version),
+            )?;
            let new_timeline_id = timeline_info.timeline_id;

            let last_record_lsn = timeline_info.last_record_lsn;
@@ -867,20 +902,20 @@ fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Res
    }
 }

+fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
+    let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
+        NodeId(id_str.parse().context("while parsing pageserver id")?)
+    } else {
+        DEFAULT_PAGESERVER_ID
+    };
+
+    Ok(PageServerNode::from_env(
+        env,
+        env.get_pageserver_conf(node_id)?,
+    ))
+}
+
 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
-        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
-            NodeId(id_str.parse().context("while parsing pageserver id")?)
-        } else {
-            DEFAULT_PAGESERVER_ID
-        };
-
-        Ok(PageServerNode::from_env(
-            env,
-            env.get_pageserver_conf(node_id)?,
-        ))
-    }
-
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
            if let Err(e) = get_pageserver(env, subcommand_args)?
@@ -917,6 +952,20 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
            }
        }

+        Some(("migrate", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            //TODO what shutdown strategy should we use here?
+            if let Err(e) = pageserver.stop(false) {
+                eprintln!("pageserver stop failed: {}", e);
+                exit(1);
+            }
+
+            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
+                eprintln!("pageserver start failed: {e}");
+                exit(1);
+            }
+        }
+
        Some(("status", subcommand_args)) => {
            match get_pageserver(env, subcommand_args)?.check_status() {
                Ok(_) => println!("Page server is up and running"),
@@ -1203,7 +1252,7 @@ fn cli() -> Command {
    let remote_ext_config_args = Arg::new("remote-ext-config")
        .long("remote-ext-config")
        .num_args(1)
-        .help("Configure the S3 bucket that we search for extensions in.")
+        .help("Configure the remote extensions storage proxy gateway to request for extensions.")
        .required(false);

    let lsn_arg = Arg::new("lsn")
@@ -1224,6 +1273,13 @@ fn cli() -> Command {
        .help("Force initialization even if the repository is not empty")
        .required(false);

+    let num_pageservers_arg = Arg::new("num-pageservers")
+        .value_parser(value_parser!(u16))
+        .long("num-pageservers")
+        .help("How many pageservers to create (default 1)")
+        .required(false)
+        .default_value("1");
+
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1231,6 +1287,7 @@ fn cli() -> Command {
            Command::new("init")
                .about("Initialize a new Neon repository, preparing configs for services to start with")
                .arg(pageserver_config_args.clone())
+                .arg(num_pageservers_arg.clone())
                .arg(
                    Arg::new("config")
                        .long("config")
@@ -1258,6 +1315,7 @@ fn cli() -> Command {
            .subcommand(Command::new("create")
                .about("Create a new blank timeline")
                .arg(tenant_id_arg.clone())
+                .arg(timeline_id_arg.clone())
                .arg(branch_name_arg.clone())
                .arg(pg_version_arg.clone())
            )
@@ -1301,6 +1359,10 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
+            .subcommand(Command::new("migrate")
+                .about("Migrate a tenant from one pageserver to another")
+                .arg(tenant_id_arg.clone())
+                .arg(pageserver_id_arg.clone()))
        )
        .subcommand(
            Command::new("pageserver")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -45,8 +45,8 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
+use compute_api::spec::RemoteExtSpec;
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
@@ -57,13 +57,10 @@ use compute_api::responses::{ComputeState, ComputeStatus};
 use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
-#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct EndpointConf {
    endpoint_id: String,
-    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
-    #[serde_as(as = "DisplayFromStr")]
    timeline_id: TimelineId,
    mode: ComputeMode,
    pg_port: u16,
@@ -480,6 +477,18 @@ impl Endpoint {
            }
        }

+        // check for file remote_extensions_spec.json
+        // if it is present, read it and pass to compute_ctl
+        let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json");
+        let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path);
+        let remote_extensions: Option<RemoteExtSpec>;
+
+        if let Ok(spec_file) = remote_extensions_spec {
+            remote_extensions = serde_json::from_reader(spec_file).ok();
+        } else {
+            remote_extensions = None;
+        };
+
        // Create spec file
        let spec = ComputeSpec {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
@@ -501,7 +510,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            remote_extensions: None,
+            remote_extensions,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -1,11 +1,10 @@
-//
-// Local control plane.
-//
-// Can start, configure and stop postgres instances running as a local processes.
-//
-// Intended to be used in integration tests and in CLI tools for
-// local installations.
-//
+//! Local control plane.
+//!
+//! Can start, configure and stop postgres instances running as local processes.
+//!
+//! Intended to be used in integration tests and in CLI tools for
+//! local installations.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod attachment_service;
 mod background_process;
@@ -15,3 +14,4 @@ pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
+pub mod tenant_migration;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -8,7 +8,6 @@ use anyhow::{bail, ensure, Context};
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
 use std::env;
 use std::fs;
@@ -33,7 +32,6 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
-#[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
@@ -59,7 +57,6 @@ pub struct LocalEnv {
    // Default tenant ID to use with the 'neon_local' command line utility, when
    // --tenant_id is not explicitly specified.
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub default_tenant_id: Option<TenantId>,

    // used to issue tokens during e.g pg start
@@ -84,7 +81,6 @@ pub struct LocalEnv {
    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
-    #[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
    branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -15,7 +15,10 @@ use std::{io, result};

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use pageserver_api::models::{self, TenantInfo, TimelineInfo};
+use pageserver_api::models::{
+    self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
+};
+use pageserver_api::shard::TenantShardId;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use reqwest::blocking::{Client, RequestBuilder, Response};
@@ -31,6 +34,9 @@ use utils::{
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

+/// Directory within .neon which will be used by default for LocalFs remote storage.
+pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
+
 #[derive(Error, Debug)]
 pub enum PageserverHttpError {
    #[error("Reqwest error: {0}")]
@@ -98,8 +104,10 @@ impl PageServerNode {
        }
    }

-    // pageserver conf overrides defined by neon_local configuration.
-    fn neon_local_overrides(&self) -> Vec<String> {
+    /// Merge overrides provided by the user on the command line with our default overrides derived from neon_local configuration.
+    ///
+    /// These all end up on the command line of the `pageserver` binary.
+    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
        let id = format!("id={}", self.conf.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
@@ -132,12 +140,25 @@ impl PageServerNode {
            ));
        }

+        if !cli_overrides
+            .iter()
+            .any(|c| c.starts_with("remote_storage"))
+        {
+            overrides.push(format!(
+                "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
+            ));
+        }
+
        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
        {
            // Keys are generated in the toplevel repo dir, pageservers' workdirs
            // are one level below that, so refer to keys with ../
            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
        }
+
+        // Apply the user-provided overrides
+        overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
+
        overrides
    }

@@ -203,9 +224,6 @@ impl PageServerNode {
    }

    fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
-        let mut overrides = self.neon_local_overrides();
-        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
-
        let datadir = self.repo_path();
        print!(
            "Starting pageserver node {} at '{}' in {:?}",
@@ -248,8 +266,7 @@ impl PageServerNode {
    ) -> Vec<Cow<'a, str>> {
        let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];

-        let mut overrides = self.neon_local_overrides();
-        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
+        let overrides = self.neon_local_overrides(config_overrides);
        for config_override in overrides {
            args.push(Cow::Borrowed("-c"));
            args.push(Cow::Owned(config_override));
@@ -345,11 +362,6 @@ impl PageServerNode {
                .remove("compaction_threshold")
                .map(|x| x.parse::<usize>())
                .transpose()?,
-            compaction_algorithm: settings
-                .remove("compaction_algorithm")
-                .map(serde_json::from_str)
-                .transpose()
-                .context("Failed to parse 'compaction_algorithm' json")?,
            gc_horizon: settings
                .remove("gc_horizon")
                .map(|x| x.parse::<u64>())
@@ -397,7 +409,7 @@ impl PageServerNode {
        };

        let request = models::TenantCreateRequest {
-            new_tenant_id,
+            new_tenant_id: TenantShardId::unsharded(new_tenant_id),
            generation,
            config,
        };
@@ -445,11 +457,6 @@ impl PageServerNode {
                    .map(|x| x.parse::<usize>())
                    .transpose()
                    .context("Failed to parse 'compaction_threshold' as an integer")?,
-                compaction_algorithm: settings
-                    .remove("compactin_algorithm")
-                    .map(serde_json::from_str)
-                    .transpose()
-                    .context("Failed to parse 'compaction_algorithm' json")?,
                gc_horizon: settings
                    .remove("gc_horizon")
                    .map(|x| x.parse::<u64>())
@@ -511,6 +518,27 @@ impl PageServerNode {
        Ok(())
    }

+    pub fn location_config(
+        &self,
+        tenant_id: TenantId,
+        config: LocationConfig,
+    ) -> anyhow::Result<()> {
+        let req_body = TenantLocationConfigRequest { tenant_id, config };
+
+        self.http_request(
+            Method::PUT,
+            format!(
+                "{}/tenant/{}/location_config",
+                self.http_base_url, tenant_id
+            ),
+        )?
+        .json(&req_body)
+        .send()?
+        .error_from_body()?;
+
+        Ok(())
+    }
+
    pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
        let timeline_infos: Vec<TimelineInfo> = self
            .http_request(
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -0,0 +1,202 @@
+//!
+//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
+//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
+//! point to the new pageserver.
+//!
+use crate::local_env::LocalEnv;
+use crate::{
+    attachment_service::AttachmentService, endpoint::ComputeControlPlane,
+    pageserver::PageServerNode,
+};
+use pageserver_api::models::{
+    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
+};
+use std::collections::HashMap;
+use std::time::Duration;
+use utils::{
+    generation::Generation,
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+/// Ask `pageserver` for the timelines of `tenant_id` and return a map from
+/// each timeline's id to its `last_record_lsn`.
+fn get_lsns(
+    tenant_id: TenantId,
+    pageserver: &PageServerNode,
+) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
+    let mut lsn_by_timeline = HashMap::new();
+    for info in pageserver.timeline_list(&tenant_id)? {
+        lsn_by_timeline.insert(info.timeline_id, info.last_record_lsn);
+    }
+    Ok(lsn_by_timeline)
+}
+
+/// Block until every timeline in `baseline` is visible on `pageserver` with an
+/// LSN that is at least its baseline value, polling every 500ms.
+///
+/// Failures to list the timelines are printed and retried; there is no timeout.
+fn await_lsn(
+    tenant_id: TenantId,
+    pageserver: &PageServerNode,
+    baseline: HashMap<TimelineId, Lsn>,
+) -> anyhow::Result<()> {
+    let poll_interval = Duration::from_millis(500);
+
+    loop {
+        let current = match get_lsns(tenant_id, pageserver) {
+            Ok(lsns) => lsns,
+            Err(e) => {
+                println!(
+                    "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
+                    pageserver.conf.id
+                );
+                std::thread::sleep(poll_interval);
+                continue;
+            }
+        };
+
+        // Visit every baseline timeline (no short-circuit) so that progress is
+        // printed for each of them on every poll.
+        let mut caught_up = true;
+        for (timeline_id, baseline_lsn) in &baseline {
+            if let Some(latest_lsn) = current.get(timeline_id) {
+                println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
+                if latest_lsn < baseline_lsn {
+                    caught_up = false;
+                }
+            } else {
+                // The expected timeline isn't visible on the migration destination yet.
+                // (A real implementation would have to account for timeline deletion,
+                // but this is just a test helper.)
+                caught_up = false;
+            }
+        }
+
+        if caught_up {
+            println!("✅ LSN caught up.  Proceeding...");
+            return Ok(());
+        }
+        std::thread::sleep(poll_interval);
+    }
+}
+
+/// This function spans multiple services, to demonstrate live migration of a tenant
+/// between pageservers:
+///  - Coordinate attach/secondary/detach on pageservers
+///  - call into attachment_service for generations
+///  - reconfigure compute endpoints to point to new attached pageserver
+///
+/// Steps, in order:
+///  1. If the attachment service reports a previous attachment on a different
+///     pageserver, switch that pageserver to `AttachedStale` and record its
+///     timeline LSNs. (If the previous attachment is already `dest_ps`, re-send
+///     an `AttachedSingle` config with a generation from `attach_hook` and return.)
+///  2. Attach `dest_ps` in `AttachedMulti` mode with a generation from `attach_hook`.
+///  3. If LSNs were recorded in step 1, wait for `dest_ps` to reach them.
+///  4. Reconfigure every endpoint belonging to `tenant_id` to use `dest_ps`.
+///  5. Switch every other pageserver whose tenant list contains `tenant_id` to
+///     `Secondary` mode.
+///  6. Switch `dest_ps` to `AttachedSingle` mode, reusing the generation from step 2.
+///
+/// Every location config sent here carries `TenantConfig::default()`.
+/// The first failing step aborts the migration and its error is returned.
+pub fn migrate_tenant(
+    env: &LocalEnv,
+    tenant_id: TenantId,
+    dest_ps: PageServerNode,
+) -> anyhow::Result<()> {
+    // Handle to the attachment service, which is queried for the current
+    // attachment below and later asked for a generation via `attach_hook`.
+    let attachment_service = AttachmentService::from_env(env);
+
+    // NOTE(review): `inspect` presumably returns the current (generation, pageserver id)
+    // of the tenant, or None if it is not attached anywhere -- confirm against AttachmentService.
+    let previous = attachment_service.inspect(tenant_id)?;
+    let mut baseline_lsns = None;
+    if let Some((generation, origin_ps_id)) = &previous {
+        let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
+
+        if origin_ps_id == &dest_ps.conf.id {
+            // Origin and destination are the same pageserver: nothing to move, just
+            // re-send an AttachedSingle config with the generation from attach_hook.
+            println!("🔁 Already attached to {origin_ps_id}, freshening...");
+            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+            let dest_conf = LocationConfig {
+                mode: LocationConfigMode::AttachedSingle,
+                generation: gen.map(Generation::new),
+                secondary_conf: None,
+                tenant_conf: TenantConfig::default(),
+            };
+            dest_ps.location_config(tenant_id, dest_conf)?;
+            println!("✅ Migration complete");
+            return Ok(());
+        }
+
+        println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
+
+        // The origin keeps its existing generation while in stale mode.
+        let stale_conf = LocationConfig {
+            mode: LocationConfigMode::AttachedStale,
+            generation: Some(Generation::new(*generation)),
+            secondary_conf: None,
+            tenant_conf: TenantConfig::default(),
+        };
+        origin_ps.location_config(tenant_id, stale_conf)?;
+
+        // Taken after the origin was switched to stale mode; the destination must
+        // reach these LSNs before endpoints are pointed at it.
+        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
+    }
+
+    // Obtain the generation for the destination and attach it in multi-attached mode.
+    let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+    let dest_conf = LocationConfig {
+        mode: LocationConfigMode::AttachedMulti,
+        generation: gen.map(Generation::new),
+        secondary_conf: None,
+        tenant_conf: TenantConfig::default(),
+    };
+
+    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
+    dest_ps.location_config(tenant_id, dest_conf)?;
+
+    // Only wait when there was a previous attachment to catch up with.
+    if let Some(baseline) = baseline_lsns {
+        println!("🕑 Waiting for LSN to catch up...");
+        await_lsn(tenant_id, &dest_ps, baseline)?;
+    }
+
+    // Point every compute endpoint of this tenant at the destination pageserver.
+    let cplane = ComputeControlPlane::load(env.clone())?;
+    for (endpoint_name, endpoint) in &cplane.endpoints {
+        if endpoint.tenant_id == tenant_id {
+            println!(
+                "🔁 Reconfiguring endpoint {} to use pageserver {}",
+                endpoint_name, dest_ps.conf.id
+            );
+            endpoint.reconfigure(Some(dest_ps.conf.id))?;
+        }
+    }
+
+    // Demote every other pageserver that still lists the tenant.
+    for other_ps_conf in &env.pageservers {
+        if other_ps_conf.id == dest_ps.conf.id {
+            continue;
+        }
+
+        let other_ps = PageServerNode::from_env(env, other_ps_conf);
+        let other_ps_tenants = other_ps.tenant_list()?;
+
+        // Check if this tenant is attached
+        let found = other_ps_tenants
+            .into_iter()
+            .map(|t| t.id)
+            .any(|i| i == tenant_id);
+        if !found {
+            continue;
+        }
+
+        // Downgrade to a secondary location
+        let secondary_conf = LocationConfig {
+            mode: LocationConfigMode::Secondary,
+            generation: None,
+            secondary_conf: Some(LocationConfigSecondary { warm: true }),
+            tenant_conf: TenantConfig::default(),
+        };
+
+        println!(
+            "💤 Switching to secondary mode on pageserver {}",
+            other_ps.conf.id
+        );
+        other_ps.location_config(tenant_id, secondary_conf)?;
+    }
+
+    // Finally make the destination the single attached location, reusing the
+    // generation obtained for the AttachedMulti step above.
+    println!(
+        "🔁 Switching to AttachedSingle mode on pageserver {}",
+        dest_ps.conf.id
+    );
+    let dest_conf = LocationConfig {
+        mode: LocationConfigMode::AttachedSingle,
+        generation: gen.map(Generation::new),
+        secondary_conf: None,
+        tenant_conf: TenantConfig::default(),
+    };
+    dest_ps.location_config(tenant_id, dest_conf)?;
+
+    println!("✅ Migration complete");
+
+    Ok(())
+}
--- a/deny.toml
+++ b/deny.toml
@@ -74,10 +74,30 @@ highlight = "all"
 workspace-default-features = "allow"
 external-default-features = "allow"
 allow = []
-deny = []
+
 skip = []
 skip-tree = []

+[[bans.deny]]
+# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol
+# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust
+name = "async-std"
+
+[[bans.deny]]
+name = "async-io"
+
+[[bans.deny]]
+name = "async-waker"
+
+[[bans.deny]]
+name = "async-global-executor"
+
+[[bans.deny]]
+name = "async-executor"
+
+[[bans.deny]]
+name = "smol"
+
 # This section is considered when running `cargo deny check sources`.
 # More documentation about the 'sources' section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
--- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
+++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
@@ -177,7 +177,7 @@ I e during migration create_branch can be called on old pageserver and newly cre

 The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.

-The approach largely follows this guide: <https://github.com/neondatabase/cloud/wiki/Cloud:-Ad-hoc-tenant-relocation>
+The approach largely follows this guide: <https://www.notion.so/neondatabase/Cloud-Ad-hoc-tenant-relocation-f687474f7bfc42269e6214e3acba25c7>

 The happy path sequence:

--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod requests;
 pub mod responses;
 pub mod spec;
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -6,7 +6,6 @@
 use std::collections::HashMap;

 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -19,7 +18,6 @@ pub type PgIdent = String;

 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
-#[serde_as]
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct ComputeSpec {
    pub format_version: f32,
@@ -50,12 +48,12 @@ pub struct ComputeSpec {
    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
    // updated to fill these fields, we can make these non optional.
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
+
    pub timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
+
    pub pageserver_connstring: Option<String>,
+
    #[serde(default)]
    pub safekeeper_connstrings: Vec<String>,

@@ -140,14 +138,13 @@ impl RemoteExtSpec {
    }
 }

-#[serde_as]
 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeMode {
    /// A read-write node
    #[default]
    Primary,
    /// A read-only node, pinned at a particular LSN
-    Static(#[serde_as(as = "DisplayFromStr")] Lsn),
+    Static(Lsn),
    /// A read-only node that follows the tip of the branch in hot standby mode
    ///
    /// Future versions may want to distinguish between replicas with hot standby
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -1,6 +1,6 @@
-//!
 //! Shared code for consumption metics collection
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -2,6 +2,7 @@
 //! make sure that we use the same dep version everywhere.
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
+#![deny(clippy::undocumented_unsafe_blocks)]
 use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -17,5 +17,9 @@ postgres_ffi.workspace = true
 enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
+hex.workspace = true

 workspace_hack.workspace = true
+
+[dev-dependencies]
+bincode.workspace = true
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -4,7 +4,6 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{NodeId, TenantId};

 #[derive(Serialize, Deserialize)]
@@ -12,10 +11,8 @@ pub struct ReAttachRequest {
    pub node_id: NodeId,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub gen: u32,
 }
@@ -25,10 +22,8 @@ pub struct ReAttachResponse {
    pub tenants: Vec<ReAttachResponseTenant>,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateRequestTenant {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub gen: u32,
 }
@@ -43,10 +38,8 @@ pub struct ValidateResponse {
    pub tenants: Vec<ValidateResponseTenant>,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct ValidateResponseTenant {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    pub valid: bool,
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -0,0 +1,142 @@
+use anyhow::{bail, Result};
+use byteorder::{ByteOrder, BE};
+use serde::{Deserialize, Serialize};
+use std::fmt;
+
+/// Key used in the Repository kv-store.
+///
+/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
+/// for what we actually store in these fields.
+///
+/// `Ord`/`PartialOrd` are derived, so keys compare field by field in declaration
+/// order (`field1` first, `field6` last).
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
+pub struct Key {
+    pub field1: u8,
+    pub field2: u32,
+    pub field3: u32,
+    pub field4: u32,
+    pub field5: u8,
+    pub field6: u32,
+}
+
+// Sum of the field widths above (1 + 4 + 4 + 4 + 1 + 4 bytes); this matches the
+// 18-byte layout read by `Key::from_slice` and written by `Key::write_to_byte_slice`.
+pub const KEY_SIZE: usize = 18;
+
+impl Key {
+    /// Pack the key into an `i128`.
+    ///
+    /// Bit layout, from most to least significant: the low 4 bits of `field1`
+    /// (bits 120..124), the low 16 bits of `field2` (104..120), `field3` (72..104),
+    /// `field4` (40..72), `field5` (32..40) and `field6` (0..32).
+    ///
+    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
+    /// As long as Neon does not support tablespaces (because of the lack of access to the local
+    /// file system), we can assume that only some predefined namespace OIDs are used, which fit in a u16.
+    ///
+    /// # Panics
+    ///
+    /// Panics unless `field2` is below `0xFFFF`, or equal to `0xFFFFFFFF` or `0x22222222`.
+    /// For the latter two values only the low 16 bits are stored.
+    pub fn to_i128(&self) -> i128 {
+        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
+        (((self.field1 & 0xf) as i128) << 120)
+            | (((self.field2 & 0xFFFF) as i128) << 104)
+            | ((self.field3 as i128) << 72)
+            | ((self.field4 as i128) << 40)
+            | ((self.field5 as i128) << 32)
+            | self.field6 as i128
+    }
+
+    /// Unpack a key from the bit layout produced by [`Key::to_i128`].
+    ///
+    /// `field1` is restored from 4 bits and `field2` from 16 bits; the remaining
+    /// fields are truncated to their own width by the `as` casts.
+    pub const fn from_i128(x: i128) -> Self {
+        Key {
+            field1: ((x >> 120) & 0xf) as u8,
+            field2: ((x >> 104) & 0xFFFF) as u32,
+            field3: (x >> 72) as u32,
+            field4: (x >> 40) as u32,
+            field5: (x >> 32) as u8,
+            field6: x as u32,
+        }
+    }
+
+    /// Return the key that immediately follows this one, i.e. `self.add(1)`.
+    pub fn next(&self) -> Key {
+        self.add(1)
+    }
+
+    /// Return a copy of this key advanced by `x`.
+    ///
+    /// `x` is added to `field6`; if that overflows, a carry of one is propagated
+    /// into `field5`, then `field4`, `field3`, `field2` and finally `field1`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the carry overflows `field1`.
+    pub fn add(&self, x: u32) -> Key {
+        let mut key = *self;
+
+        // Each `overflowing_add` returns (wrapped value, overflowed?); the next
+        // more significant field is only touched when the previous one wrapped.
+        let r = key.field6.overflowing_add(x);
+        key.field6 = r.0;
+        if r.1 {
+            let r = key.field5.overflowing_add(1);
+            key.field5 = r.0;
+            if r.1 {
+                let r = key.field4.overflowing_add(1);
+                key.field4 = r.0;
+                if r.1 {
+                    let r = key.field3.overflowing_add(1);
+                    key.field3 = r.0;
+                    if r.1 {
+                        let r = key.field2.overflowing_add(1);
+                        key.field2 = r.0;
+                        if r.1 {
+                            let r = key.field1.overflowing_add(1);
+                            key.field1 = r.0;
+                            assert!(!r.1);
+                        }
+                    }
+                }
+            }
+        }
+        key
+    }
+
+    /// Decode a key from the first 18 bytes of `b`: the fields in declaration
+    /// order, with the `u32` fields stored big-endian.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `b` is shorter than 18 bytes.
+    pub fn from_slice(b: &[u8]) -> Self {
+        Key {
+            field1: b[0],
+            field2: u32::from_be_bytes(b[1..5].try_into().unwrap()),
+            field3: u32::from_be_bytes(b[5..9].try_into().unwrap()),
+            field4: u32::from_be_bytes(b[9..13].try_into().unwrap()),
+            field5: b[13],
+            field6: u32::from_be_bytes(b[14..18].try_into().unwrap()),
+        }
+    }
+
+    /// Encode the key into the first 18 bytes of `buf`, using the same layout
+    /// that [`Key::from_slice`] reads.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `buf` is shorter than 18 bytes.
+    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
+        buf[0] = self.field1;
+        BE::write_u32(&mut buf[1..5], self.field2);
+        BE::write_u32(&mut buf[5..9], self.field3);
+        BE::write_u32(&mut buf[9..13], self.field4);
+        buf[13] = self.field5;
+        BE::write_u32(&mut buf[14..18], self.field6);
+    }
+}
+
+/// Formats the key as 36 upper-case hex digits: every field in declaration
+/// order, zero-padded to its full width (2 digits per `u8`, 8 per `u32`).
+impl fmt::Display for Key {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Key {
+            field1,
+            field2,
+            field3,
+            field4,
+            field5,
+            field6,
+        } = self;
+        f.write_fmt(format_args!(
+            "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}",
+            field1, field2, field3, field4, field5, field6
+        ))
+    }
+}
+
+impl Key {
+    /// The smallest possible key: every field at its minimum value.
+    pub const MIN: Key = Key {
+        field1: u8::MIN,
+        field2: u32::MIN,
+        field3: u32::MIN,
+        field4: u32::MIN,
+        field5: u8::MIN,
+        field6: u32::MIN,
+    };
+    /// The largest possible key: every field at its maximum value.
+    pub const MAX: Key = Key {
+        field1: u8::MAX,
+        field2: u32::MAX,
+        field3: u32::MAX,
+        field4: u32::MAX,
+        field5: u8::MAX,
+        field6: u32::MAX,
+    };
+
+    /// Parse a key from the 36-hex-digit form produced by its `Display` impl:
+    /// 2 digits for `field1`, 8 each for `field2`..`field4`, 2 for `field5` and
+    /// 8 for `field6`.
+    ///
+    /// Returns an error unless `s` consists of exactly 36 ASCII hex digits.
+    pub fn from_hex(s: &str) -> Result<Self> {
+        // Validate the characters up front, not just the byte length:
+        //  - slicing a `&str` at a fixed byte offset panics if the offset falls
+        //    inside a multi-byte UTF-8 character, and
+        //  - `from_str_radix` accepts a leading '+', which is not a hex digit.
+        if s.len() != 36 || !s.bytes().all(|b| b.is_ascii_hexdigit()) {
+            bail!("parse error");
+        }
+        Ok(Key {
+            field1: u8::from_str_radix(&s[0..2], 16)?,
+            field2: u32::from_str_radix(&s[2..10], 16)?,
+            field3: u32::from_str_radix(&s[10..18], 16)?,
+            field4: u32::from_str_radix(&s[18..26], 16)?,
+            field5: u8::from_str_radix(&s[26..28], 16)?,
+            field6: u32::from_str_radix(&s[28..36], 16)?,
+        })
+    }
+}
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,9 +1,13 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
 pub mod control_api;
+pub mod key;
 pub mod models;
 pub mod reltag;
+pub mod shard;

 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -6,7 +6,7 @@ use std::{

 use byteorder::{BigEndian, ReadBytesExt};
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
+use serde_with::serde_as;
 use strum_macros;
 use utils::{
    completion,
@@ -16,7 +16,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::reltag::RelTag;
+use crate::{reltag::RelTag, shard::TenantShardId};
 use anyhow::bail;
 use bytes::{BufMut, Bytes, BytesMut};

@@ -174,26 +174,20 @@ pub enum TimelineState {
    Broken { reason: String, backtrace: String },
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub new_timeline_id: TimelineId,
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
-    #[serde_as(as = "DisplayFromStr")]
-    pub new_tenant_id: TenantId,
+    pub new_tenant_id: TenantShardId,
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generation: Option<u32>,
@@ -201,7 +195,6 @@ pub struct TenantCreateRequest {
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

-#[serde_as]
 #[derive(Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLoadRequest {
@@ -227,8 +220,6 @@ pub struct TenantConfig {
    pub compaction_target_size: Option<u64>,
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
-    // defer parsing compaction_algorithm, like eviction_policy
-    pub compaction_algorithm: Option<serde_json::Value>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,
@@ -280,31 +271,26 @@ pub struct LocationConfig {
    pub tenant_conf: TenantConfig,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
-pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);
+pub struct TenantCreateResponse(pub TenantId);

 #[derive(Serialize)]
 pub struct StatusResponse {
    pub id: NodeId,
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -326,7 +312,6 @@ impl TenantConfigRequest {
            compaction_target_size: None,
            compaction_period: None,
            compaction_threshold: None,
-            compaction_algorithm: None,
            gc_horizon: None,
            gc_period: None,
            image_creation_threshold: None,
@@ -377,10 +362,8 @@ pub enum TenantAttachmentStatus {
    Failed { reason: String },
 }

-#[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
-    #[serde_as(as = "DisplayFromStr")]
    pub id: TenantId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
@@ -391,33 +374,22 @@ pub struct TenantInfo {
 }

 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
-#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,

-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_lsn: Option<Lsn>,
-    #[serde_as(as = "DisplayFromStr")]
    pub last_record_lsn: Lsn,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub prev_record_lsn: Option<Lsn>,
-    #[serde_as(as = "DisplayFromStr")]
    pub latest_gc_cutoff_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,

    /// The LSN that we have succesfully uploaded to remote storage
-    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,

    /// The LSN that we are advertizing to safekeepers
-    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn_visible: Lsn,

    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
@@ -429,7 +401,6 @@ pub struct TimelineInfo {
    pub timeline_dir_layer_file_size_sum: Option<u64>,

    pub wal_source_connstr: Option<String>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
@@ -526,23 +497,13 @@ pub struct LayerAccessStats {
    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }

-#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum InMemoryLayerInfo {
-    Open {
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_start: Lsn,
-    },
-    Frozen {
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_start: Lsn,
-        #[serde_as(as = "DisplayFromStr")]
-        lsn_end: Lsn,
-    },
+    Open { lsn_start: Lsn },
+    Frozen { lsn_start: Lsn, lsn_end: Lsn },
 }

-#[serde_as]
 #[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum HistoricLayerInfo {
@@ -550,9 +511,7 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

-        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
-        #[serde_as(as = "DisplayFromStr")]
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
@@ -561,7 +520,6 @@ pub enum HistoricLayerInfo {
        layer_file_name: String,
        layer_file_size: u64,

-        #[serde_as(as = "DisplayFromStr")]
        lsn_start: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -0,0 +1,321 @@
+use std::{ops::RangeInclusive, str::FromStr};
+
+use hex::FromHex;
+use serde::{Deserialize, Serialize};
+use utils::id::TenantId;
+
+/// Number of a shard within its tenant; a thin wrapper around `u8`.
+/// Going by the `TenantShardId` example (`-0102` is the second of two shards),
+/// numbering starts at zero.
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
+pub struct ShardNumber(pub u8);
+
+/// Number of shards a tenant is split into; a thin wrapper around `u8`.
+/// `ShardCount(0)` is the legacy/unsharded marker used by `TenantShardId`.
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
+pub struct ShardCount(pub u8);
+
+impl ShardCount {
+    /// Largest representable shard count (`u8::MAX`).
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+impl ShardNumber {
+    /// Largest representable shard number (`u8::MAX`).
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+/// TenantShardId identifies the units of work for the Pageserver.
+///
+/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
+///
+///   # The second shard in a two-shard tenant
+///   072f1291a5310026820b2fe4b2968934-0102
+///
+/// Historically, tenants could not have multiple shards, and were identified
+/// by TenantId.  To support this, TenantShardId has a special legacy
+/// mode where `shard_count` is equal to zero: this represents a single-sharded
+/// tenant which should be written as a TenantId with no suffix.
+///
+/// The human-readable encoding of TenantShardId, such as used in API URLs,
+/// is both forward and backward compatible: a legacy TenantId can be
+/// decoded as a TenantShardId, and when re-encoded it will be parseable
+/// as a TenantId.
+///
+/// Note that the binary encoding is _not_ backward compatible, because
+/// at the time sharding is introduced, there are no existing binary structures
+/// containing TenantId that we need to handle.
+///
+/// The derived ordering compares `tenant_id` first, then `shard_number`, then
+/// `shard_count`, so all shards of one tenant are contiguous in sorted
+/// collections; `tenant_range` depends on this field order.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
+pub struct TenantShardId {
+    pub tenant_id: TenantId,
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl TenantShardId {
+    /// Build the id of a legacy, unsharded tenant: shard number and shard
+    /// count are both zero.
+    pub fn unsharded(tenant_id: TenantId) -> Self {
+        let shard_number = ShardNumber(0);
+        let shard_count = ShardCount(0);
+        Self {
+            tenant_id,
+            shard_number,
+            shard_count,
+        }
+    }
+
+    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
+    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
+    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        // Lowest possible id for this tenant (shard number 0, shard count 0) ...
+        let first = Self::unsharded(tenant_id);
+        // ... up to and including the highest possible one.
+        let last = Self {
+            tenant_id,
+            shard_number: ShardNumber::MAX,
+            shard_count: ShardCount::MAX,
+        };
+        first..=last
+    }
+
+    /// The shard suffix on its own: shard number followed by shard count, each
+    /// as two lower-case hex digits.
+    pub fn shard_slug(&self) -> String {
+        let Self {
+            shard_number,
+            shard_count,
+            ..
+        } = self;
+        format!("{:02x}{:02x}", shard_number.0, shard_count.0)
+    }
+}
+
+/// Human-readable form: `<tenant_id>-<shard number><shard count>`, with the
+/// shard fields as two lower-case hex digits each, or just `<tenant_id>` for a
+/// legacy id.
+impl std::fmt::Display for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.shard_count == ShardCount(0) {
+            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
+            // is distinct from the normal single shard case (shard count == 1).
+            return self.tenant_id.fmt(f);
+        }
+
+        write!(
+            f,
+            "{}-{:02x}{:02x}",
+            self.tenant_id, self.shard_number.0, self.shard_count.0
+        )
+    }
+}
+
+/// `Debug` output is the same compact text as `Display`, not a field-by-field
+/// struct dump.
+impl std::fmt::Debug for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+/// Parses the human-readable form written by `Display`.
+///
+/// Accepted inputs:
+///  - 32 hex digits: a legacy `TenantId`, giving shard number 0 and shard count 0;
+///  - 37 characters: 32 hex digits of tenant id, a `-`, then two hex digits of
+///    shard number and two hex digits of shard count.
+///
+/// Any other length fails with `InvalidStringLength`; a separator other than
+/// `-` fails with `InvalidHexCharacter` pointing at index 32.
+impl std::str::FromStr for TenantShardId {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
+        if s.len() == 32 {
+            // Legacy case: no shard specified
+            Ok(Self {
+                tenant_id: TenantId::from_str(s)?,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            })
+        } else if s.len() == 37 {
+            // Work on bytes so that fixed offsets can never split a UTF-8 character.
+            let bytes = s.as_bytes();
+
+            // The separator must really be '-': without this check any character
+            // at this position would be silently accepted.
+            if bytes[32] != b'-' {
+                return Err(hex::FromHexError::InvalidHexCharacter {
+                    c: bytes[32] as char,
+                    index: 32,
+                });
+            }
+
+            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
+            Ok(Self {
+                tenant_id,
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+/// Decodes the packed binary form: 16 bytes of tenant id, then one byte of
+/// shard number and one byte of shard count.
+impl From<[u8; 18]> for TenantShardId {
+    fn from(b: [u8; 18]) -> Self {
+        let mut tenant_id_bytes = [0u8; 16];
+        tenant_id_bytes.copy_from_slice(&b[..16]);
+        // The last two bytes of the array are the shard number and shard count.
+        let [.., shard_number, shard_count] = b;
+
+        Self {
+            tenant_id: TenantId::from(tenant_id_bytes),
+            shard_number: ShardNumber(shard_number),
+            shard_count: ShardCount(shard_count),
+        }
+    }
+}
+
+/// Human-readable serializers get the `Display` string; all others get the
+/// packed 18-byte array (tenant id, shard number, shard count).
+impl Serialize for TenantShardId {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            return serializer.collect_str(self);
+        }
+
+        let mut packed = [0u8; 18];
+        let (id_part, shard_part) = packed.split_at_mut(16);
+        id_part.copy_from_slice(&self.tenant_id.as_arr());
+        shard_part[0] = self.shard_number.0;
+        shard_part[1] = self.shard_count.0;
+
+        packed.serialize(serializer)
+    }
+}
+
+/// Mirror of the `Serialize` impl: human-readable deserializers are asked for a
+/// string, which is parsed with `FromStr`; all others are asked for an 18-element
+/// tuple, which is converted via `From<[u8; 18]>`.
+impl<'de> Deserialize<'de> for TenantShardId {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        // One visitor serves both encodings; the flag only selects the text that
+        // `expecting` reports in error messages.
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = TenantShardId;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 18])")
+                }
+            }
+
+            // Binary path: read the sequence back as a `[u8; 18]` and unpack it.
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 18] = Deserialize::deserialize(s)?;
+                Ok(TenantShardId::from(id))
+            }
+
+            // Human-readable path: parse the string; parse failures are
+            // reported as custom deserialization errors.
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                TenantShardId::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                18,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
+// Round-trip tests for the human-readable (string) and binary (bincode)
+// encodings of TenantShardId, including the legacy unsharded form.
+#[cfg(test)]
+mod tests {
+    use std::str::FromStr;
+
+    use bincode;
+    use utils::{id::TenantId, Hex};
+
+    use super::*;
+
+    const EXAMPLE_TENANT_ID: &str = "1f359dd625e519a1a4e8d7509690f6fc";
+
+    #[test]
+    fn tenant_shard_id_string() -> Result<(), hex::FromHexError> {
+        // A sharded id formats as "<tenant_id>-<number><count>" in hex
+        // (7 -> "07", 10 -> "0a") and parses back to the same value.
+        let example = TenantShardId {
+            tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
+            shard_count: ShardCount(10),
+            shard_number: ShardNumber(7),
+        };
+
+        let encoded = format!("{example}");
+
+        let expected = format!("{EXAMPLE_TENANT_ID}-070a");
+        assert_eq!(&encoded, &expected);
+
+        let decoded = TenantShardId::from_str(&encoded)?;
+
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_binary() -> Result<(), hex::FromHexError> {
+        // The binary encoding is the 16 tenant id bytes followed by the shard
+        // number byte and the shard count byte, and it round-trips.
+        let example = TenantShardId {
+            tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
+            shard_count: ShardCount(10),
+            shard_number: ShardNumber(7),
+        };
+
+        let encoded = bincode::serialize(&example).unwrap();
+        let expected: [u8; 18] = [
+            0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
+            0xf6, 0xfc, 0x07, 0x0a,
+        ];
+        assert_eq!(Hex(&encoded), Hex(&expected));
+
+        let decoded = bincode::deserialize(&encoded).unwrap();
+
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_backward_compat() -> Result<(), hex::FromHexError> {
+        // Test that TenantShardId can decode a TenantId in human
+        // readable form
+        let example = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
+        let encoded = format!("{example}");
+
+        assert_eq!(&encoded, EXAMPLE_TENANT_ID);
+
+        let decoded = TenantShardId::from_str(&encoded)?;
+
+        assert_eq!(example, decoded.tenant_id);
+        assert_eq!(decoded.shard_count, ShardCount(0));
+        assert_eq!(decoded.shard_number, ShardNumber(0));
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_forward_compat() -> Result<(), hex::FromHexError> {
+        // Test that a legacy TenantShardId encodes into a form that
+        // can be decoded as TenantId
+        let example_tenant_id = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
+        let example = TenantShardId::unsharded(example_tenant_id);
+        let encoded = format!("{example}");
+
+        assert_eq!(&encoded, EXAMPLE_TENANT_ID);
+
+        let decoded = TenantId::from_str(&encoded)?;
+
+        assert_eq!(example_tenant_id, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_legacy_binary() -> Result<(), hex::FromHexError> {
+        // Unlike in human readable encoding, binary encoding does not
+        // do any special handling of legacy unsharded TenantIds: this test
+        // is equivalent to the main test for binary encoding, just verifying
+        // that the same behavior applies when we have used `unsharded()` to
+        // construct a TenantShardId.
+        let example = TenantShardId::unsharded(TenantId::from_str(EXAMPLE_TENANT_ID).unwrap());
+        let encoded = bincode::serialize(&example).unwrap();
+
+        let expected: [u8; 18] = [
+            0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
+            0xf6, 0xfc, 0x00, 0x00,
+        ];
+        assert_eq!(Hex(&encoded), Hex(&expected));
+
+        let decoded = bincode::deserialize::<TenantShardId>(&encoded).unwrap();
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+}
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -2,6 +2,8 @@
 //! To use, create PostgresBackend and run() it, passing the Handler
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
 use futures::pin_mut;
@@ -15,7 +17,7 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
-use tracing::{debug, error, info, trace};
+use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
 use pq_proto::{
@@ -33,6 +35,11 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
+    /// Authentication failure
+    #[error("Unauthorized: {0}")]
+    Unauthorized(std::borrow::Cow<'static, str>),
+    #[error("Simulated Connection Error")]
+    SimulatedConnectionError,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
@@ -47,8 +54,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
+            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -608,7 +616,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                    if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
                        self.write_message_noflush(&BeMessage::ErrorResponse(
-                            &e.to_string(),
+                            &short_error(&e),
                            Some(e.pg_error_code()),
                        ))?;
                        return Err(e);
@@ -728,12 +736,20 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                trace!("got query {query_string:?}");
                if let Err(e) = handler.process_query(self, query_string).await {
-                    log_query_error(query_string, &e);
-                    let short_error = short_error(&e);
-                    self.write_message_noflush(&BeMessage::ErrorResponse(
-                        &short_error,
-                        Some(e.pg_error_code()),
-                    ))?;
+                    match e {
+                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
+                        QueryError::SimulatedConnectionError => {
+                            return Err(QueryError::SimulatedConnectionError)
+                        }
+                        e => {
+                            log_query_error(query_string, &e);
+                            let short_error = short_error(&e);
+                            self.write_message_noflush(&BeMessage::ErrorResponse(
+                                &short_error,
+                                Some(e.pg_error_code()),
+                            ))?;
+                        }
+                    }
                }
                self.write_message_noflush(&BeMessage::ReadyForQuery)?;
            }
@@ -959,6 +975,8 @@ pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
+        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
+        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
    }
 }
@@ -975,9 +993,15 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::Disconnected(other_connection_error) => {
            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
        }
+        QueryError::SimulatedConnectionError => {
+            error!("query handler for query '{query}' failed due to a simulated connection error")
+        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
+        QueryError::Unauthorized(e) => {
+            warn!("query handler for '{query}' failed with authentication error: {e}");
+        }
        QueryError::Other(e) => {
            error!("query handler for '{query}' failed: {e:?}");
        }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::{bail, Context};
 use itertools::Itertools;
 use std::borrow::Cow;
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,6 +8,7 @@
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
 #![allow(clippy::duplicate_mod)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use bytes::Bytes;
 use utils::bin_ser::SerializeError;
@@ -20,6 +21,7 @@ macro_rules! postgres_ffi {
            pub mod bindings {
                // bindgen generates bindings for a lot of stuff we don't need
                #![allow(dead_code)]
+                #![allow(clippy::undocumented_unsafe_blocks)]

                use serde::{Deserialize, Serialize};
                include!(concat!(
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -1,6 +1,7 @@
 //! Postgres protocol messages serialization-deserialization. See
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod framed;

--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 anyhow.workspace = true
 async-trait.workspace = true
 once_cell.workspace = true
+aws-smithy-async.workspace = true
 aws-smithy-http.workspace = true
 aws-types.workspace = true
 aws-config.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -1,21 +1,18 @@
 //! Azure Blob Storage wrapper

+use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
 use std::sync::Arc;
-use std::{borrow::Cow, collections::HashMap, io::Cursor};
+use std::{borrow::Cow, io::Cursor};

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
-use azure_core::Header;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::prelude::ClientBuilder;
-use azure_storage_blobs::{
-    blob::operations::GetBlobBuilder,
-    prelude::{BlobClient, ContainerClient},
-};
+use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
 use futures_util::StreamExt;
 use http_types::StatusCode;
 use tokio::io::AsyncRead;
@@ -112,16 +109,19 @@ impl AzureBlobStorage {

    async fn download_for_builder(
        &self,
-        metadata: StorageMetadata,
        builder: GetBlobBuilder,
    ) -> Result<Download, DownloadError> {
        let mut response = builder.into_stream();

+        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
        let mut buf = Vec::new();
        while let Some(part) = response.next().await {
            let part = part.map_err(to_download_error)?;
+            if let Some(blob_meta) = part.blob.metadata {
+                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
+            }
            let data = part
                .data
                .collect()
@@ -131,28 +131,9 @@ impl AzureBlobStorage {
        }
        Ok(Download {
            download_stream: Box::pin(Cursor::new(buf)),
-            metadata: Some(metadata),
+            metadata: Some(StorageMetadata(metadata)),
        })
    }
-    // TODO get rid of this function once we have metadata included in the response
-    // https://github.com/Azure/azure-sdk-for-rust/issues/1439
-    async fn get_metadata(
-        &self,
-        blob_client: &BlobClient,
-    ) -> Result<StorageMetadata, DownloadError> {
-        let builder = blob_client.get_metadata();
-
-        let response = builder.into_future().await.map_err(to_download_error)?;
-        let mut map = HashMap::new();
-
-        for md in response.metadata.iter() {
-            map.insert(
-                md.name().as_str().to_string(),
-                md.value().as_str().to_string(),
-            );
-        }
-        Ok(StorageMetadata(map))
-    }

    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
        self.concurrency_limiter
@@ -269,11 +250,9 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

-        let metadata = self.get_metadata(&blob_client).await?;
-
        let builder = blob_client.get();

-        self.download_for_builder(metadata, builder).await
+        self.download_for_builder(builder).await
    }

    async fn download_byte_range(
@@ -285,8 +264,6 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

-        let metadata = self.get_metadata(&blob_client).await?;
-
        let mut builder = blob_client.get();

        if let Some(end_exclusive) = end_exclusive {
@@ -301,7 +278,7 @@ impl RemoteStorage for AzureBlobStorage {
            builder = builder.range(Range::new(start_inclusive, end_exclusive));
        }

-        self.download_for_builder(metadata, builder).await
+        self.download_for_builder(builder).await
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -6,19 +6,15 @@
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
 //!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;

-use std::{
-    collections::HashMap,
-    fmt::Debug,
-    num::{NonZeroU32, NonZeroUsize},
-    pin::Pin,
-    sync::Arc,
-};
+use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};

 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -34,12 +30,6 @@ pub use self::{
 };
 use s3_bucket::RequestKind;

-/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
-/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
-/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
-/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
-pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
-pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -441,10 +431,6 @@ pub struct StorageMetadata(HashMap<String, String>);
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
-    /// Max allowed number of concurrent sync operations between the API user and the remote storage.
-    pub max_concurrent_syncs: NonZeroUsize,
-    /// Max allowed errors before the sync task is considered failed and evicted.
-    pub max_sync_errors: NonZeroU32,
    /// The storage connection configuration.
    pub storage: RemoteStorageKind,
 }
@@ -540,18 +526,6 @@ impl RemoteStorageConfig {

        let use_azure = container_name.is_some() && container_region.is_some();

-        let max_concurrent_syncs = NonZeroUsize::new(
-            parse_optional_integer("max_concurrent_syncs", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
-        )
-        .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
-
-        let max_sync_errors = NonZeroU32::new(
-            parse_optional_integer("max_sync_errors", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
-        )
-        .context("Failed to parse 'max_sync_errors' as a positive integer")?;
-
        let default_concurrency_limit = if use_azure {
            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
        } else {
@@ -633,11 +607,7 @@ impl RemoteStorageConfig {
            }
        };

-        Ok(Some(RemoteStorageConfig {
-            max_concurrent_syncs,
-            max_sync_errors,
-            storage,
-        }))
+        Ok(Some(RemoteStorageConfig { storage }))
    }
 }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,23 +4,27 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::borrow::Cow;
+use std::{borrow::Cow, sync::Arc};

 use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
-    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
-    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider,
+    meta::credentials::CredentialsProviderChain,
+    provider_config::ProviderConfig,
+    retry::{RetryConfigBuilder, RetryMode},
+    web_identity_token::WebIdentityTokenCredentialsProvider,
 };
 use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
-    config::{Config, Region},
+    config::{AsyncSleep, Config, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
    primitives::ByteStream,
    types::{Delete, ObjectIdentifier},
    Client,
 };
+use aws_smithy_async::rt::sleep::TokioSleep;
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
 use scopeguard::ScopeGuard;
@@ -83,10 +87,23 @@ impl S3Bucket {
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

+        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
+        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
+
+        // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
+        // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
+        // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
+        let mut retry_config = RetryConfigBuilder::new();
+        retry_config
+            .set_max_attempts(Some(1))
+            .set_mode(Some(RetryMode::Adaptive));
+
        let mut config_builder = Config::builder()
            .region(region)
            .credentials_cache(CredentialsCache::lazy())
-            .credentials_provider(credentials_provider);
+            .credentials_provider(credentials_provider)
+            .sleep_impl(SharedAsyncSleep::from(sleep_impl))
+            .retry_config(retry_config.build());

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
            config_builder = config_builder
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -281,6 +281,7 @@ fn ensure_logging_ready() {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
        )
        .expect("logging init failed");
    });
@@ -469,8 +470,6 @@ fn create_azure_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AzureContainer(AzureConfig {
            container_name: remote_storage_azure_container,
            container_region: remote_storage_azure_region,
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -210,6 +210,7 @@ fn ensure_logging_ready() {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
        )
        .expect("logging init failed");
    });
@@ -396,8 +397,6 @@ fn create_s3_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -1,23 +1,18 @@
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};

 use utils::{
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
 };

-#[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
-    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
-    #[serde_as(as = "DisplayFromStr")]
    pub timeline_id: TimelineId,
    pub peer_ids: Option<Vec<NodeId>>,
    pub pg_version: u32,
    pub system_id: Option<u64>,
    pub wal_seg_size: Option<u32>,
-    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
    // If not passed, it is assigned to the beginning of commit_lsn segment.
    pub local_start_lsn: Option<Lsn>,
@@ -28,7 +23,6 @@ fn lsn_invalid() -> Lsn {
 }

 /// Data about safekeeper's timeline, mirrors broker.proto.
-#[serde_as]
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct SkTimelineInfo {
    /// Term.
@@ -36,25 +30,19 @@ pub struct SkTimelineInfo {
    /// Term of the last entry.
    pub last_log_term: Option<u64>,
    /// LSN of the last record.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub flush_lsn: Lsn,
    /// Up to which LSN safekeeper regards its WAL as committed.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub commit_lsn: Lsn,
    /// LSN up to which safekeeper has backed WAL.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub backup_lsn: Lsn,
    /// LSN of last checkpoint uploaded by pageserver.
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub remote_consistent_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub peer_horizon_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
    #[serde(default = "lsn_invalid")]
    pub local_start_lsn: Lsn,
    /// A connection string to use for WAL receiving.
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,4 +1,6 @@
 //! Synthetic size calculation
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod calculation;
 pub mod svg;
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -32,6 +32,8 @@
 //!         .init();
 //! }
 //! ```
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use opentelemetry::sdk::Resource;
 use opentelemetry::KeyValue;
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,6 +5,7 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+arc-swap.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
@@ -55,6 +56,7 @@ bytes.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
+serde_assert.workspace = true

 [[bench]]
 name = "benchmarks"
--- a/libs/utils/scripts/restore_from_wal_initdb.sh
+++ b/libs/utils/scripts/restore_from_wal_initdb.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Usage: restore_from_wal_initdb.sh PG_BIN WAL_PATH DATA_DIR PORT
+# like restore_from_wal.sh, but takes existing initdb.tar.zst
+# NOTE(review): DATA_DIR is presumably that archive, already unpacked -- confirm with the caller.
+set -euxo pipefail
+
+PG_BIN=$1    # directory containing pg_controldata and pg_ctl
+WAL_PATH=$2  # directory whose files are copied into DATA_DIR/pg_wal
+DATA_DIR=$3  # existing data directory; its postgresql.conf is appended to
+PORT=$4      # written to postgresql.conf as port=
+echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
+echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
+REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)  # text from column 42 on, read as hex
+declare -i WAL_SIZE=$REDO_POS+114  # NOTE(review): meaning of the extra 114 bytes is not shown here -- confirm
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 .  # keep a copy of this segment in the current directory
+cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
+for partial in "$DATA_DIR"/pg_wal/*.partial ; do [ -e "$partial" ] || continue ; mv "$partial" "${partial%.partial}" ; done  # an unmatched glob stays literal: skip it
+dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs="$WAL_SIZE" count=1 conv=notrunc  # restore the first WAL_SIZE bytes from the saved copy, keep the rest
+rm -f 000000010000000000000001
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,8 @@
 // For details about authentication see docs/authentication.md

+use arc_swap::ArcSwap;
 use serde;
-use std::fs;
+use std::{borrow::Cow, fmt::Display, fs, sync::Arc};

 use anyhow::Result;
 use camino::Utf8Path;
@@ -9,9 +10,8 @@ use jsonwebtoken::{
    decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
 };
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};

-use crate::id::TenantId;
+use crate::{http::error::ApiError, id::TenantId};

 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -32,11 +32,9 @@ pub enum Scope {
 }

 /// JWT payload. See docs/authentication.md for the format
-#[serde_as]
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 pub struct Claims {
    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
    pub tenant_id: Option<TenantId>,
    pub scope: Scope,
 }
@@ -47,31 +45,106 @@ impl Claims {
    }
 }

+pub struct SwappableJwtAuth(ArcSwap<JwtAuth>); // a JwtAuth that can be replaced through `&self`
+
+impl SwappableJwtAuth {
+    pub fn new(jwt_auth: JwtAuth) -> Self {
+        SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
+    }
+    pub fn swap(&self, jwt_auth: JwtAuth) {
+        self.0.swap(Arc::new(jwt_auth)); // install the new JwtAuth; swap()'s return value is discarded
+    }
+    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+        self.0.load().decode(token) // delegate to the JwtAuth that is installed right now
+    }
+}
+
+impl std::fmt::Debug for SwappableJwtAuth {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Swappable({:?})", self.0.load()) // Debug output of the current JwtAuth, wrapped
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+pub struct AuthError(pub Cow<'static, str>); // wraps the message describing the auth failure
+
+impl Display for AuthError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0) // prints the wrapped message verbatim
+    }
+}
+
+impl From<AuthError> for ApiError {
+    fn from(_value: AuthError) -> Self {
+        // Don't pass on the value of the AuthError as a precautionary measure.
+        // Being intentionally vague in public error communication hurts debuggability
+        // but it is more secure.
+        ApiError::Forbidden("JWT authentication error".to_string())
+    }
+}
+
 pub struct JwtAuth {
-    decoding_key: DecodingKey,
+    decoding_keys: Vec<DecodingKey>,
    validation: Validation,
 }

 impl JwtAuth {
-    pub fn new(decoding_key: DecodingKey) -> Self {
+    pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
        let mut validation = Validation::default();
        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
        // The default 'required_spec_claims' is 'exp'. But we don't want to require
        // expiration.
        validation.required_spec_claims = [].into();
        Self {
-            decoding_key,
+            decoding_keys,
            validation,
        }
    }

    pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
-        let public_key = fs::read(key_path)?;
-        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
+        let metadata = key_path.metadata()?;
+        let decoding_keys = if metadata.is_dir() {
+            let mut keys = Vec::new();
+            for entry in fs::read_dir(key_path)? {
+                let path = entry?.path();
+                if !path.is_file() {
+                    // Ignore directories (don't recurse)
+                    continue;
+                }
+                let public_key = fs::read(path)?;
+                keys.push(DecodingKey::from_ed_pem(&public_key)?);
+            }
+            keys
+        } else if metadata.is_file() {
+            let public_key = fs::read(key_path)?;
+            vec![DecodingKey::from_ed_pem(&public_key)?]
+        } else {
+            anyhow::bail!("path is neither a directory or a file")
+        };
+        if decoding_keys.is_empty() {
+            anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
+        }
+        Ok(Self::new(decoding_keys))
    }

-    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
-        Ok(decode(token, &self.decoding_key, &self.validation)?)
+    /// Decode and validate `token` using the configured decoding keys.
+    ///
+    /// Keys are tried one after another in their stored order; as soon as one
+    /// of them decodes the token successfully, that result is returned.
+    /// When every key fails, the error from the last key tried is returned;
+    /// with no keys configured at all, a fixed "no keys" error is returned.
+    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+        let mut last_err = None;
+        for key in &self.decoding_keys {
+            match decode(token, key, &self.validation) {
+                Ok(data) => return Ok(data),
+                Err(err) => last_err = Some(err),
+            }
+        }
+        match last_err {
+            Some(err) => Err(AuthError(Cow::Owned(err.to_string()))),
+            None => Err(AuthError(Cow::Borrowed("no JWT decoding keys configured"))),
+        }
+    }
 }

@@ -111,9 +184,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
 "#;

    #[test]
-    fn test_decode() -> Result<(), anyhow::Error> {
+    fn test_decode() {
        let expected_claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
        };

@@ -132,28 +205,24 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";

        // Check it can be validated with the public key
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
-        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
+        let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
        assert_eq!(claims_from_token, expected_claims);
-
-        Ok(())
    }

    #[test]
-    fn test_encode() -> Result<(), anyhow::Error> {
+    fn test_encode() {
        let claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
        };

-        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
+        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();

        // decode it back
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
-        let decoded = auth.decode(&encoded)?;
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
+        let decoded = auth.decode(&encoded).unwrap();

        assert_eq!(decoded.claims, claims);
-
-        Ok(())
    }
 }
--- a/libs/utils/src/hex.rs
+++ b/libs/utils/src/hex.rs
@@ -0,0 +1,41 @@
+/// Wrapper for comparing byte slices in assertions: on mismatch the bytes are reported as
+/// readable, array-syntax compatible hex bytes.
+///
+/// # Usage
+///
+/// ```
+/// use utils::Hex;
+///
+/// let actual = serialize_something();
+/// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64];
+///
+/// // the type implements PartialEq and on mismatch, both sides are printed in 16 wide multiline
+/// // output suffixed with an array style length for easier comparisons.
+/// assert_eq!(Hex(&actual), Hex(&expected));
+///
+/// // with `let expected = [0x68];` the error would have been:
+/// // assertion `left == right` failed
+/// //  left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11]
+/// // right: [0x68; 1]
+/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
+/// ```
+#[derive(PartialEq)]
+pub struct Hex<'a>(pub &'a [u8]);
+
+impl std::fmt::Debug for Hex<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str("[")?;
+        for (row_idx, row) in self.0.chunks(16).enumerate() {
+            if row_idx != 0 {
+                f.write_str(", \n")?; // row break; `chunks` never yields an empty row
+            }
+            for (col_idx, byte) in row.iter().enumerate() {
+                if col_idx != 0 {
+                    f.write_str(", ")?;
+                }
+                write!(f, "0x{byte:02x}")?;
+            }
+        }
+        write!(f, "; {}]", self.0.len()) // array-style length suffix
+    }
+}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,4 +1,4 @@
-use crate::auth::{Claims, JwtAuth};
+use crate::auth::{AuthError, Claims, SwappableJwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
 }

 pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-    provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
+    provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
        if let Some(auth) = provide_auth(&req) {
@@ -400,9 +400,11 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
                    })?;
                    let token = parse_token(header_value)?;

-                    let data = auth
-                        .decode(token)
-                        .map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
+                    let data = auth.decode(token).map_err(|err| {
+                        warn!("Authentication error: {err}");
+                        // Rely on From<AuthError> for ApiError impl
+                        err
+                    })?;
                    req.set_context(data.claims);
                }
                None => {
@@ -450,12 +452,11 @@ where

 pub fn check_permission_with(
    req: &Request<Body>,
-    check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
+    check_permission: impl Fn(&Claims) -> Result<(), AuthError>,
 ) -> Result<(), ApiError> {
    match req.context::<Claims>() {
-        Some(claims) => {
-            Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
-        }
+        Some(claims) => Ok(check_permission(&claims)
+            .map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?),
        None => Ok(()), // claims is None because auth is disabled
    }
 }
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::{error, info};
+use tracing::{error, info, warn};

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -118,6 +118,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors

    match api_error {
+        ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
+            warn!("Error processing HTTP request: {api_error:#}")
+        }
        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -3,6 +3,7 @@ use std::{fmt, str::FromStr};
 use anyhow::Context;
 use hex::FromHex;
 use rand::Rng;
+use serde::de::Visitor;
 use serde::{Deserialize, Serialize};
 use thiserror::Error;

@@ -17,12 +18,74 @@ pub enum IdError {
 ///
 /// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
 /// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
-///
-/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`.
-/// Check the `serde_with::serde_as` documentation for options for more complex types.
-#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
+#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
 struct Id([u8; 16]);

+impl Serialize for Id {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            self.0.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for Id {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> Visitor<'de> for IdVisitor {
+            type Value = Id;
+
+            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 16])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 16] = Deserialize::deserialize(s)?;
+                Ok(Id::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Id::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                16,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
 impl Id {
    pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
        let mut arr = [0u8; 16];
@@ -57,6 +120,8 @@ impl Id {
            chunk[0] = HEX[((b >> 4) & 0xf) as usize];
            chunk[1] = HEX[(b & 0xf) as usize];
        }
+
+        // SAFETY: vec constructed out of `HEX`, it can only be ascii
        unsafe { String::from_utf8_unchecked(buf) }
    }
 }
@@ -308,3 +373,112 @@ impl fmt::Display for NodeId {
        write!(f, "{}", self.0)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use serde_assert::{Deserializer, Serializer, Token, Tokens};
+
+    use crate::bin_ser::BeSer;
+
+    use super::*;
+
+    #[test]
+    fn test_id_serde_non_human_readable() {
+        let original_id = Id([
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ]);
+        let expected_tokens = Tokens(vec![
+            Token::Tuple { len: 16 },
+            Token::U8(173),
+            Token::U8(80),
+            Token::U8(132),
+            Token::U8(115),
+            Token::U8(129),
+            Token::U8(226),
+            Token::U8(72),
+            Token::U8(254),
+            Token::U8(170),
+            Token::U8(201),
+            Token::U8(135),
+            Token::U8(108),
+            Token::U8(199),
+            Token::U8(26),
+            Token::U8(228),
+            Token::U8(24),
+            Token::TupleEnd,
+        ]);
+
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let serialized_tokens = original_id.serialize(&serializer).unwrap();
+        assert_eq!(serialized_tokens, expected_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(serialized_tokens)
+            .build();
+        let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
+        assert_eq!(deserialized_id, original_id);
+    }
+
+    #[test]
+    fn test_id_serde_human_readable() {
+        let original_id = Id([
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ]);
+        let expected_tokens = Tokens(vec![Token::Str(String::from(
+            "ad50847381e248feaac9876cc71ae418",
+        ))]);
+
+        let serializer = Serializer::builder().is_human_readable(true).build();
+        let serialized_tokens = original_id.serialize(&serializer).unwrap();
+        assert_eq!(serialized_tokens, expected_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(true)
+            .tokens(Tokens(vec![Token::Str(String::from(
+                "ad50847381e248feaac9876cc71ae418",
+            ))]))
+            .build();
+        assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
+    }
+
+    macro_rules! roundtrip_type {
+        ($type:ty, $expected_bytes:expr) => {{
+            let expected_bytes: [u8; 16] = $expected_bytes;
+            let original_id = <$type>::from(expected_bytes);
+
+            let ser_bytes = original_id.ser().unwrap();
+            assert_eq!(ser_bytes, expected_bytes);
+
+            let des_id = <$type>::des(&ser_bytes).unwrap();
+            assert_eq!(des_id, original_id);
+        }};
+    }
+
+    #[test]
+    fn test_id_bincode_serde() {
+        let expected_bytes = [
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ];
+
+        roundtrip_type!(Id, expected_bytes);
+    }
+
+    #[test]
+    fn test_tenant_id_bincode_serde() {
+        let expected_bytes = [
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ];
+
+        roundtrip_type!(TenantId, expected_bytes);
+    }
+
+    #[test]
+    fn test_timeline_id_bincode_serde() {
+        let expected_bytes = [
+            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
+        ];
+
+        roundtrip_type!(TimelineId, expected_bytes);
+    }
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,5 +1,6 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod backoff;

@@ -24,6 +25,10 @@ pub mod auth;

 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;
+
+mod hex;
+pub use hex::Hex;
+
 // http endpoint utils
 pub mod http;

@@ -73,6 +78,9 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

+/// async timeout helper
+pub mod timeout;
+
 pub mod sync;

 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -66,9 +66,17 @@ pub enum TracingErrorLayerEnablement {
    EnableWithRustLogFilter,
 }

+/// Where the logging should output to.
+#[derive(Clone, Copy)]
+pub enum Output {
+    Stdout,
+    Stderr,
+}
+
 pub fn init(
    log_format: LogFormat,
    tracing_error_layer_enablement: TracingErrorLayerEnablement,
+    output: Output,
 ) -> anyhow::Result<()> {
    // We fall back to printing all spans at info-level or above if
    // the RUST_LOG environment variable is not set.
@@ -85,7 +93,12 @@ pub fn init(
        let log_layer = tracing_subscriber::fmt::layer()
            .with_target(false)
            .with_ansi(false)
-            .with_writer(std::io::stdout);
+            .with_writer(move || -> Box<dyn std::io::Write> {
+                match output {
+                    Output::Stdout => Box::new(std::io::stdout()),
+                    Output::Stderr => Box::new(std::io::stderr()),
+                }
+            });
        let log_layer = match log_format {
            LogFormat::Json => log_layer.json().boxed(),
            LogFormat::Plain => log_layer.boxed(),
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,7 +1,7 @@
 #![warn(missing_docs)]

 use camino::Utf8Path;
-use serde::{Deserialize, Serialize};
+use serde::{de::Visitor, Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
 use std::str::FromStr;
@@ -13,10 +13,114 @@ use crate::seqwait::MonotonicCounter;
 pub const XLOG_BLCKSZ: u32 = 8192;

 /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
-#[serde(transparent)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
 pub struct Lsn(pub u64);

+impl Serialize for Lsn {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            self.0.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for Lsn {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct LsnVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> Visitor<'de> for LsnVisitor {
+            type Value = Lsn;
+
+            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str(
+                        "value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
+                    )
+                } else {
+                    formatter.write_str("value in form of integer(u64)")
+                }
+            }
+
+            fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Ok(Lsn(v))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                Lsn::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(LsnVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_u64(LsnVisitor {
+                is_human_readable_deserializer: false,
+            })
+        }
+    }
+}
+
+/// Allows (de)serialization of an `Lsn` always as `u64`.
+///
+/// ### Example
+///
+/// ```rust
+/// # use serde::{Serialize, Deserialize};
+/// use utils::lsn::Lsn;
+///
+/// #[derive(PartialEq, Serialize, Deserialize, Debug)]
+/// struct Foo {
+///   #[serde(with = "utils::lsn::serde_as_u64")]
+///   always_u64: Lsn,
+/// }
+///
+/// let orig = Foo { always_u64: Lsn(1234) };
+///
+/// let res = serde_json::to_string(&orig).unwrap();
+/// assert_eq!(res, r#"{"always_u64":1234}"#);
+///
+/// let foo = serde_json::from_str::<Foo>(&res).unwrap();
+/// assert_eq!(foo, orig);
+/// ```
+///
+pub mod serde_as_u64 {
+    use super::Lsn;
+
+    /// Serializes the Lsn as u64 disregarding the human readability of the format.
+    ///
+    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`.
+    pub fn serialize<S: serde::Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
+        use serde::Serialize;
+        lsn.0.serialize(serializer)
+    }
+
+    /// Deserializes the Lsn as u64 disregarding the human readability of the format.
+    ///
+    /// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`.
+    pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
+        use serde::Deserialize;
+        u64::deserialize(deserializer).map(Lsn)
+    }
+}
+
 /// We tried to parse an LSN from a string, but failed
 #[derive(Debug, PartialEq, Eq, thiserror::Error)]
 #[error("LsnParseError")]
@@ -264,8 +368,13 @@ impl MonotonicCounter<Lsn> for RecordLsn {

 #[cfg(test)]
 mod tests {
+    use crate::bin_ser::BeSer;
+
    use super::*;

+    use serde::ser::Serialize;
+    use serde_assert::{Deserializer, Serializer, Token, Tokens};
+
    #[test]
    fn test_lsn_strings() {
        assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
@@ -341,4 +450,95 @@ mod tests {
        assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
        assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
    }
+
+    #[test]
+    fn test_lsn_serde() {
+        let original_lsn = Lsn(0x0123456789abcdef);
+        let expected_binary_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
+        let expected_human_readable_tokens =
+            Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);
+
+        // Non-human-readable (binary) formats serialize the Lsn as a plain u64
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let binary_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+        assert_eq!(binary_ser_tokens, expected_binary_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(binary_ser_tokens)
+            .build();
+        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+
+        // Human-readable formats serialize the Lsn as its "hi/lo" hex string
+        let serializer = Serializer::builder().is_human_readable(true).build();
+        let human_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+        assert_eq!(human_readable_ser_tokens, expected_human_readable_tokens);
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(true)
+            .tokens(human_readable_ser_tokens)
+            .build();
+        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+
+        // Tokens produced for one kind of format must be rejected by the other kind
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let binary_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(true)
+            .tokens(binary_ser_tokens)
+            .build();
+        Lsn::deserialize(&mut deserializer).unwrap_err();
+
+        let serializer = Serializer::builder().is_human_readable(true).build();
+        let human_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(human_readable_ser_tokens)
+            .build();
+        Lsn::deserialize(&mut deserializer).unwrap_err();
+    }
+
+    #[test]
+    fn test_lsn_ensure_roundtrip() {
+        let original_lsn = Lsn(0xaaaabbbb);
+
+        let serializer = Serializer::builder().is_human_readable(false).build();
+        let ser_tokens = original_lsn.serialize(&serializer).unwrap();
+
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(false)
+            .tokens(ser_tokens)
+            .build();
+
+        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+    }
+
+    #[test]
+    fn test_lsn_bincode_serde() {
+        let lsn = Lsn(0x0123456789abcdef);
+        let expected_bytes = [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef];
+
+        let ser_bytes = lsn.ser().unwrap();
+        assert_eq!(ser_bytes, expected_bytes);
+
+        let des_lsn = Lsn::des(&ser_bytes).unwrap();
+        assert_eq!(des_lsn, lsn);
+    }
+
+    #[test]
+    fn test_lsn_bincode_ensure_roundtrip() {
+        let original_lsn = Lsn(0x01_02_03_04_05_06_07_08);
+        let expected_bytes = vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
+
+        let ser_bytes = original_lsn.ser().unwrap();
+        assert_eq!(ser_bytes, expected_bytes);
+
+        let des_lsn = Lsn::des(&ser_bytes).unwrap();
+        assert_eq!(des_lsn, original_lsn);
+    }
 }
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -3,7 +3,6 @@ use std::time::{Duration, SystemTime};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use pq_proto::{read_cstr, PG_EPOCH};
 use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
 use tracing::{trace, warn};

 use crate::lsn::Lsn;
@@ -15,21 +14,17 @@ use crate::lsn::Lsn;
 ///
 /// serde Serialize is used only for human readable dump to json (e.g. in
 /// safekeepers debug_dump).
-#[serde_as]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct PageserverFeedback {
    /// Last known size of the timeline. Used to enforce timeline size limit.
    pub current_timeline_size: u64,
    /// LSN last received and ingested by the pageserver. Controls backpressure.
-    #[serde_as(as = "DisplayFromStr")]
    pub last_received_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver to its local disc.
    /// Controls backpressure.
-    #[serde_as(as = "DisplayFromStr")]
    pub disk_consistent_lsn: Lsn,
    /// LSN up to which data is persisted by the pageserver on s3; safekeepers
    /// consider WAL before it can be removed.
-    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
    // Serialize with RFC3339 format.
    #[serde(with = "serde_systemtime")]
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -125,6 +125,9 @@ where
            // Wake everyone with an error.
            let mut internal = self.internal.lock().unwrap();

+            // Block any future waiters from starting
+            internal.shutdown = true;
+
            // This will steal the entire waiters map.
            // When we drop it all waiters will be woken.
            mem::take(&mut internal.waiters)
--- a/libs/utils/src/shutdown.rs
+++ b/libs/utils/src/shutdown.rs
@@ -1,6 +1,7 @@
 /// Immediately terminate the calling process without calling
 /// atexit callbacks, C runtime destructors etc. We mainly use
 /// this to protect coverage data from concurrent writes.
-pub fn exit_now(code: u8) {
+pub fn exit_now(code: u8) -> ! {
+    // SAFETY: exiting is safe, the ffi is not safe
    unsafe { nix::libc::_exit(code as _) };
 }
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -1 +1,3 @@
 pub mod heavier_once_cell;
+
+pub mod gate;
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -0,0 +1,158 @@
+use std::{sync::Arc, time::Duration};
+
+/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
+///
+/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
+/// the resource calls `close()` when they want to ensure that all holders of guards
+/// have released them, and that no future guards will be issued.
+pub struct Gate {
+    /// Each caller of enter() takes one unit from the semaphore. In close(), we
+    /// take all the units to ensure all GateGuards are destroyed.
+    sem: Arc<tokio::sync::Semaphore>,
+
+    /// For observability only: a name that will be used to log warnings if a particular
+    /// gate is holding up shutdown
+    name: String,
+}
+
+/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
+/// not complete.
+#[derive(Debug)]
+pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
+
+/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
+async fn warn_if_stuck<Fut: std::future::Future>(
+    fut: Fut,
+    name: &str,
+    warn_period: std::time::Duration,
+) -> <Fut as std::future::Future>::Output {
+    let started = std::time::Instant::now();
+
+    let mut fut = std::pin::pin!(fut);
+
+    loop {
+        match tokio::time::timeout(warn_period, &mut fut).await {
+            Ok(ret) => return ret,
+            Err(_) => {
+                tracing::warn!(
+                    gate = name,
+                    elapsed_ms = started.elapsed().as_millis(),
+                    "still waiting, taking longer than expected..."
+                );
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum GateError {
+    GateClosed,
+}
+
+impl Gate {
+    const MAX_UNITS: u32 = u32::MAX;
+
+    pub fn new(name: String) -> Self {
+        Self {
+            sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
+            name,
+        }
+    }
+
+    /// Acquire a guard that will prevent close() calls from completing. If close()
+    /// was already called, this will return an error which should be interpreted
+    /// as "shutting down".
+    ///
+    /// This function would typically be used from e.g. request handlers. While holding
+    /// the guard returned from this function, it is important to respect a CancellationToken
+    /// to avoid blocking close() indefinitely: typically types that contain a Gate will
+    /// also contain a CancellationToken.
+    pub fn enter(&self) -> Result<GateGuard, GateError> {
+        self.sem
+            .clone()
+            .try_acquire_owned()
+            .map(GateGuard)
+            .map_err(|_| GateError::GateClosed)
+    }
+
+    /// Types with a shutdown() method and a gate should call this method at the
+    /// end of shutdown, to ensure that all GateGuard holders are done.
+    ///
+    /// This will wait for all guards to be destroyed.  For this to complete promptly, it is
+    /// important that the holders of such guards are respecting a CancellationToken which has
+    /// been cancelled before entering this function.
+    pub async fn close(&self) {
+        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
+    }
+
+    /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish.  This
+    /// is usually analogous to "Did shutdown finish?" for types that include a Gate, whereas checking
+    /// the CancellationToken on such types is analogous to "Did shutdown start?"
+    pub fn close_complete(&self) -> bool {
+        self.sem.is_closed()
+    }
+
+    async fn do_close(&self) {
+        tracing::debug!(gate = self.name, "Closing Gate...");
+        match self.sem.acquire_many(Self::MAX_UNITS).await {
+            Ok(_units) => {
+                // While holding all units, close the semaphore.  All subsequent calls to enter() will fail.
+                self.sem.close();
+            }
+            Err(_) => {
+                // Semaphore closed: we are the only function that can do this, so it indicates a double-call.
+                // This is legal.  Timeline::shutdown for example is not protected from being called more than
+                // once.
+                tracing::debug!(gate = self.name, "Double close")
+            }
+        }
+        tracing::debug!(gate = self.name, "Closed Gate.")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use futures::FutureExt;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn test_idle_gate() {
+        // With no guards ever taken, close should not be blocked
+        let gate = Gate::new("test".to_string());
+        gate.close().await;
+
+        // If a guard is dropped before close is called, close should not be blocked
+        let gate = Gate::new("test".to_string());
+        let guard = gate.enter().unwrap();
+        drop(guard);
+        gate.close().await;
+
+        // Entering a closed gate fails
+        gate.enter().expect_err("enter should fail after close");
+    }
+
+    #[tokio::test]
+    async fn test_busy_gate() {
+        let gate = Gate::new("test".to_string());
+
+        let guard = gate.enter().unwrap();
+
+        let mut close_fut = std::pin::pin!(gate.close());
+
+        // Close should be blocked
+        assert!(close_fut.as_mut().now_or_never().is_none());
+
+        // Attempting to enter() should fail, even though close isn't done yet.
+        gate.enter()
+            .expect_err("enter should fail after entering close");
+
+        drop(guard);
+
+        // Guard is gone, close should finish
+        assert!(close_fut.as_mut().now_or_never().is_some());
+
+        // Attempting to enter() is still forbidden
+        gate.enter().expect_err("enter should fail finishing close");
+    }
+}
--- a/libs/utils/src/timeout.rs
+++ b/libs/utils/src/timeout.rs
@@ -0,0 +1,37 @@
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+
+pub enum TimeoutCancellableError {
+    Timeout,
+    Cancelled,
+}
+
+/// Wrap [`tokio::time::timeout`] with a CancellationToken.
+///
+/// This wrapper is appropriate for any long running operation in a task
+/// that ought to respect a CancellationToken (which means most tasks).
+///
+/// The only time you should use a bare tokio::timeout is when the future `F`
+/// itself respects a CancellationToken: otherwise, always use this wrapper
+/// with your CancellationToken to ensure that your task does not hold up
+/// graceful shutdown.
+pub async fn timeout_cancellable<F>(
+    duration: Duration,
+    cancel: &CancellationToken,
+    future: F,
+) -> Result<F::Output, TimeoutCancellableError>
+where
+    F: std::future::Future,
+{
+    tokio::select!(
+        r = tokio::time::timeout(duration, future) => {
+            r.map_err(|_| TimeoutCancellableError::Timeout)
+
+        },
+        _ = cancel.cancelled() => {
+            Err(TimeoutCancellableError::Cancelled)
+
+        }
+    )
+}
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -19,13 +19,12 @@ inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [target.'cfg(target_os = "linux")'.dependencies]
 cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 #![cfg(target_os = "linux")]

 use anyhow::Context;
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -188,6 +188,7 @@ extern "C" fn recovery_download(
    }
 }

+#[allow(clippy::unnecessary_cast)]
 extern "C" fn wal_read(
    sk: *mut Safekeeper,
    buf: *mut ::std::os::raw::c_char,
@@ -421,6 +422,7 @@ impl std::fmt::Display for Level {
 }

 /// Take ownership of `Vec<u8>` from StringInfoData.
+#[allow(clippy::unnecessary_cast)]
 pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
    if pg.data.is_null() {
        return None;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -186,7 +186,7 @@ impl Wrapper {
            .unwrap()
            .into_bytes_with_nul();
        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
-        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
+        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;

        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -68,7 +68,6 @@ url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
 pageserver_api.workspace = true
-pageserver_compaction.workspace = true
 postgres_connection.workspace = true
 postgres_ffi.workspace = true
 pq_proto.workspace = true
--- a/pageserver/compaction/Cargo.toml
+++ b/pageserver/compaction/Cargo.toml
@@ -1,53 +0,0 @@
-[package]
-name = "pageserver_compaction"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-[features]
-default = []
-
-[dependencies]
-anyhow.workspace = true
-async-compression.workspace = true
-async-stream.workspace = true
-async-trait.workspace = true
-byteorder.workspace = true
-bytes.workspace = true
-chrono = { workspace = true, features = ["serde"] }
-clap = { workspace = true, features = ["string"] }
-const_format.workspace = true
-consumption_metrics.workspace = true
-crossbeam-utils.workspace = true
-either.workspace = true
-flate2.workspace = true
-fail.workspace = true
-futures.workspace = true
-git-version.workspace = true
-hex.workspace = true
-humantime.workspace = true
-humantime-serde.workspace = true
-itertools.workspace = true
-once_cell.workspace = true
-pin-project-lite.workspace = true
-rand.workspace = true
-smallvec = { workspace = true, features = ["write"] }
-svg_fmt.workspace = true
-sync_wrapper.workspace = true
-thiserror.workspace = true
-tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
-tokio-io-timeout.workspace = true
-tokio-util.workspace = true
-tracing.workspace = true
-tracing-error.workspace = true
-tracing-subscriber.workspace = true
-url.workspace = true
-walkdir.workspace = true
-metrics.workspace = true
-utils.workspace = true
-workspace_hack.workspace = true
-
-[dev-dependencies]
-criterion.workspace = true
-hex-literal.workspace = true
-tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
--- a/pageserver/compaction/TODO.md
+++ b/pageserver/compaction/TODO.md
@@ -1,49 +0,0 @@
-# TODO
-
- If the key space can be perfectly partitioned at some key, perform planning on each
-  partition separately. For example, if we are compacting a level with layers like this:
-
-              :
-  +--+ +----+ :  +------+
-  |  | |    | :  |      |
-  +--+ +----+ :  +------+
-              :
-  +-----+ +-+ : +--------+
-  |     | | | : |        |
-  +-----+ +-+ : +--------+
-              :
-
-  At the dotted line, there is a natural split in the key space, such that all
-  layers are either on the left or the right of it. We can compact the
-  partitions separately.  We could choose to create image layers for one
-  partition but not the other one, for example.
-
- All the layers don't have to be exactly the same size, we can choose to cut a
-  layer short or stretch it a little larger than the target size, if it helps
-  the overall system. We can help perfect partitions (see previous bullet point)
-  to happen more frequently, by choosing the cut points wisely. For example, try
-  to cut layers at boundaries of underlying image layers. And "snap to grid",
-  i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.
-
- Avoid rewriting layers when we'd just create an identical layer to an input
-  layer.
-
- Parallelism. The code is already split up into planning and execution, so that
-  we first split up the compaction work into "Jobs", and then execute them.
-  It would be straightforward to execute multiple jobs in parallel.
-
- Materialize extra pages in delta layers during compaction. This would reduce
-  read amplification. There has been the idea of partial image layers. Materializing
-  extra pages in the delta layers achieves the same goal, without introducing a new
-  concept.
-
-## Simulator
-
- Expand the simulator for more workloads
- Automate a test suite that runs the simulator with different workloads and
-  spits out a table of results
- Model read amplification
- More sanity checking. One idea is to keep a reference count of each
-  MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
-  a MockRecord that is newer than PITR horizon is completely dropped. That would
-  indicate that the record was lost.
--- a/pageserver/compaction/src/bin/compaction-simulator.rs
+++ b/pageserver/compaction/src/bin/compaction-simulator.rs
@@ -1,214 +0,0 @@
-use clap::{Parser, Subcommand};
-use pageserver_compaction::simulator::MockTimeline;
-use rand::Rng;
-use std::io::Write;
-use std::path::{Path, PathBuf};
-use std::sync::OnceLock;
-
-use utils::project_git_version;
-
-project_git_version!(GIT_VERSION);
-
-#[derive(Parser)]
-#[command(
-    version = GIT_VERSION,
-    about = "Neon Pageserver compaction simulator",
-    long_about = "A developer tool to visualize and test compaction"
-)]
-#[command(propagate_version = true)]
-struct CliOpts {
-    #[command(subcommand)]
-    command: Commands,
-}
-
-#[derive(Subcommand)]
-enum Commands {
-    RunSuite,
-    Simulate(SimulateCmd),
-}
-
-#[derive(Clone, clap::ValueEnum)]
-enum Distribution {
-    Uniform,
-    HotCold,
-}
-
-/// Run one compaction simulation over a synthetic workload
-#[derive(Parser)]
-struct SimulateCmd {
-    distribution: Distribution,
-
-    /// Number of records to digest
-    num_records: u64,
-    /// Record length
-    record_len: u64,
-
-    /// Logical database size in MB
-    logical_size: u64,
-}
-
-async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
-    let mut executor = MockTimeline::new();
-
-    // Convert the logical size in MB into a key range.
-    let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
-    //let key_range = u64::MIN..u64::MAX;
-    println!(
-        "starting simulation with key range {:016X}-{:016X}",
-        key_range.start, key_range.end
-    );
-
-    // helper function to print progress indicator
-    let print_progress = |i| -> anyhow::Result<()> {
-        if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 {
-            print!(
-                "\ringested {} / {} records, {} MiB / {} MiB...",
-                i + 1,
-                cmd.num_records,
-                (i + 1) * cmd.record_len / (1_000_000),
-                cmd.num_records * cmd.record_len / (1_000_000),
-            );
-            std::io::stdout().flush()?;
-        }
-        Ok(())
-    };
-
-    match cmd.distribution {
-        Distribution::Uniform => {
-            for i in 0..cmd.num_records {
-                executor.ingest_uniform(1, cmd.record_len, &key_range)?;
-                executor.compact_if_needed().await?;
-
-                print_progress(i)?;
-            }
-        }
-        Distribution::HotCold => {
-            let splitpoint = key_range.end / 10;
-            let hot_key_range = 0..splitpoint;
-            let cold_key_range = splitpoint..key_range.end;
-
-            for i in 0..cmd.num_records {
-                let chosen_range = if rand::thread_rng().gen_bool(0.9) {
-                    &hot_key_range
-                } else {
-                    &cold_key_range
-                };
-                executor.ingest_uniform(1, cmd.record_len, chosen_range)?;
-                executor.compact_if_needed().await?;
-
-                print_progress(i)?;
-            }
-        }
-    }
-    println!("done!");
-    executor.flush_l0();
-    executor.compact_if_needed().await?;
-    let stats = executor.print_stats()?;
-
-    // Print the stats to stdout, and also to a file
-    print!("{}", stats);
-    std::fs::write(results_path.join("stats.txt"), stats)?;
-
-    let animation_path = results_path.join("compaction-animation.html");
-    executor.draw_history(std::fs::File::create(&animation_path)?)?;
-    println!(
-        "animation: file://{}",
-        animation_path.canonicalize()?.display()
-    );
-
-    Ok(())
-}
-
-async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> {
-    std::fs::create_dir(results_path)?;
-
-    set_log_file(File::create(results_path.join("log"))?);
-    let result = simulate(workload, results_path).await;
-    set_log_stdout();
-    result
-}
-
-async fn run_suite() -> anyhow::Result<()> {
-    let top_results_path = PathBuf::from(format!(
-        "compaction-suite-results.{}",
-        std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs()
-    ));
-    std::fs::create_dir(&top_results_path)?;
-
-    let workload = SimulateCmd {
-        distribution: Distribution::Uniform,
-        // Generate 20 GB of WAL
-        record_len: 1_000,
-        num_records: 20_000_000,
-        // Logical size 5 GB
-        logical_size: 5_000,
-    };
-
-    run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?;
-
-    println!(
-        "All tests finished. Results in {}",
-        top_results_path.display()
-    );
-    Ok(())
-}
-
-use std::fs::File;
-use std::io::Stdout;
-use std::sync::Mutex;
-use tracing_subscriber::fmt::writer::EitherWriter;
-use tracing_subscriber::fmt::MakeWriter;
-
-static LOG_FILE: OnceLock<Mutex<EitherWriter<File, Stdout>>> = OnceLock::new();
-fn get_log_output() -> &'static Mutex<EitherWriter<File, Stdout>> {
-    LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout())))
-}
-
-fn set_log_file(f: File) {
-    *get_log_output().lock().unwrap() = EitherWriter::A(f);
-}
-
-fn set_log_stdout() {
-    *get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout());
-}
-
-fn init_logging() -> anyhow::Result<()> {
-    // We fall back to printing all spans at info-level or above if
-    // the RUST_LOG environment variable is not set.
-    let rust_log_env_filter = || {
-        tracing_subscriber::EnvFilter::try_from_default_env()
-            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
-    };
-
-    // NB: the order of the with() calls does not matter.
-    // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
-    use tracing_subscriber::prelude::*;
-    tracing_subscriber::registry()
-        .with({
-            let log_layer = tracing_subscriber::fmt::layer()
-                .with_target(false)
-                .with_ansi(false)
-                .with_writer(|| get_log_output().make_writer());
-            log_layer.with_filter(rust_log_env_filter())
-        })
-        .init();
-
-    Ok(())
-}
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    let cli = CliOpts::parse();
-
-    init_logging()?;
-
-    match cli.command {
-        Commands::Simulate(cmd) => {
-            simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?;
-        }
-        Commands::RunSuite => {
-            run_suite().await?;
-        }
-    };
-    Ok(())
-}
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -1,870 +0,0 @@
-//! # Tiered compaction algorithm.
-//!
-//! Read all the input delta files, and write a new set of delta files that
-//! include all the input WAL records. See retile_deltas().
-//!
-//! In a "normal" LSM tree, you get to remove any values that are overwritten by
-//! later values, but in our system, we keep all the history. So the reshuffling
-//! doesn't remove any garbage, it just reshuffles the records to reduce read
-//! amplification, i.e. the number of files that you need to access to find the
-//! WAL records for a given key.
-//!
-//! If the new delta files would be very "narrow", i.e. each file would cover
-//! only a narrow key range, then we create a new set of image files
-//! instead. The current threshold is that if the estimated total size of the
-//! image layers is smaller than the size of the deltas, then we create image
-//! layers. That amounts to 2x storage amplification, and it means that the
-//! distance of image layers in LSN dimension is roughly equal to the logical
-//! database size. For example, if the logical database size is 10 GB, we would
-//! generate new image layers every 10 GB of WAL.
-//!
-use futures::StreamExt;
-use tracing::{debug, info};
-
-use std::collections::{HashSet, VecDeque};
-use std::ops::Range;
-
-use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
-use crate::interface::*;
-use utils::lsn::Lsn;
-
-use crate::identify_levels::identify_level;
-
-/// Main entry point to compaction.
-///
-/// The starting point is a cutoff LSN (`end_lsn`). Compaction is run on
-/// everything below that point that needs it. The cutoff LSN must partition
-/// the layers so that no layer spans across it. To start compaction at the
-/// top of the tree, pass the end LSN of the last written L0 layer. Panics if
-/// `fanout < 2`; errors from the executor are propagated to the caller.
-pub async fn compact_tiered<E: CompactionJobExecutor>(
-    executor: &mut E,
-    end_lsn: Lsn,
-    target_file_size: u64,
-    fanout: u64,
-    ctx: &E::RequestContext,
-) -> anyhow::Result<()> {
-    assert!(fanout >= 2);
-    let mut current_level_no = 0; // start at L0
-    let mut current_level_target_height = target_file_size;
-    loop {
-        // end LSN +1 to include possible image layers exactly at 'end_lsn'.
-        let all_layers = executor
-            .get_layers(
-                &(E::Key::MIN..E::Key::MAX),
-                &(Lsn(u64::MIN)..end_lsn + 1),
-                ctx,
-            )
-            .await?;
-        info!(
-            "Compacting L{}, total # of layers: {}",
-            current_level_no,
-            all_layers.len()
-        );
-
-        // Identify the range of LSNs that belong to this level. We assume that
-        // each file in this level spans an LSN range up to 1.75x target file
-        // size. That should give us enough slop that if we created a slightly
-        // oversized L0 layer, e.g. because flushing the in-memory layer was
-        // delayed for some reason, we don't consider the oversized layer to
-        // belong to L1. But not too much slop, that we don't accidentally
-        // "skip" levels.
-        let max_height = (current_level_target_height as f64 * 1.75) as u64;
-        let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
-            break;
-        };
-
-        // Calculate the height of this level. If the # of tiers exceeds the
-        // fanout parameter, it's time to compact it.
-        let depth = level.depth();
-        info!(
-            "Level {} identified as LSN range {}-{}: depth {}",
-            current_level_no, level.lsn_range.start, level.lsn_range.end, depth
-        );
-        for l in &level.layers {
-            debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
-        }
-        if depth < fanout {
-            debug!(
-                level = current_level_no,
-                depth = depth,
-                fanout,
-                "too few deltas to compact"
-            );
-            break;
-        }
-
-        compact_level(
-            &level.lsn_range,
-            &level.layers,
-            executor,
-            target_file_size,
-            ctx,
-        )
-        .await?;
-        // Height saturated: this level already spanned every LSN, so stop.
-        if current_level_target_height == u64::MAX {
-            break;
-        }
-        current_level_no += 1;
-        current_level_target_height = current_level_target_height.saturating_mul(fanout);
-    }
-    Ok(())
-}
-
-async fn compact_level<E: CompactionJobExecutor>(
-    lsn_range: &Range<Lsn>,
-    layers: &[E::Layer],
-    executor: &mut E,
-    target_file_size: u64,
-    ctx: &E::RequestContext,
-) -> anyhow::Result<bool> {
-    let mut layer_fragments = Vec::new();
-    for l in layers {
-        layer_fragments.push(LayerFragment::new(l.clone()));
-    }
-
-    let mut state = LevelCompactionState {
-        target_file_size,
-        _lsn_range: lsn_range.clone(),
-        layers: layer_fragments,
-        jobs: Vec::new(),
-        job_queue: Vec::new(),
-        next_level: false,
-        executor,
-    };
-
-    let first_job = CompactionJob {
-        key_range: E::Key::MIN..E::Key::MAX,
-        lsn_range: lsn_range.clone(),
-        strategy: CompactionStrategy::Divide,
-        input_layers: state
-            .layers
-            .iter()
-            .enumerate()
-            .map(|i| LayerId(i.0))
-            .collect(),
-        completed: false,
-    };
-
-    state.jobs.push(first_job);
-    state.job_queue.push(JobId(0));
-    state.execute(ctx).await?;
-
-    info!(
-        "compaction completed! Need to process next level: {}",
-        state.next_level
-    );
-
-    Ok(state.next_level)
-}
-
-/// Blackboard that keeps track of the state of all the jobs and work remaining
-struct LevelCompactionState<'a, E>
-where
-    E: CompactionJobExecutor,
-{
-    // parameters
-    target_file_size: u64,
-
-    _lsn_range: Range<Lsn>,
-    layers: Vec<LayerFragment<E>>,
-
-    // job queue
-    jobs: Vec<CompactionJob<E>>,
-    job_queue: Vec<JobId>,
-
-    /// If false, no need to compact levels below this
-    next_level: bool,
-
-    /// Interface to the outside world
-    executor: &'a mut E,
-}
-
-#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
-struct LayerId(usize);
-#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
-struct JobId(usize);
-
-struct PendingJobSet {
-    pending: HashSet<JobId>,
-    completed: HashSet<JobId>,
-}
-
-impl PendingJobSet {
-    fn new() -> Self {
-        PendingJobSet {
-            pending: HashSet::new(),
-            completed: HashSet::new(),
-        }
-    }
-
-    fn complete_job(&mut self, job_id: JobId) {
-        self.pending.remove(&job_id);
-        self.completed.insert(job_id);
-    }
-
-    fn all_completed(&self) -> bool {
-        self.pending.is_empty()
-    }
-}
-
-// When we decide to rewrite a set of layers, LayerFragment is used to keep
-// track which new layers supersede an old layer. When all the stakeholder jobs
-// have completed, this layer can be deleted.
-struct LayerFragment<E>
-where
-    E: CompactionJobExecutor,
-{
-    layer: E::Layer,
-
-    // If we will write new layers to replace this one, this keeps track of the
-    // jobs that need to complete before this layer can be deleted. As the jobs
-    // complete, they are moved from 'pending' to 'completed' set. Once the
-    // 'pending' set becomes empty, the layer can be deleted.
-    //
-    // If None, this layer is not rewritten and must not be deleted.
-    deletable_after: Option<PendingJobSet>,
-
-    deleted: bool,
-}
-
-impl<E> LayerFragment<E>
-where
-    E: CompactionJobExecutor,
-{
-    fn new(layer: E::Layer) -> Self {
-        LayerFragment {
-            layer,
-            deletable_after: None,
-            deleted: false,
-        }
-    }
-}
-
-#[derive(PartialEq)]
-enum CompactionStrategy {
-    Divide,
-    CreateDelta,
-    CreateImage,
-}
-
-#[allow(dead_code)] // Todo
-struct CompactionJob<E: CompactionJobExecutor> {
-    key_range: Range<E::Key>,
-    lsn_range: Range<Lsn>,
-
-    strategy: CompactionStrategy,
-
-    input_layers: Vec<LayerId>,
-
-    completed: bool,
-}
-
-impl<'a, E> LevelCompactionState<'a, E>
-where
-    E: CompactionJobExecutor,
-{
-    /// Main loop of the executor.
-    ///
-    /// In each iteration, we take the next job from the queue, and execute it.
-    /// The execution might add new jobs to the queue. Keep going until the
-    /// queue is empty.
-    ///
-    /// Initially, the job queue consists of one Divide job over the whole
-    /// level. On first call, it is divided into smaller jobs.
-    ///
-    async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
-        // TODO: this would be pretty straightforward to parallelize with FuturesUnordered
-        while let Some(next_job_id) = self.job_queue.pop() {
-            info!("executing job {}", next_job_id.0);
-            self.execute_job(next_job_id, ctx).await?;
-        }
-
-        // all done!
-        Ok(())
-    }
-
-    async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
-        let job = &self.jobs[job_id.0];
-        match job.strategy {
-            CompactionStrategy::Divide => {
-                self.divide_job(job_id, ctx).await?;
-                Ok(())
-            }
-            CompactionStrategy::CreateDelta => {
-                let mut deltas: Vec<E::DeltaLayer> = Vec::new();
-                let mut layer_ids: Vec<LayerId> = Vec::new();
-                for layer_id in &job.input_layers {
-                    let layer = &self.layers[layer_id.0].layer;
-                    if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
-                        deltas.push(dl.clone());
-                        layer_ids.push(*layer_id);
-                    }
-                }
-
-                self.executor
-                    .create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
-                    .await?;
-                self.jobs[job_id.0].completed = true;
-
-                // did we complete any fragments?
-                for layer_id in layer_ids {
-                    let l = &mut self.layers[layer_id.0];
-                    if let Some(deletable_after) = l.deletable_after.as_mut() {
-                        deletable_after.complete_job(job_id);
-                        if deletable_after.all_completed() {
-                            self.executor.delete_layer(&l.layer, ctx).await?;
-                            l.deleted = true;
-                        }
-                    }
-                }
-
-                self.next_level = true;
-
-                Ok(())
-            }
-            CompactionStrategy::CreateImage => {
-                self.executor
-                    .create_image(job.lsn_range.end, &job.key_range, ctx)
-                    .await?;
-                self.jobs[job_id.0].completed = true;
-
-                // TODO: we could check if any layers < PITR horizon became deletable
-                Ok(())
-            }
-        }
-    }
-
-    fn push_job(&mut self, job: CompactionJob<E>) -> JobId {
-        let job_id = JobId(self.jobs.len());
-        self.jobs.push(job);
-        self.job_queue.push(job_id);
-        job_id
-    }
-
-    ///
-    /// Take a partition of the key space, and decide how to compact it.
-    ///
-    /// TODO: Currently, this is called exactly once for the level, and we
-    /// decide whether to create new image layers to cover the whole level, or
-    /// write a new set of delta. In the future, this should try to partition
-    /// the key space, and make the decision separately for each partition.
-    ///
-    async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
-        let job = &self.jobs[job_id.0];
-        assert!(job.strategy == CompactionStrategy::Divide);
-
-        // Check for dummy cases
-        if job.input_layers.is_empty() {
-            return Ok(());
-        }
-
-        let job = &self.jobs[job_id.0];
-        assert!(job.strategy == CompactionStrategy::Divide);
-
-        // Would it be better to create images for this partition?
-        // Decide based on the average density of the level
-        let keyspace_size = keyspace_total_size(
-            &self
-                .executor
-                .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
-                .await?,
-        ) * 8192;
-
-        let wal_size = job
-            .input_layers
-            .iter()
-            .filter(|layer_id| self.layers[layer_id.0].layer.is_delta())
-            .map(|layer_id| self.layers[layer_id.0].layer.file_size())
-            .sum::<u64>();
-        if keyspace_size < wal_size {
-            // seems worth it
-            info!(
-                "covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}",
-                keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size
-            );
-            self.cover_with_images(job_id, ctx).await
-        } else {
-            // do deltas
-            info!(
-                "coverage not worth it, keyspace_size {}, wal_size {}",
-                keyspace_size, wal_size
-            );
-            self.retile_deltas(job_id, ctx).await
-        }
-    }
-
-    // LSN
-    //  ^
-    //  |
-    //  |                          ###|###|#####
-    //  | +--+-----+--+            +--+-----+--+
-    //  | |  |     |  |            |  |     |  |
-    //  | +--+--+--+--+            +--+--+--+--+
-    //  | |     |     |            |     |     |
-    //  | +---+-+-+---+     ==>    +---+-+-+---+
-    //  | |   |   |   |            |   |   |   |
-    //  | +---+-+-++--+            +---+-+-++--+
-    //  | |     |  |  |            |     |  |  |
-    //  | +-----+--+--+            +-----+--+--+
-    //  |
-    //  +--------------> key
-    //
-    async fn cover_with_images(
-        &mut self,
-        job_id: JobId,
-        ctx: &E::RequestContext,
-    ) -> anyhow::Result<()> {
-        let job = &self.jobs[job_id.0];
-        assert!(job.strategy == CompactionStrategy::Divide);
-
-        // XXX: do we still need the "holes" stuff?
-
-        let mut new_jobs = Vec::new();
-
-        // Slide a window through the keyspace
-        let keyspace = self
-            .executor
-            .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
-            .await?;
-
-        let mut window = KeyspaceWindow::new(
-            E::Key::MIN..E::Key::MAX,
-            keyspace,
-            self.target_file_size / 8192,
-        );
-        while let Some(key_range) = window.choose_next_image() {
-            new_jobs.push(CompactionJob::<E> {
-                key_range,
-                lsn_range: job.lsn_range.clone(),
-                strategy: CompactionStrategy::CreateImage,
-                input_layers: Vec::new(), // XXX: Is it OK for  this to be empty for image layer?
-                completed: false,
-            });
-        }
-
-        for j in new_jobs.into_iter().rev() {
-            let _job_id = self.push_job(j);
-
-            // TODO: image layers don't let us delete anything. unless < PITR horizon
-            //let j = &self.jobs[job_id.0];
-            // for layer_id in j.input_layers.iter() {
-            //    self.layers[layer_id.0].pending_stakeholders.insert(job_id);
-            //}
-        }
-
-        Ok(())
-    }
-
-    // Merge the contents of all the input delta layers into a new set
-    // of delta layers, based on the current partitioning.
-    //
-    // We split the new delta layers on the key dimension. We iterate through
-    // the key space, and for each key, check if including the next key to the
-    // current output layer we're building would cause the layer to become too
-    // large. If so, dump the current output layer and start new one.  It's
-    // possible that there is a single key with so many page versions that
-    // storing all of them in a single layer file would be too large. In that
-    // case, we also split on the LSN dimension.
-    //
-    // LSN
-    //  ^
-    //  |
-    //  | +-----------+            +--+--+--+--+
-    //  | |           |            |  |  |  |  |
-    //  | +-----------+            |  |  |  |  |
-    //  | |           |            |  |  |  |  |
-    //  | +-----------+     ==>    |  |  |  |  |
-    //  | |           |            |  |  |  |  |
-    //  | +-----------+            |  |  |  |  |
-    //  | |           |            |  |  |  |  |
-    //  | +-----------+            +--+--+--+--+
-    //  |
-    //  +--------------> key
-    //
-    //
-    // If one key (X) has a lot of page versions:
-    //
-    // LSN
-    //  ^
-    //  |                                 (X)
-    //  | +-----------+            +--+--+--+--+
-    //  | |           |            |  |  |  |  |
-    //  | +-----------+            |  |  +--+  |
-    //  | |           |            |  |  |  |  |
-    //  | +-----------+     ==>    |  |  |  |  |
-    //  | |           |            |  |  +--+  |
-    //  | +-----------+            |  |  |  |  |
-    //  | |           |            |  |  |  |  |
-    //  | +-----------+            +--+--+--+--+
-    //  |
-    //  +--------------> key
-    //
-    // TODO: this actually divides the layers into fixed-size chunks, not
-    // based on the partitioning.
-    //
-    // TODO: we should also opportunistically materialize and
-    // garbage collect what we can.
-    async fn retile_deltas(
-        &mut self,
-        job_id: JobId,
-        ctx: &E::RequestContext,
-    ) -> anyhow::Result<()> {
-        let job = &self.jobs[job_id.0];
-        assert!(job.strategy == CompactionStrategy::Divide);
-
-        // Sweep the key space left to right, running an estimate of how much
-        // disk size and keyspace we have accumulated
-        //
-        // Once the disk size reaches the target threshold, stop and think.
-        // If we have accumulated only a narrow band of keyspace, create an
-        // image layer. Otherwise write a delta layer.
-
-        // FIXME: deal with the case of lots of values for same key
-
-        // FIXME: we are ignoring images here. Did we already divide the work
-        // so that we won't encounter them here?
-
-        let mut deltas: Vec<E::DeltaLayer> = Vec::new();
-        for layer_id in &job.input_layers {
-            let l = &self.layers[layer_id.0];
-            if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
-                deltas.push(dl.clone());
-            }
-        }
-        // Open stream
-        let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
-        let mut new_jobs = Vec::new();
-
-        // Slide a window through the keyspace
-        let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
-        let mut all_in_window: bool = false;
-        let mut window = Window::new();
-        loop {
-            if all_in_window && window.elems.is_empty() {
-                // All done!
-                break;
-            }
-            if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
-            {
-                let batch_layers: Vec<LayerId> = job
-                    .input_layers
-                    .iter()
-                    .filter(|layer_id| {
-                        overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
-                    })
-                    .cloned()
-                    .collect();
-                assert!(!batch_layers.is_empty());
-                new_jobs.push(CompactionJob {
-                    key_range,
-                    lsn_range: job.lsn_range.clone(),
-                    strategy: CompactionStrategy::CreateDelta,
-                    input_layers: batch_layers,
-                    completed: false,
-                });
-            } else {
-                assert!(!all_in_window);
-                if let Some(next_key) = key_accum.next().await.transpose()? {
-                    window.feed(next_key.key, next_key.size);
-                } else {
-                    all_in_window = true;
-                }
-            }
-        }
-
-        // All the input files are rewritten. Set up the tracking for when they can
-        // be deleted.
-        for layer_id in job.input_layers.iter() {
-            let l = &mut self.layers[layer_id.0];
-            assert!(l.deletable_after.is_none());
-            l.deletable_after = Some(PendingJobSet::new());
-        }
-        for j in new_jobs.into_iter().rev() {
-            let job_id = self.push_job(j);
-            let j = &self.jobs[job_id.0];
-            for layer_id in j.input_layers.iter() {
-                self.layers[layer_id.0]
-                    .deletable_after
-                    .as_mut()
-                    .unwrap()
-                    .pending
-                    .insert(job_id);
-            }
-        }
-
-        Ok(())
-    }
-}
-
-// Sliding window through keyspace and values
-// This is used by cover_with_images to decide on good split points
-struct KeyspaceWindow<K> {
-    head: KeyspaceWindowHead<K>,
-
-    start_pos: KeyspaceWindowPos<K>,
-}
-struct KeyspaceWindowHead<K> {
-    // overall key range to cover
-    key_range: Range<K>,
-
-    keyspace: Vec<Range<K>>,
-    target_keysize: u64,
-}
-
-#[derive(Clone)]
-struct KeyspaceWindowPos<K> {
-    end_key: K,
-
-    keyspace_idx: usize,
-
-    accum_keysize: u64,
-}
-impl<K: CompactionKey> KeyspaceWindowPos<K> {
-    fn reached_end(&self, w: &KeyspaceWindowHead<K>) -> bool {
-        self.keyspace_idx == w.keyspace.len()
-    }
-
-    // Advance the cursor until 'accum_keysize' reaches 'max_size' or the keyspace ends.
-    fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
-        while self.accum_keysize < max_size && !self.reached_end(w) {
-            let curr_range = &w.keyspace[self.keyspace_idx];
-            if self.end_key < curr_range.start {
-                // skip over any unused space
-                self.end_key = curr_range.start;
-            }
-
-            // We're now within 'curr_range'. Can we advance past it completely?
-            let distance = K::key_range_size(&(self.end_key..curr_range.end));
-            if (self.accum_keysize + distance as u64) < max_size {
-                // oh yeah, it fits
-                self.end_key = curr_range.end;
-                self.keyspace_idx += 1;
-                self.accum_keysize += distance as u64;
-            } else {
-                // advance within the range
-                let skip_key = self.end_key.skip_some();
-                let distance = K::key_range_size(&(self.end_key..skip_key));
-                if (self.accum_keysize + distance as u64) < max_size {
-                    self.end_key = skip_key;
-                    self.accum_keysize += distance as u64;
-                } else {
-                    self.end_key = self.end_key.next();
-                    self.accum_keysize += 1;
-                }
-            }
-        }
-    }
-}
-
-impl<K> KeyspaceWindow<K>
-where
-    K: CompactionKey,
-{
-    fn new(key_range: Range<K>, keyspace: CompactionKeySpace<K>, target_keysize: u64) -> Self {
-        assert!(keyspace.first().unwrap().start >= key_range.start); // unwrap panics on an empty keyspace
-
-        let start_key = key_range.start;
-        let start_pos = KeyspaceWindowPos::<K> {
-            end_key: start_key, // the first returned range begins at key_range.start
-            keyspace_idx: 0,
-            accum_keysize: 0,
-        };
-        Self {
-            head: KeyspaceWindowHead::<K> {
-                key_range,
-                keyspace,
-                target_keysize,
-            },
-            start_pos,
-        }
-    }
-
-    fn choose_next_image(&mut self) -> Option<Range<K>> {
-        if self.start_pos.keyspace_idx == self.head.keyspace.len() {
-            // we've reached the end
-            return None;
-        }
-
-        let mut next_pos = self.start_pos.clone(); // candidate end, one target_keysize further on
-        next_pos.advance_until_size(
-            &self.head,
-            self.start_pos.accum_keysize + self.head.target_keysize,
-        );
-
-        // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
-        // 1.25x target size
-        let mut end_pos = next_pos.clone();
-        end_pos.advance_until_size(
-            &self.head,
-            self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
-        );
-        if end_pos.reached_end(&self.head) {
-            // gobble up any unused keyspace between the last used key and end of the range
-            assert!(end_pos.end_key <= self.head.key_range.end);
-            end_pos.end_key = self.head.key_range.end;
-            next_pos = end_pos;
-        }
-
-        let start_key = self.start_pos.end_key;
-        self.start_pos = next_pos; // the next call continues from where this range ends
-        Some(start_key..self.start_pos.end_key)
-    }
-}
-
-// Sliding window through keyspace and values
-//
-// This is used to decide what layer to write next, from the beginning of the window.
-//
-// Candidates:
-//
-// 1. Create an image layer, snapping to previous images
-// 2. Create a delta layer, snapping to previous images
-// 3. Create an image layer, snapping to
-//
-//
-
-// Take previous partitioning, based on the image layers below.
-//
-// Candidate is at the front:
-//
-// Consider stretching an image layer to next divider? If it's close enough,
-// that's the image candidate
-//
-// If it's too far, consider splitting at a reasonable point
-//
-// Is the image candidate smaller than the equivalent delta? If so,
-// split off the image. Otherwise, split off one delta.
-// Try to snap off the delta at a reasonable point
-
-struct WindowElement<K> {
-    start_key: K, // inclusive
-    last_key: K,  // inclusive
-    accum_size: u64, // running total of sizes fed to the window up to and including this element
-}
-struct Window<K> {
-    elems: VecDeque<WindowElement<K>>, // front is the current candidate; feed() appends at the back
-
-    // last key that was split off, inclusive
-    splitoff_key: Option<K>,
-    splitoff_size: u64, // accum_size of the last popped element; subtracted to get sizes since the split
-}
-
-impl<K> Window<K>
-where
-    K: CompactionKey,
-{
-    fn new() -> Self {
-        Self {
-            elems: VecDeque::new(),
-            splitoff_key: None,
-            splitoff_size: 0,
-        }
-    }
-
-    fn feed(&mut self, key: K, size: u64) {
-        let last_size;
-        if let Some(last) = self.elems.back_mut() {
-            assert!(last.last_key <= key); // keys must be fed in non-decreasing order
-            if key == last.last_key {
-                last.accum_size += size; // same key again: grow the existing element
-                return;
-            }
-            last_size = last.accum_size;
-        } else {
-            last_size = 0;
-        }
-        // This is a new key.
-        let elem = WindowElement {
-            start_key: key,
-            last_key: key,
-            accum_size: last_size + size,
-        };
-        self.elems.push_back(elem);
-    }
-
-    fn remain_size(&self) -> u64 {
-        self.elems.back().unwrap().accum_size - self.splitoff_size // everything still in the window; panics if empty
-    }
-
-    fn peek_size(&self) -> u64 {
-        self.elems.front().unwrap().accum_size - self.splitoff_size // front candidate only; panics if empty
-    }
-
-    fn commit_upto(&mut self, mut upto: usize) {
-        while upto > 1 {
-            let popped = self.elems.pop_front().unwrap();
-            self.elems.front_mut().unwrap().start_key = popped.start_key; // new front absorbs the popped key range
-            upto -= 1;
-        }
-    }
-
-    fn find_size_split(&self, target_size: u64) -> usize {
-        self.elems
-            .partition_point(|elem| elem.accum_size - self.splitoff_size < target_size)
-    }
-
-    fn pop(&mut self) {
-        let first = self.elems.pop_front().unwrap();
-        self.splitoff_size = first.accum_size;
-
-        self.splitoff_key = Some(first.last_key);
-    }
-
-    // the difference between delta and image is that an image covers
-    // any unused keyspace before and after, while a delta tries to
-    // minimize that. TODO: difference not implemented
-    fn pop_delta(&mut self) -> Range<K> {
-        let first = self.elems.front().unwrap();
-        let key_range = first.start_key..first.last_key.next(); // last_key is inclusive, so end at its successor
-
-        self.pop();
-        key_range
-    }
-
-    // Prerequisite: we have enough input in the window
-    //
-    // On return None, the caller should feed more data and call again
-    fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option<Range<K>> {
-        if has_more && self.elems.is_empty() {
-            // Starting up
-            return None;
-        }
-
-        // Keep going while the candidate is undersized. NOTE(review): peek_size() panics if empty and !has_more
-        while self.peek_size() < target_size {
-            if self.elems.len() > 1 {
-                self.commit_upto(2);
-            } else if has_more {
-                return None;
-            } else {
-                break;
-            }
-        }
-
-        // Ensure we have enough input in the window to make a good decision
-        if has_more && self.remain_size() < target_size * 5 / 4 {
-            return None;
-        }
-
-        // The candidate on the front is now large enough, for a delta.
-        // And we have enough data in the window to decide.
-
-        // If we're willing to stretch it, could we gobble up the rest of the
-        // work? This avoids creating very small "tail" layers at the end of the
-        // keyspace. NOTE(review): this said 1.25x, but the check uses 5/3 - confirm.
-        if !has_more && self.remain_size() < target_size * 5 / 3 {
-            self.commit_upto(self.elems.len());
-        } else {
-            let delta_split_at = self.find_size_split(target_size);
-            self.commit_upto(delta_split_at);
-
-            // If it's still not large enough, request the caller to fill the window
-            if self.elems.len() == 1 && has_more {
-                return None;
-            }
-        }
-        Some(self.pop_delta())
-    }
-}
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -1,251 +0,0 @@
-//! This file contains generic utility functions over the interface types,
-//! which could be handy for any compaction implementation.
-use crate::interface::*;
-
-use futures::future::BoxFuture;
-use futures::{Stream, StreamExt};
-use itertools::Itertools;
-use pin_project_lite::pin_project;
-use std::cmp::Ord;
-use std::collections::BinaryHeap;
-use std::collections::VecDeque;
-use std::future::Future;
-use std::ops::{DerefMut, Range};
-use std::pin::Pin;
-use std::task::Poll;
-
-pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
-where
-    K: CompactionKey,
-{
-    let mut total = 0;
-    for r in keyspace.iter() {
-        total += K::key_range_size(r) as u64; // sum of the per-range sizes, each widened to u64
-    }
-    total
-}
-
-pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
-    !(a.end <= b.start || b.end <= a.start) // half-open ranges: merely touching endpoints do not overlap
-}
-
-pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
-    let x = std::mem::take(a); // leaves `a` empty until it is overwritten at the end
-    let mut all_ranges_iter = [x.into_iter(), b.into_iter()]
-        .into_iter()
-        .kmerge_by(|a, b| a.start < b.start);
-    let mut ranges = Vec::new();
-    if let Some(first) = all_ranges_iter.next() {
-        let (mut start, mut end) = (first.start, first.end);
-
-        for r in all_ranges_iter {
-            assert!(r.start >= start); // presumably both inputs are sorted by start - verify against callers
-            if r.start > end {
-                ranges.push(start..end); // gap: emit the finished range and begin a new one
-                start = r.start;
-                end = r.end;
-            } else if r.end > end {
-                end = r.end; // overlapping or adjacent (r.start == end): extend the current range
-            }
-        }
-        ranges.push(start..end);
-    }
-    *a = ranges
-}
-
-pub fn intersect_keyspace<K: Ord + Clone + Copy>(
-    a: &CompactionKeySpace<K>,
-    r: &Range<K>,
-) -> CompactionKeySpace<K> {
-    let mut ranges: Vec<Range<K>> = Vec::new();
-
-    for x in a.iter() {
-        if x.end <= r.start {
-            continue; // entirely before `r`
-        }
-        if x.start >= r.end {
-            break; // NOTE(review): stopping here assumes `a` is sorted by start - confirm
-        }
-        ranges.push(x.clone())
-    }
-
-    // trim the ends: only the first and last kept ranges are clamped to `r`
-    if let Some(first) = ranges.first_mut() {
-        first.start = std::cmp::max(first.start, r.start);
-    }
-    if let Some(last) = ranges.last_mut() {
-        last.end = std::cmp::min(last.end, r.end);
-    }
-    ranges
-}
-
-/// Create a stream that iterates through all DeltaEntrys among all input
-/// layers, in key-lsn order.
-///
-/// This is public because the create_delta() implementation likely wants to use this too
-/// TODO: move to a more shared place
-pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
-    layers: &'a [E::DeltaLayer],
-    ctx: &'a E::RequestContext,
-) -> MergeDeltaKeys<'a, E> {
-    // Use a binary heap to merge the layers. Each input layer is initially
-    // represented by a LazyLoadLayer::Unloaded element, which uses the start of
-    // the layer's key range as the key. The first time a layer reaches the top
-    // of the heap, all the keys of the layer are loaded into a sorted vector.
-    //
-    // This helps to keep the memory usage reasonable: we only need to hold in
-    // memory the DeltaEntrys of the layers that overlap with the "current" key.
-    let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
-    for l in layers {
-        heap.push(LazyLoadLayer::Unloaded(l)); // nothing is loaded here; poll_next loads on demand
-    }
-    MergeDeltaKeys {
-        heap,
-        ctx,
-        load_future: None, // no load in flight yet
-    }
-}
-
-enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
-    Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>), // remaining entries; front is next
-    Unloaded(&'a E::DeltaLayer), // keys not loaded yet
-}
-impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
-    fn key(&self) -> E::Key {
-        match self {
-            Self::Loaded(entries) => entries.front().unwrap().key(), // NOTE(review): panics if a layer loaded zero entries
-            Self::Unloaded(dl) => dl.key_range().start, // an unloaded layer sorts by the start of its key range
-        }
-    }
-}
-impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
-    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-        // reverse order so that we get a min-heap
-        other.key().partial_cmp(&self.key())
-    }
-}
-impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
-    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        // reverse order so that we get a min-heap (BinaryHeap itself is a max-heap)
-        other.key().cmp(&self.key())
-    }
-}
-impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
-    fn eq(&self, other: &Self) -> bool {
-        self.key().eq(&other.key()) // equality looks at key() only, matching the orderings above
-    }
-}
-impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
-
-type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>; // here `E` is the entry type, not the executor
-
-// Stream returned by `merge_delta_keys`
-pin_project! {
-#[allow(clippy::type_complexity)]
-pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
-    heap: BinaryHeap<LazyLoadLayer<'a, E>>, // smallest key on top, via the reversed Ord of LazyLoadLayer
-
-    #[pin] // polled in place by poll_next; Some only while a layer's keys are being loaded
-    load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
-
-    ctx: &'a E::RequestContext, // handed to load_keys()
-}
-}
-
-impl<'a, E> Stream for MergeDeltaKeys<'a, E>
-where
-    E: CompactionJobExecutor + 'a,
-{
-    type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
-
-    fn poll_next(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
-        let mut this = self.project();
-        loop {
-            if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
-                // We are waiting for loading the keys to finish
-                match load_future.as_mut().poll(cx) {
-                    Poll::Ready(Ok(entries)) => {
-                        this.load_future.set(None);
-                        *this.heap.peek_mut().unwrap() =
-                            LazyLoadLayer::Loaded(VecDeque::from(entries)); // replaces the Unloaded top in place
-                    }
-                    Poll::Ready(Err(e)) => {
-                        return Poll::Ready(Some(Err(e))); // NOTE(review): load_future stays Some; polling again re-polls a finished future
-                    }
-                    Poll::Pending => {
-                        return Poll::Pending;
-                    }
-                }
-            }
-
-            // If the topmost layer in the heap hasn't been loaded yet, start
-            // loading it. Otherwise return the next entry from it and update
-            // the layer's position in the heap (this decreaseKey operation is
-            // performed implicitly when `top` is dropped).
-            if let Some(mut top) = this.heap.peek_mut() {
-                match top.deref_mut() {
-                    LazyLoadLayer::Unloaded(ref mut l) => {
-                        let fut = l.load_keys(this.ctx);
-                        this.load_future.set(Some(Box::pin(fut)));
-                        continue; // loop around to poll the future that was just stored
-                    }
-                    LazyLoadLayer::Loaded(ref mut entries) => {
-                        let result = entries.pop_front().unwrap();
-                        if entries.is_empty() {
-                            std::collections::binary_heap::PeekMut::pop(top); // exhausted layer leaves the heap
-                        }
-                        return Poll::Ready(Some(Ok(result)));
-                    }
-                }
-            } else {
-                return Poll::Ready(None); // heap is empty: every layer has been drained
-            }
-        }
-    }
-}
-
-// Per-key totals accumulated at key boundaries; the item type yielded by accum_key_values
-pub struct KeySize<K> {
-    pub key: K,
-    pub num_values: u64, // number of entries seen for this key
-    pub size: u64, // sum of size() over those entries
-}
-
-pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
-where
-    K: Eq,
-    I: Stream<Item = Result<D, E>>,
-    D: CompactionDeltaEntry<'a, K>,
-{
-    async_stream::try_stream! {
-        // Pin the input so it can be polled; the state is initialized from the first value
-        let mut input = std::pin::pin!(input);
-
-        if let Some(first) = input.next().await {
-            let first = first?;
-            let mut accum: KeySize<K> = KeySize {
-                key: first.key(),
-                num_values: 1,
-                size: first.size(),
-            };
-            while let Some(this) = input.next().await {
-                let this = this?;
-                if this.key() == accum.key {
-                    accum.size += this.size(); // only consecutive entries with the same key are merged
-                    accum.num_values += 1;
-                } else {
-                    yield accum; // key changed: emit the finished group
-                    accum = KeySize {
-                        key: this.key(),
-                        num_values: 1,
-                        size: this.size(),
-                    };
-                }
-            }
-            yield accum; // emit the last group; an empty input yields nothing
-        }
-    }
-}
--- a/pageserver/compaction/src/identify_levels.rs
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -1,376 +0,0 @@
-//! An LSM tree consists of multiple levels, each exponentially larger than the
-//! previous level. And each level consists of multiple "tiers". With tiered
-//! compaction, a level is compacted when it has accumulated more than N tiers,
-//! forming one tier on the next level.
-//!
-//! In the pageserver, we don't explicitly track the levels and tiers. Instead,
-//! we identify them by looking at the shapes of the layers. It's an easy task
-//! for a human, but it's not straightforward to come up with the exact
-//! rules. Especially if there are cases like interrupted, half-finished
-//! compactions, or highly skewed data distributions that have let us "skip"
-//! some levels. It's not critical to classify all cases correctly; at worst we
-//! delay some compaction work, and suffer from more read amplification, or we
-//! perform some unnecessary compaction work.
-//!
-//! `identify_level` performs that shape-matching.
-//!
-//! It returns a Level struct, which has `depth()` function to count the number
-//! of "tiers" in the level. The tier count is the max depth of stacked layers
-//! within the level. That's a good measure, because the point of compacting is
-//! to reduce read amplification, and the depth is what determines that.
-//!
-//! One interesting effect of this is that if we generate very small delta
-//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
-//! because they reach the target size, the L0 compaction will combine them to
-//! one larger file. But if the combined file is still smaller than the target
-//! file size, the file will still be considered to be part of L0 at the next
-//! iteration.
-
-use anyhow::bail;
-use std::collections::BTreeSet;
-use std::ops::Range;
-use utils::lsn::Lsn;
-
-use crate::interface::*;
-
-use tracing::{info, trace};
-
-pub struct Level<L> {
-    pub lsn_range: Range<Lsn>, // identify_level() sets this to <chosen start LSN>..end_lsn
-    pub layers: Vec<L>, // the layers identify_level() accepted into this level
-}
-
-/// Identify an LSN < `end_lsn` that partitions the LSN space, so that there are
-/// no delta layers that cross the boundary LSN.
-///
-/// A further restriction is that all delta layers in the returned partition cover at
-/// most 'lsn_max_size' LSN bytes.
-pub async fn identify_level<K, L>(
-    all_layers: Vec<L>,
-    end_lsn: Lsn,
-    lsn_max_size: u64,
-) -> anyhow::Result<Option<Level<L>>>
-where
-    K: CompactionKey,
-    L: CompactionLayer<K> + Clone,
-{
-    // filter out layers that are above the `end_lsn`, they are completely irrelevant.
-    let mut layers = Vec::new();
-    for l in all_layers {
-        if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
-            // shouldn't happen. Indicates that the caller passed a bogus
-            // end_lsn.
-            bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
-        }
-        // include image layers sitting exactly at `end_lsn`.
-        let is_image = !l.is_delta();
-        if (is_image && l.lsn_range().start > end_lsn)
-            || (!is_image && l.lsn_range().start >= end_lsn)
-        {
-            continue;
-        }
-        layers.push(l);
-    }
-    // All the remaining layers either belong to this level, or are below it.
-    info!(
-        "identify level at {}, size {}, num layers below: {}",
-        end_lsn,
-        lsn_max_size,
-        layers.len()
-    );
-    if layers.is_empty() {
-        return Ok(None);
-    }
-
-    // Walk the ranges in LSN order.
-    //
-    // ----- end_lsn
-    //  |
-    //  |
-    //  v
-    //
-    layers.sort_by_key(|l| l.lsn_range().end); // ascending; next_back() below visits the highest end LSN first
-    let mut candidate_start_lsn = end_lsn;
-    let mut candidate_layers: Vec<L> = Vec::new();
-    let mut current_best_start_lsn = end_lsn;
-    let mut current_best_layers: Vec<L> = Vec::new();
-    let mut iter = layers.into_iter();
-    loop {
-        let Some(l) = iter.next_back() else {
-            // Reached end. Accept the last candidate
-            current_best_start_lsn = candidate_start_lsn;
-            current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
-            break;
-        };
-        trace!(
-            "inspecting {} for candidate {}, current best {}",
-            l.short_id(),
-            candidate_start_lsn,
-            current_best_start_lsn
-        );
-
-        let r = l.lsn_range();
-
-        // Image layers don't restrict our choice of cutoff LSN
-        if l.is_delta() {
-            // Is this candidate workable? In other words, are there any
-            // delta layers that span across this LSN?
-            //
-            // Valid:                 Not valid:
-            //  +                     +
-            //  |                     | +
-            //  +  <- candidate       + |   <- candidate
-            //     +                    +
-            //     |
-            //     +
-            if r.end <= candidate_start_lsn {
-                // Hooray, there are no crossing LSNs. And we have visited
-                // through all the layers within candidate..end_lsn. The
-                // current candidate can be accepted.
-                current_best_start_lsn = r.end;
-                current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
-                candidate_start_lsn = r.start;
-            }
-
-            // Is it small enough to be considered part of this level?
-            if r.end.0 - r.start.0 > lsn_max_size {
-                // Too large, this layer belongs to next level. Stop.
-                trace!(
-                    "too large {}, size {} vs {}",
-                    l.short_id(),
-                    r.end.0 - r.start.0,
-                    lsn_max_size
-                );
-                break; // layers still in candidate_layers are not part of the result
-            }
-
-            // If this crosses the candidate lsn, push it down.
-            if r.start < candidate_start_lsn {
-                trace!(
-                    "layer {} prevents from stopping at {}",
-                    l.short_id(),
-                    candidate_start_lsn
-                );
-                candidate_start_lsn = r.start;
-            }
-        }
-
-        // Include this layer in our candidate
-        candidate_layers.push(l);
-    }
-
-    Ok(if current_best_start_lsn == end_lsn {
-        // empty level
-        None
-    } else {
-        Some(Level {
-            lsn_range: current_best_start_lsn..end_lsn,
-            layers: current_best_layers,
-        })
-    })
-}
-
-// helper struct used in depth(): one endpoint of a layer's key range
-struct Event<K> {
-    key: K,
-    layer_idx: usize, // index into Level::layers
-    start: bool, // true for key_range().start, false for key_range().end
-}
-
-impl<L> Level<L> {
-    /// Count the number of deltas stacked on each other.
-    pub fn depth<K>(&self) -> u64
-    where
-        K: CompactionKey,
-        L: CompactionLayer<K>,
-    {
-        let mut events: Vec<Event<K>> = Vec::new();
-        for (idx, l) in self.layers.iter().enumerate() {
-            events.push(Event {
-                key: l.key_range().start,
-                layer_idx: idx,
-                start: true,
-            });
-            events.push(Event {
-                key: l.key_range().end,
-                layer_idx: idx,
-                start: false,
-            });
-        }
-        events.sort_by_key(|e| (e.key, e.start)); // at equal keys, end events (false) sort before start events (true)
-
-        // Sweep the key space left to right. Stop at each distinct key, and
-        // count the number of deltas on top of the highest image at that key.
-        //
-        // This is a little inefficient, as we walk through the active_set on
-        // every key. We could increment/decrement a counter on each step
-        // instead, but that'd require a bit more complex bookkeeping.
-        let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
-        let mut max_depth = 0;
-        let mut events_iter = events.iter().peekable();
-        while let Some(e) = events_iter.next() {
-            let l = &self.layers[e.layer_idx];
-            let is_image = !l.is_delta();
-
-            // update the active set
-            if e.start {
-                active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
-            } else {
-                active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
-            }
-
-            // recalculate depth if this was the last event at this point
-            let more_events_at_this_key = events_iter
-                .peek()
-                .map_or(false, |next_e| next_e.key == e.key);
-            if !more_events_at_this_key {
-                let mut active_depth = 0;
-                for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
-                    if *is_image {
-                        break; // walking from the highest end LSN down: stop at the first image
-                    }
-                    active_depth += 1;
-                }
-                if active_depth > max_depth {
-                    max_depth = active_depth;
-                }
-            }
-        }
-        max_depth
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
-    use std::sync::{Arc, Mutex};
-
-    fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
-        MockLayer::Delta(Arc::new(MockDeltaLayer {
-            key_range,
-            lsn_range,
-            // identify_level() doesn't pay attention to the rest of the fields
-            file_size: 0,
-            deleted: Mutex::new(false),
-            records: vec![],
-        }))
-    }
-
-    fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
-        MockLayer::Image(Arc::new(MockImageLayer {
-            key_range,
-            lsn_range: lsn..(lsn + 1), // the image is given a one-LSN-wide range starting at `lsn`
-            // identify_level() doesn't pay attention to the rest of the fields
-            file_size: 0,
-            deleted: Mutex::new(false),
-        }))
-    }
-
-    #[tokio::test]
-    async fn test_identify_level() -> anyhow::Result<()> {
-        let layers = vec![
-            delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
-            delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
-            delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
-            delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
-            delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
-            delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
-        ];
-
-        // All layers fit in the max file size
-        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
-            .await?
-            .unwrap();
-        assert_eq!(level.depth(), 6);
-
-        // Same LSN with smaller max file size. The second layer from the top is larger
-        // and belongs to the next level.
-        let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
-            .await?
-            .unwrap();
-        assert_eq!(level.depth(), 1);
-
-        // Call with a smaller LSN
-        let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
-            .await?
-            .unwrap();
-        assert_eq!(level.depth(), 2);
-
-        // Call with an LSN that doesn't partition the space
-        let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
-        assert!(result.is_err());
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
-        // The files' LSN ranges overlap, so even though there are more files that
-        // fit under the file size, they are not included in the level because they
-        // overlap so that we'd need to include the oldest file, too, which is
-        // larger
-        let layers = vec![
-            delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
-            delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
-            delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
-            delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
-            delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
-        ];
-
-        let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
-            .await?
-            .unwrap();
-        assert_eq!(level.depth(), 1);
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
-        // The key ranges don't overlap, so depth is only 1.
-        let layers = vec![
-            delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
-            delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
-            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
-        ];
-
-        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
-            .await?
-            .unwrap();
-        assert_eq!(level.layers.len(), 3);
-        assert_eq!(level.depth(), 1);
-
-        // Staggered. The 1st and 3rd layer don't overlap with each other.
-        let layers = vec![
-            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
-            delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
-            delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
-        ];
-
-        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
-            .await?
-            .unwrap();
-        assert_eq!(level.layers.len(), 3);
-        assert_eq!(level.depth(), 2);
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_depth_images() -> anyhow::Result<()> {
-        let layers: Vec<MockLayer> = vec![
-            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
-            delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
-            delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
-            // This covers the same key range as the 2nd delta layer. The depth
-            // in that key range is therefore 0.
-            image(1500..2500, Lsn(0x9000)),
-        ];
-
-        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
-            .await?
-            .unwrap();
-        assert_eq!(level.layers.len(), 4);
-        assert_eq!(level.depth(), 1);
-        Ok(())
-    }
-}
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -1,152 +0,0 @@
-//! This is what the compaction implementation needs to know about
-//! layers, keyspace etc.
-//!
-//! All the heavy lifting is done by the create_image and create_delta
-//! functions that the implementor provides.
-//!
-use async_trait::async_trait;
-use std::ops::Range;
-use utils::lsn::Lsn;
-
-/// Public interface. This is the main thing that the implementor needs to provide
-#[async_trait]
-pub trait CompactionJobExecutor {
-    // Type system.
-    //
-    // We assume that there are two kinds of layers, deltas and images. The
-    // compaction doesn't distinguish whether they are stored locally or
-    // remotely.
-    //
-    // The keyspace is defined by CompactionKey trait.
-    //
-    type Key: CompactionKey;
-
-    type Layer: CompactionLayer<Self::Key> + Clone;
-    type DeltaLayer: CompactionDeltaLayer<Self> + Clone;
-    type ImageLayer: CompactionImageLayer<Self> + Clone;
-
-    // This is passed through to all the interface functions. The compaction
-    // implementation doesn't do anything with it, but it might be useful for
-    // the interface implementation.
-    type RequestContext: CompactionRequestContext;
-
-    // ----
-    // Functions that the planner uses to support its decisions
-    // ----
-
-    /// Return all layers that overlap the given bounding box.
-    async fn get_layers(
-        &mut self,
-        key_range: &Range<Self::Key>,
-        lsn_range: &Range<Lsn>,
-        ctx: &Self::RequestContext,
-    ) -> anyhow::Result<Vec<Self::Layer>>;
-
-    async fn get_keyspace(
-        &mut self,
-        key_range: &Range<Self::Key>,
-        lsn: Lsn,
-        ctx: &Self::RequestContext,
-    ) -> anyhow::Result<CompactionKeySpace<Self::Key>>;
-
-    /// NB: This is a pretty expensive operation. In the real pageserver
-    /// implementation, it downloads the layer, and keeps it resident
-    /// until the DeltaLayer is dropped.
-    async fn downcast_delta_layer(
-        &self,
-        layer: &Self::Layer,
-    ) -> anyhow::Result<Option<Self::DeltaLayer>>;
-
-    // ----
-    // Functions to execute the plan
-    // ----
-
-    /// Create a new image layer, materializing all the values in the key range,
-    /// at given 'lsn'.
-    async fn create_image(
-        &mut self,
-        lsn: Lsn,
-        key_range: &Range<Self::Key>,
-        ctx: &Self::RequestContext,
-    ) -> anyhow::Result<()>;
-
-    /// Create a new delta layer, containing all the values from 'input_layers'
-    /// in the given key and LSN range.
-    async fn create_delta(
-        &mut self,
-        lsn_range: &Range<Lsn>,
-        key_range: &Range<Self::Key>,
-        input_layers: &[Self::DeltaLayer],
-        ctx: &Self::RequestContext,
-    ) -> anyhow::Result<()>;
-
-    /// Delete a layer. The compaction implementation will call this only after
-    /// all the create_image() or create_delta() calls that deletion of this
-    /// layer depends on have finished. But if the implementor has extra lazy
-    /// background tasks, like uploading the index json file to remote storage,
-    /// it is the implemenation's responsibility to track those.
-    async fn delete_layer(
-        &mut self,
-        layer: &Self::Layer,
-        ctx: &Self::RequestContext,
-    ) -> anyhow::Result<()>;
-}
-
-pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
-    const MIN: Self;
-    const MAX: Self;
-
-    /// Calculate distance between key_range.start and key_range.end.
-    ///
-    /// This returns u32, for compatibility with Repository::key. If the
-    /// distance is larger, return u32::MAX.
-    fn key_range_size(key_range: &Range<Self>) -> u32;
-
-    // return "self + 1"
-    fn next(&self) -> Self;
-
-    // return "self + <some decent amount to skip>". The amount to skip
-    // is left to the implementation.
-    // FIXME: why not just "add(u32)" ?  This is hard to use
-    fn skip_some(&self) -> Self;
-}
-
-/// Contiguous ranges of keys that belong to the key space. In key order, and
-/// with no overlap.
-pub type CompactionKeySpace<K> = Vec<Range<K>>;
-
-/// Functions needed from all layers.
-pub trait CompactionLayer<K: CompactionKey + ?Sized> {
-    fn key_range(&self) -> &Range<K>;
-    fn lsn_range(&self) -> &Range<Lsn>;
-
-    fn file_size(&self) -> u64;
-
-    /// For debugging, short human-readable representation of the layer. E.g. filename.
-    fn short_id(&self) -> String;
-
-    fn is_delta(&self) -> bool;
-}
-
-#[async_trait]
-pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
-    type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
-    where
-        Self: 'a;
-
-    /// Return all keys in this delta layer.
-    async fn load_keys<'a>(
-        &self,
-        ctx: &E::RequestContext,
-    ) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
-}
-
-pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
-
-pub trait CompactionDeltaEntry<'a, K> {
-    fn key(&self) -> K;
-    fn lsn(&self) -> Lsn;
-    fn size(&self) -> u64;
-}
-
-pub trait CompactionRequestContext {}
--- a/pageserver/compaction/src/lib.rs
+++ b/pageserver/compaction/src/lib.rs
@@ -1,12 +0,0 @@
-// The main module implementing the compaction algorithm
-pub mod compact_tiered;
-pub(crate) mod identify_levels;
-
-// Traits that the caller of the compaction needs to implement
-pub mod interface;
-
-// Utility functions, useful for the implementation
-pub mod helpers;
-
-// A simulator with mock implementations of 'interface'
-pub mod simulator;
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -1,613 +0,0 @@
-mod draw;
-
-use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
-
-use async_trait::async_trait;
-use futures::StreamExt;
-use rand::Rng;
-use tracing::info;
-
-use utils::lsn::Lsn;
-
-use std::fmt::Write;
-use std::ops::Range;
-use std::sync::Arc;
-use std::sync::Mutex;
-
-use crate::helpers::{merge_delta_keys, overlaps_with};
-
-use crate::interface;
-use crate::interface::CompactionLayer;
-
-//
-// Implementation for the CompactionExecutor interface
-//
-pub struct MockTimeline {
-    // Parameters for the compaction algorithm
-    pub target_file_size: u64,
-    tiers_per_level: u64,
-
-    num_l0_flushes: u64,
-    last_compact_at_flush: u64,
-    last_flush_lsn: Lsn,
-
-    // In-memory layer
-    records: Vec<MockRecord>,
-    total_len: u64,
-    start_lsn: Lsn,
-    end_lsn: Lsn,
-
-    // Current keyspace at `end_lsn`. This is updated on every ingested record.
-    keyspace: KeySpace,
-
-    // historic keyspaces
-    old_keyspaces: Vec<(Lsn, KeySpace)>,
-
-    // "on-disk" layers
-    pub live_layers: Vec<MockLayer>,
-
-    num_deleted_layers: u64,
-
-    // Statistics
-    wal_ingested: u64,
-    bytes_written: u64,
-    bytes_deleted: u64,
-    layers_created: u64,
-    layers_deleted: u64,
-
-    // All the events - creation and deletion of files - are collected
-    // in 'history'. It is used to draw the SVG animation at the end.
-    time: u64,
-    history: Vec<draw::LayerTraceEvent>,
-}
-
-type KeySpace = interface::CompactionKeySpace<Key>;
-
-pub struct MockRequestContext {}
-impl interface::CompactionRequestContext for MockRequestContext {}
-
-pub type Key = u64;
-
-impl interface::CompactionKey for Key {
-    const MIN: Self = u64::MIN;
-    const MAX: Self = u64::MAX;
-
-    fn key_range_size(key_range: &Range<Self>) -> u32 {
-        std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
-    }
-
-    fn next(&self) -> Self {
-        self + 1
-    }
-    fn skip_some(&self) -> Self {
-        // round up to next xx
-        self + 100
-    }
-}
-
-#[derive(Clone)]
-pub struct MockRecord {
-    lsn: Lsn,
-    key: Key,
-    len: u64,
-}
-
-impl interface::CompactionDeltaEntry<'_, Key> for MockRecord {
-    fn key(&self) -> Key {
-        self.key
-    }
-    fn lsn(&self) -> Lsn {
-        self.lsn
-    }
-    fn size(&self) -> u64 {
-        self.len
-    }
-}
-
-pub struct MockDeltaLayer {
-    pub key_range: Range<Key>,
-    pub lsn_range: Range<Lsn>,
-
-    pub file_size: u64,
-
-    pub deleted: Mutex<bool>,
-
-    pub records: Vec<MockRecord>,
-}
-
-impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
-    fn key_range(&self) -> &Range<Key> {
-        &self.key_range
-    }
-    fn lsn_range(&self) -> &Range<Lsn> {
-        &self.lsn_range
-    }
-
-    fn file_size(&self) -> u64 {
-        self.file_size
-    }
-
-    fn short_id(&self) -> String {
-        format!(
-            "{:016X}-{:016X}__{:08X}-{:08X}",
-            self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0
-        )
-    }
-
-    fn is_delta(&self) -> bool {
-        true
-    }
-}
-
-#[async_trait]
-impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
-    type DeltaEntry<'a> = MockRecord;
-
-    async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result<Vec<MockRecord>> {
-        Ok(self.records.clone())
-    }
-}
-
-pub struct MockImageLayer {
-    pub key_range: Range<Key>,
-    pub lsn_range: Range<Lsn>,
-
-    pub file_size: u64,
-
-    pub deleted: Mutex<bool>,
-}
-
-impl interface::CompactionImageLayer<MockTimeline> for Arc<MockImageLayer> {}
-
-impl interface::CompactionLayer<Key> for Arc<MockImageLayer> {
-    fn key_range(&self) -> &Range<Key> {
-        &self.key_range
-    }
-    fn lsn_range(&self) -> &Range<Lsn> {
-        &self.lsn_range
-    }
-
-    fn file_size(&self) -> u64 {
-        self.file_size
-    }
-
-    fn short_id(&self) -> String {
-        format!(
-            "{:016X}-{:016X}__{:08X}",
-            self.key_range.start, self.key_range.end, self.lsn_range.start.0,
-        )
-    }
-
-    fn is_delta(&self) -> bool {
-        false
-    }
-}
-
-impl MockTimeline {
-    pub fn new() -> Self {
-        MockTimeline {
-            target_file_size: 256 * 1024 * 1024,
-            tiers_per_level: 4,
-
-            num_l0_flushes: 0,
-            last_compact_at_flush: 0,
-            last_flush_lsn: Lsn(0),
-
-            records: Vec::new(),
-            total_len: 0,
-            start_lsn: Lsn(1000),
-            end_lsn: Lsn(1000),
-            keyspace: KeySpace::new(),
-
-            old_keyspaces: vec![],
-
-            live_layers: vec![],
-
-            num_deleted_layers: 0,
-
-            wal_ingested: 0,
-            bytes_written: 0,
-            bytes_deleted: 0,
-            layers_created: 0,
-            layers_deleted: 0,
-
-            time: 0,
-            history: Vec::new(),
-        }
-    }
-
-    pub async fn compact(&mut self) -> anyhow::Result<()> {
-        let ctx = MockRequestContext {};
-
-        crate::compact_tiered::compact_tiered(
-            self,
-            self.last_flush_lsn,
-            self.target_file_size,
-            self.tiers_per_level,
-            &ctx,
-        )
-        .await?;
-
-        Ok(())
-    }
-
-    // Ingest one record to the timeline
-    pub fn ingest_record(&mut self, key: Key, len: u64) {
-        self.records.push(MockRecord {
-            lsn: self.end_lsn,
-            key,
-            len,
-        });
-        self.total_len += len;
-        self.end_lsn += len;
-
-        if self.total_len > self.target_file_size {
-            self.flush_l0();
-        }
-    }
-
-    pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> {
-        if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level {
-            self.compact().await?;
-            self.last_compact_at_flush = self.num_l0_flushes;
-        }
-        Ok(())
-    }
-
-    pub fn flush_l0(&mut self) {
-        if self.records.is_empty() {
-            return;
-        }
-
-        let mut records = std::mem::take(&mut self.records);
-        records.sort_by_key(|rec| rec.key);
-
-        let lsn_range = self.start_lsn..self.end_lsn;
-        let new_layer = Arc::new(MockDeltaLayer {
-            key_range: Key::MIN..Key::MAX,
-            lsn_range: lsn_range.clone(),
-            file_size: self.total_len,
-            records,
-            deleted: Mutex::new(false),
-        });
-        info!("flushed L0 layer {}", new_layer.short_id());
-        self.live_layers.push(MockLayer::from(&new_layer));
-
-        // reset L0
-        self.start_lsn = self.end_lsn;
-        self.total_len = 0;
-        self.records = Vec::new();
-
-        self.layers_created += 1;
-        self.bytes_written += new_layer.file_size;
-
-        self.time += 1;
-        self.history.push(LayerTraceEvent {
-            time_rel: self.time,
-            op: LayerTraceOp::Flush,
-            file: LayerTraceFile {
-                filename: new_layer.short_id(),
-                key_range: new_layer.key_range.clone(),
-                lsn_range: new_layer.lsn_range.clone(),
-            },
-        });
-
-        self.num_l0_flushes += 1;
-        self.last_flush_lsn = self.end_lsn;
-    }
-
-    // Ingest `num_records' records to the timeline, with random keys
-    // uniformly distributed in `key_range`
-    pub fn ingest_uniform(
-        &mut self,
-        num_records: u64,
-        len: u64,
-        key_range: &Range<Key>,
-    ) -> anyhow::Result<()> {
-        crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]);
-        let mut rng = rand::thread_rng();
-        for _ in 0..num_records {
-            self.ingest_record(rng.gen_range(key_range.clone()), len);
-            self.wal_ingested += len;
-        }
-        Ok(())
-    }
-
-    pub fn print_stats(&self) -> anyhow::Result<String> {
-        let mut s = String::new();
-
-        writeln!(s, "STATISTICS:")?;
-        writeln!(
-            s,
-            "WAL ingested:   {:>10} MB",
-            self.wal_ingested / (1024 * 1024)
-        )?;
-        writeln!(
-            s,
-            "size created:   {:>10} MB",
-            self.bytes_written / (1024 * 1024)
-        )?;
-        writeln!(
-            s,
-            "size deleted:   {:>10} MB",
-            self.bytes_deleted / (1024 * 1024)
-        )?;
-        writeln!(s, "files created:     {:>10}", self.layers_created)?;
-        writeln!(s, "files deleted:     {:>10}", self.layers_deleted)?;
-        writeln!(
-            s,
-            "write amp:         {:>10.2}",
-            self.bytes_written as f64 / self.wal_ingested as f64
-        )?;
-        writeln!(
-            s,
-            "storage amp:       {:>10.2}",
-            (self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64
-        )?;
-
-        Ok(s)
-    }
-
-    pub fn draw_history<W: std::io::Write>(&self, output: W) -> anyhow::Result<()> {
-        draw::draw_history(&self.history, output)
-    }
-}
-
-impl Default for MockTimeline {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[derive(Clone)]
-pub enum MockLayer {
-    Delta(Arc<MockDeltaLayer>),
-    Image(Arc<MockImageLayer>),
-}
-
-impl interface::CompactionLayer<Key> for MockLayer {
-    fn key_range(&self) -> &Range<Key> {
-        match self {
-            MockLayer::Delta(this) => this.key_range(),
-            MockLayer::Image(this) => this.key_range(),
-        }
-    }
-    fn lsn_range(&self) -> &Range<Lsn> {
-        match self {
-            MockLayer::Delta(this) => this.lsn_range(),
-            MockLayer::Image(this) => this.lsn_range(),
-        }
-    }
-    fn file_size(&self) -> u64 {
-        match self {
-            MockLayer::Delta(this) => this.file_size(),
-            MockLayer::Image(this) => this.file_size(),
-        }
-    }
-    fn short_id(&self) -> String {
-        match self {
-            MockLayer::Delta(this) => this.short_id(),
-            MockLayer::Image(this) => this.short_id(),
-        }
-    }
-
-    fn is_delta(&self) -> bool {
-        match self {
-            MockLayer::Delta(_) => true,
-            MockLayer::Image(_) => false,
-        }
-    }
-}
-
-impl MockLayer {
-    fn is_deleted(&self) -> bool {
-        let guard = match self {
-            MockLayer::Delta(this) => this.deleted.lock().unwrap(),
-            MockLayer::Image(this) => this.deleted.lock().unwrap(),
-        };
-        *guard
-    }
-    fn mark_deleted(&self) {
-        let mut deleted_guard = match self {
-            MockLayer::Delta(this) => this.deleted.lock().unwrap(),
-            MockLayer::Image(this) => this.deleted.lock().unwrap(),
-        };
-        assert!(!*deleted_guard, "layer already deleted");
-        *deleted_guard = true;
-    }
-}
-
-impl From<&Arc<MockDeltaLayer>> for MockLayer {
-    fn from(l: &Arc<MockDeltaLayer>) -> Self {
-        MockLayer::Delta(l.clone())
-    }
-}
-
-impl From<&Arc<MockImageLayer>> for MockLayer {
-    fn from(l: &Arc<MockImageLayer>) -> Self {
-        MockLayer::Image(l.clone())
-    }
-}
-
-#[async_trait]
-impl interface::CompactionJobExecutor for MockTimeline {
-    type Key = Key;
-    type Layer = MockLayer;
-    type DeltaLayer = Arc<MockDeltaLayer>;
-    type ImageLayer = Arc<MockImageLayer>;
-    type RequestContext = MockRequestContext;
-
-    async fn get_layers(
-        &mut self,
-        key_range: &Range<Self::Key>,
-        lsn_range: &Range<Lsn>,
-        _ctx: &Self::RequestContext,
-    ) -> anyhow::Result<Vec<Self::Layer>> {
-        // Clear any deleted layers from our vec
-        self.live_layers.retain(|l| !l.is_deleted());
-
-        let layers: Vec<MockLayer> = self
-            .live_layers
-            .iter()
-            .filter(|l| {
-                overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range)
-            })
-            .cloned()
-            .collect();
-
-        Ok(layers)
-    }
-
-    async fn get_keyspace(
-        &mut self,
-        key_range: &Range<Self::Key>,
-        _lsn: Lsn,
-        _ctx: &Self::RequestContext,
-    ) -> anyhow::Result<interface::CompactionKeySpace<Key>> {
-        // find it in the levels
-        if self.old_keyspaces.is_empty() {
-            Ok(crate::helpers::intersect_keyspace(
-                &self.keyspace,
-                key_range,
-            ))
-        } else {
-            // not implemented
-
-            // The mock implementation only allows requesting the
-            // keyspace at the level's end LSN. That's all that the
-            // current implementation needs.
-            panic!("keyspace not available for requested lsn");
-        }
-    }
-
-    async fn downcast_delta_layer(
-        &self,
-        layer: &MockLayer,
-    ) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
-        Ok(match layer {
-            MockLayer::Delta(l) => Some(l.clone()),
-            MockLayer::Image(_) => None,
-        })
-    }
-
-    async fn create_image(
-        &mut self,
-        lsn: Lsn,
-        key_range: &Range<Key>,
-        ctx: &MockRequestContext,
-    ) -> anyhow::Result<()> {
-        let keyspace = self.get_keyspace(key_range, lsn, ctx).await?;
-
-        let mut accum_size: u64 = 0;
-        for r in keyspace {
-            accum_size += r.end - r.start;
-        }
-
-        let new_layer = Arc::new(MockImageLayer {
-            key_range: key_range.clone(),
-            lsn_range: lsn..lsn,
-            file_size: accum_size * 8192,
-            deleted: Mutex::new(false),
-        });
-        info!(
-            "created image layer, size {}: {}",
-            new_layer.file_size,
-            new_layer.short_id()
-        );
-        self.live_layers.push(MockLayer::Image(new_layer.clone()));
-
-        // update stats
-        self.bytes_written += new_layer.file_size;
-        self.layers_created += 1;
-
-        self.time += 1;
-        self.history.push(LayerTraceEvent {
-            time_rel: self.time,
-            op: LayerTraceOp::CreateImage,
-            file: LayerTraceFile {
-                filename: new_layer.short_id(),
-                key_range: new_layer.key_range.clone(),
-                lsn_range: new_layer.lsn_range.clone(),
-            },
-        });
-
-        Ok(())
-    }
-
-    async fn create_delta(
-        &mut self,
-        lsn_range: &Range<Lsn>,
-        key_range: &Range<Key>,
-        input_layers: &[Arc<MockDeltaLayer>],
-        ctx: &MockRequestContext,
-    ) -> anyhow::Result<()> {
-        let mut key_value_stream =
-            std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
-        let mut records: Vec<MockRecord> = Vec::new();
-        let mut total_len = 2;
-        while let Some(delta_entry) = key_value_stream.next().await {
-            let delta_entry: MockRecord = delta_entry?;
-            if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
-                total_len += delta_entry.len;
-                records.push(delta_entry);
-            }
-        }
-        let total_records = records.len();
-        let new_layer = Arc::new(MockDeltaLayer {
-            key_range: key_range.clone(),
-            lsn_range: lsn_range.clone(),
-            file_size: total_len,
-            records,
-            deleted: Mutex::new(false),
-        });
-        info!(
-            "created delta layer, recs {}, size {}: {}",
-            total_records,
-            total_len,
-            new_layer.short_id()
-        );
-        self.live_layers.push(MockLayer::Delta(new_layer.clone()));
-
-        // update stats
-        self.bytes_written += total_len;
-        self.layers_created += 1;
-
-        self.time += 1;
-        self.history.push(LayerTraceEvent {
-            time_rel: self.time,
-            op: LayerTraceOp::CreateDelta,
-            file: LayerTraceFile {
-                filename: new_layer.short_id(),
-                key_range: new_layer.key_range.clone(),
-                lsn_range: new_layer.lsn_range.clone(),
-            },
-        });
-
-        Ok(())
-    }
-
-    async fn delete_layer(
-        &mut self,
-        layer: &Self::Layer,
-        _ctx: &MockRequestContext,
-    ) -> anyhow::Result<()> {
-        let layer = std::pin::pin!(layer);
-        info!("deleting layer: {}", layer.short_id());
-        self.num_deleted_layers += 1;
-        self.bytes_deleted += layer.file_size();
-        layer.mark_deleted();
-
-        self.time += 1;
-        self.history.push(LayerTraceEvent {
-            time_rel: self.time,
-            op: LayerTraceOp::Delete,
-            file: LayerTraceFile {
-                filename: layer.short_id(),
-                key_range: layer.key_range().clone(),
-                lsn_range: layer.lsn_range().clone(),
-            },
-        });
-
-        Ok(())
-    }
-}
--- a/pageserver/compaction/src/simulator/draw.rs
+++ b/pageserver/compaction/src/simulator/draw.rs
@@ -1,411 +0,0 @@
-use super::Key;
-use anyhow::Result;
-use std::cmp::Ordering;
-use std::{
-    collections::{BTreeMap, BTreeSet, HashSet},
-    fmt::Write,
-    ops::Range,
-};
-use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
-use utils::lsn::Lsn;
-
-// Map values to their compressed coordinate - the index the value
-// would have in a sorted and deduplicated list of all values.
-struct CoordinateMap<T: Ord + Copy> {
-    map: BTreeMap<T, usize>,
-    stretch: f32,
-}
-
-impl<T: Ord + Copy> CoordinateMap<T> {
-    fn new(coords: Vec<T>, stretch: f32) -> Self {
-        let set: BTreeSet<T> = coords.into_iter().collect();
-
-        let mut map: BTreeMap<T, usize> = BTreeMap::new();
-        for (i, e) in set.iter().enumerate() {
-            map.insert(*e, i);
-        }
-
-        Self { map, stretch }
-    }
-
-    // This assumes that the map contains an exact point for this.
-    // Use map_inexact for values inbetween
-    fn map(&self, val: T) -> f32 {
-        *self.map.get(&val).unwrap() as f32 * self.stretch
-    }
-
-    // the value is still assumed to be within the min/max bounds
-    // (this is currently unused)
-    fn _map_inexact(&self, val: T) -> f32 {
-        let prev = *self.map.range(..=val).next().unwrap().1;
-        let next = *self.map.range(val..).next().unwrap().1;
-
-        // interpolate
-        (prev as f32 + (next - prev) as f32) * self.stretch
-    }
-
-    fn max(&self) -> f32 {
-        self.map.len() as f32 * self.stretch
-    }
-}
-
-#[derive(PartialEq, Hash, Eq)]
-pub enum LayerTraceOp {
-    Flush,
-    CreateDelta,
-    CreateImage,
-    Delete,
-}
-
-impl std::fmt::Display for LayerTraceOp {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
-        let op_str = match self {
-            LayerTraceOp::Flush => "flush",
-            LayerTraceOp::CreateDelta => "create_delta",
-            LayerTraceOp::CreateImage => "create_image",
-            LayerTraceOp::Delete => "delete",
-        };
-        f.write_str(op_str)
-    }
-}
-
-#[derive(PartialEq, Hash, Eq, Clone)]
-pub struct LayerTraceFile {
-    pub filename: String,
-    pub key_range: Range<Key>,
-    pub lsn_range: Range<Lsn>,
-}
-
-impl LayerTraceFile {
-    fn is_image(&self) -> bool {
-        self.lsn_range.end == self.lsn_range.start
-    }
-}
-
-pub struct LayerTraceEvent {
-    pub time_rel: u64,
-    pub op: LayerTraceOp,
-    pub file: LayerTraceFile,
-}
-
-pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output: W) -> Result<()> {
-    let mut files: Vec<LayerTraceFile> = Vec::new();
-
-    for event in history {
-        files.push(event.file.clone());
-    }
-    let last_time_rel = history.last().unwrap().time_rel;
-
-    // Collect all coordinates
-    let mut keys: Vec<Key> = vec![];
-    let mut lsns: Vec<Lsn> = vec![];
-    for f in files.iter() {
-        keys.push(f.key_range.start);
-        keys.push(f.key_range.end);
-        lsns.push(f.lsn_range.start);
-        lsns.push(f.lsn_range.end);
-    }
-
-    // Analyze
-    let key_map = CoordinateMap::new(keys, 2.0);
-    // Stretch out vertically for better visibility
-    let lsn_map = CoordinateMap::new(lsns, 3.0);
-
-    let mut svg = String::new();
-
-    // Draw
-    writeln!(
-        svg,
-        "{}",
-        BeginSvg {
-            w: key_map.max(),
-            h: lsn_map.max(),
-        }
-    )?;
-    let lsn_max = lsn_map.max();
-
-    // Sort the files by LSN, but so that image layers go after all delta layers
-    // The SVG is painted in the order the elements appear, and we want to draw
-    // image layers on top of the delta layers if they overlap
-    //
-    // (This could also be implemented via z coordinates: image layers get one z
-    // coord, delta layers get another z coord.)
-    let mut files_sorted: Vec<LayerTraceFile> = files.into_iter().collect();
-    files_sorted.sort_by(|a, b| {
-        if a.is_image() && !b.is_image() {
-            Ordering::Greater
-        } else if !a.is_image() && b.is_image() {
-            Ordering::Less
-        } else {
-            a.lsn_range.end.cmp(&b.lsn_range.end)
-        }
-    });
-
-    writeln!(svg, "<!-- layers -->")?;
-    let mut files_seen = HashSet::new();
-    for f in files_sorted {
-        if files_seen.contains(&f) {
-            continue;
-        }
-        let key_start = key_map.map(f.key_range.start);
-        let key_end = key_map.map(f.key_range.end);
-        let key_diff = key_end - key_start;
-
-        if key_start >= key_end {
-            panic!("Invalid key range {}-{}", key_start, key_end);
-        }
-
-        let lsn_start = lsn_map.map(f.lsn_range.start);
-        let lsn_end = lsn_map.map(f.lsn_range.end);
-
-        // Fill in and thicken rectangle if it's an
-        // image layer so that we can see it.
-        let mut style = Style::default();
-        style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
-        style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);
-
-        let y_start = lsn_max - lsn_start;
-        let y_end = lsn_max - lsn_end;
-
-        let x_margin = 0.25;
-        let y_margin = 0.5;
-
-        match f.lsn_range.start.cmp(&f.lsn_range.end) {
-            Ordering::Less => {
-                write!(
-                    svg,
-                    r#"    <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
-                    f.filename,
-                    key_start + x_margin,
-                    y_end + y_margin,
-                    key_diff - x_margin * 2.0,
-                    y_start - y_end - y_margin * 2.0,
-                    1.0, // border_radius,
-                    style,
-                )?;
-                write!(svg, "<title>{}</title>", f.filename)?;
-                writeln!(svg, "</rect>")?;
-            }
-            Ordering::Equal => {
-                //lsn_diff = 0.3;
-                //lsn_offset = -lsn_diff / 2.0;
-                //margin = 0.05;
-                style.fill = Fill::Color(rgb(0x80, 0, 0x80));
-                style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
-                write!(
-                    svg,
-                    r#"    <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
-                    f.filename,
-                    key_start + x_margin,
-                    y_end,
-                    key_end - x_margin,
-                    y_end,
-                    style,
-                )?;
-                write!(
-                    svg,
-                    "<title>{}<br>{} - {}</title>",
-                    f.filename, lsn_end, y_end
-                )?;
-                writeln!(svg, "</line>")?;
-            }
-            Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
-        }
-        files_seen.insert(f);
-    }
-
-    let mut record_style = Style::default();
-    record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
-    record_style.stroke = Stroke::None;
-
-    writeln!(svg, "{}", EndSvg)?;
-
-    let mut layer_events_str = String::new();
-    let mut first = true;
-    for e in history {
-        if !first {
-            writeln!(layer_events_str, ",")?;
-        }
-        write!(
-            layer_events_str,
-            r#"  {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
-            e.time_rel, e.file.filename, e.op
-        )?;
-        first = false;
-    }
-    writeln!(layer_events_str)?;
-
-    writeln!(
-        output,
-        r#"<!DOCTYPE html>
-<html>
-<head>
-<style>
-/* Keep the slider pinned at top */
-.topbar {{
-  display: block;
-  overflow: hidden;
-  background-color: lightgrey;
-  position: fixed;
-  top: 0;
-  width: 100%;
-/*  width: 500px; */
-}}
-.slidercontainer {{
-  float: left;
-  width: 50%;
-  margin-right: 200px;
-}}
-.slider {{
-  float: left;
-  width: 100%;
-}}
-.legend {{
-  width: 200px;
-  float: right;
-}}
-
-/* Main content */
-.main {{
-  margin-top: 50px; /* Add a top margin to avoid content overlay */
-}}
-</style>
-</head>
-
-  <body onload="init()">
-    <script type="text/javascript">
-
-      var layer_events = [{layer_events_str}]
-
-      let ticker;
-
-      function init() {{
-          for (let i = 0; i < layer_events.length; i++) {{
-              var layer = document.getElementById("layer_" + layer_events[i].filename);
-              layer.style.visibility = "hidden";
-          }}
-          last_layer_event = -1;
-          moveSlider(last_slider_pos)
-      }}
-
-      function startAnimation() {{
-          ticker = setInterval(animateStep, 100);
-      }}
-      function stopAnimation() {{
-          clearInterval(ticker);
-      }}
-
-      function animateStep() {{
-          if (last_layer_event < layer_events.length - 1) {{
-              var slider = document.getElementById("time-slider");
-              let prevPos = slider.value
-              let nextEvent = last_layer_event + 1
-              while (nextEvent <= layer_events.length - 1) {{
-                  if (layer_events[nextEvent].time_rel > prevPos) {{
-                      break;
-                  }}
-                  nextEvent += 1;
-              }}
-              let nextPos = layer_events[nextEvent].time_rel
-              slider.value = nextPos
-              moveSlider(nextPos)
-          }}
-      }}
-
-      function redoLayerEvent(n, dir) {{
-          var layer = document.getElementById("layer_" + layer_events[n].filename);
-          switch (layer_events[n].op) {{
-              case "flush":
-                  layer.style.visibility = "visible";
-                  break;
-              case "create_delta":
-                  layer.style.visibility = "visible";
-                  break;
-              case "create_image":
-                  layer.style.visibility = "visible";
-                  break;
-              case "delete":
-                  layer.style.visibility = "hidden";
-                  break;
-          }}
-      }}
-      function undoLayerEvent(n) {{
-          var layer = document.getElementById("layer_" + layer_events[n].filename);
-          switch (layer_events[n].op) {{
-              case "flush":
-                  layer.style.visibility = "hidden";
-                  break;
-              case "create_delta":
-                  layer.style.visibility = "hidden";
-                  break;
-              case "create_image":
-                  layer.style.visibility = "hidden";
-                  break;
-              case "delete":
-                  layer.style.visibility = "visible";
-                  break;
-          }}
-      }}
-
-      var last_slider_pos = 0
-      var last_layer_event = 0
-
-      var moveSlider = function(new_pos) {{
-          if (new_pos > last_slider_pos) {{
-              while (last_layer_event < layer_events.length - 1) {{
-                  if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
-                      break;
-                  }}
-                  last_layer_event += 1;
-                  redoLayerEvent(last_layer_event)
-              }}
-          }}
-          if (new_pos < last_slider_pos) {{
-              while (last_layer_event >= 0) {{
-                  if (layer_events[last_layer_event].time_rel <= new_pos) {{
-                      break;
-                  }}
-                  undoLayerEvent(last_layer_event)
-                  last_layer_event -= 1;
-              }}
-          }}
-          last_slider_pos = new_pos;
-          document.getElementById("debug_pos").textContent=new_pos;
-          if (last_layer_event >= 0) {{
-              document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
-          }} else {{
-              document.getElementById("debug_layer_event").textContent="begin";
-          }}
-      }}
-    </script>
-
-    <div class="topbar">
-      <div class="slidercontainer">
-        <label for="time-slider">TIME</label>:
-        <input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>
-
-        pos: <span id="debug_pos"></span><br>
-        event: <span id="debug_layer_event"></span><br>
-        gc: <span id="debug_gc_event"></span><br>
-      </div>
-
-      <button onclick="startAnimation()">Play</button>
-      <button onclick="stopAnimation()">Stop</button>
-
-      <svg class="legend">
-        <rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
-        <line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
-        <line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
-      </svg>
-    </div>
-
-    <div class="main">
-{svg}
-    </div>
-  </body>
-</html>
-"#
-    )?;
-
-    Ok(())
-}
--- a/pageserver/compaction/tests/tests.rs
+++ b/pageserver/compaction/tests/tests.rs
@@ -1,37 +0,0 @@
-use pageserver_compaction::interface::CompactionLayer;
-use pageserver_compaction::simulator::MockTimeline;
-
-/// Test the extreme case that there are so many updates for a single key that
-/// even if we produce an extremely narrow delta layer, spanning just that one
-/// key, we still too many records to fit in the target file size. We need to
-/// split in the LSN dimension too in that case.
-///
-/// TODO: The code to avoid this problem has not been implemented yet! So the
-/// assertion currently fails, but we need to make it not fail.
-#[ignore]
-#[tokio::test]
-async fn test_many_updates_for_single_key() -> anyhow::Result<()> {
-    let mut executor = MockTimeline::new();
-    executor.target_file_size = 10_000_000; // 10 MB
-
-    // Ingest 100 MB of updates to a single key.
-    for _ in 1..1000 {
-        executor.ingest_uniform(100, 10, &(0..100_000))?;
-        executor.ingest_uniform(10_000, 10, &(0..1))?;
-        executor.compact().await?;
-    }
-
-    // Check that all the layers are smaller than the target size (with some slop)
-    for l in executor.live_layers.iter() {
-        println!("layer {}: {}", l.short_id(), l.file_size());
-    }
-    for l in executor.live_layers.iter() {
-        assert!(l.file_size() < executor.target_file_size * 2);
-        // sanity check that none of the delta layers are stupidly small either
-        if l.is_delta() {
-            assert!(l.file_size() > executor.target_file_size / 2);
-        }
-    }
-
-    Ok(())
-}
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -18,3 +18,5 @@ tokio.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
+serde.workspace = true
+serde_json.workspace = true
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -0,0 +1,38 @@
+use std::collections::HashMap;
+
+use anyhow::Context;
+use camino::Utf8PathBuf;
+use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
+use utils::lsn::Lsn;
+
+#[derive(clap::Subcommand)] // each variant below becomes a CLI subcommand
+pub(crate) enum IndexPartCmd {
+    Dump { path: Utf8PathBuf }, // `path`: local file whose bytes are parsed as an IndexPart
+}
+
+pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { // runs one IndexPartCmd; errors carry a short context label
+    match cmd {
+        IndexPartCmd::Dump { path } => {
+            let bytes = tokio::fs::read(path).await.context("read file")?; // whole file is read into memory
+            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; // parse the raw bytes
+            #[derive(serde::Serialize)]
+            struct Output<'a> { // borrowed view of the fields to print; exists only to shape the JSON
+                layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
+                disk_consistent_lsn: Lsn, // filled from the getter below, by value
+                timeline_metadata: &'a TimelineMetadata,
+            }
+
+            let output = Output {
+                layer_metadata: &des.layer_metadata,
+                disk_consistent_lsn: des.get_disk_consistent_lsn(),
+                timeline_metadata: &des.metadata,
+            };
+
+            let output = serde_json::to_string_pretty(&output).context("serialize output")?; // shadows the struct with its JSON text
+            println!("{output}"); // the JSON is written to stdout
+            Ok(())
+        }
+    }
+}
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -5,11 +5,13 @@
 //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.

 mod draw_timeline_dir;
+mod index_part;
 mod layer_map_analyzer;
 mod layers;

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
+use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use pageserver::{
    context::{DownloadBehavior, RequestContext},
@@ -38,6 +40,8 @@ struct CliOpts {
 #[derive(Subcommand)]
 enum Commands {
    Metadata(MetadataCmd),
+    #[command(subcommand)]
+    IndexPart(IndexPartCmd),
    PrintLayerFile(PrintLayerFileCmd),
    DrawTimeline {},
    AnalyzeLayerMap(AnalyzeLayerMapCmd),
@@ -83,6 +87,9 @@ async fn main() -> anyhow::Result<()> {
        Commands::Metadata(cmd) => {
            handle_metadata(&cmd)?;
        }
+        Commands::IndexPart(cmd) => {
+            index_part::main(&cmd).await?;
+        }
        Commands::DrawTimeline {} => {
            draw_timeline_dir::main()?;
        }
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -1,22 +1,21 @@
-use anyhow::{bail, Result};
-use utils::auth::{Claims, Scope};
+use utils::auth::{AuthError, Claims, Scope};
 use utils::id::TenantId;

-pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
+pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
    match (&claims.scope, tenant_id) {
-        (Scope::Tenant, None) => {
-            bail!("Attempt to access management api with tenant scope. Permission denied")
-        }
+        (Scope::Tenant, None) => Err(AuthError(
+            "Attempt to access management api with tenant scope. Permission denied".into(),
+        )),
        (Scope::Tenant, Some(tenant_id)) => {
            if claims.tenant_id.unwrap() != tenant_id {
-                bail!("Tenant id mismatch. Permission denied")
+                return Err(AuthError("Tenant id mismatch. Permission denied".into()));
            }
            Ok(())
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData, _) => {
-            bail!("SafekeeperData scope makes no sense for Pageserver")
-        }
+        (Scope::SafekeeperData, _) => Err(AuthError(
+            "SafekeeperData scope makes no sense for Pageserver".into(),
+        )),
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -34,8 +34,11 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
-    signals::Signal, tcp_listener,
+    auth::{JwtAuth, SwappableJwtAuth},
+    logging, project_build_tag, project_git_version,
+    sentry_init::init_sentry,
+    signals::Signal,
+    tcp_listener,
 };

 project_git_version!(GIT_VERSION);
@@ -100,7 +103,11 @@ fn main() -> anyhow::Result<()> {
    } else {
        TracingErrorLayerEnablement::Disabled
    };
-    logging::init(conf.log_format, tracing_error_layer_enablement)?;
+    logging::init(
+        conf.log_format,
+        tracing_error_layer_enablement,
+        logging::Output::Stdout,
+    )?;

    // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
    // disarming this hook on pageserver, because we never tear down tracing.
@@ -321,13 +328,12 @@ fn start_pageserver(
    let http_auth;
    let pg_auth;
    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
-        // unwrap is ok because check is performed when creating config, so path is set and file exists
+        // unwrap is ok because check is performed when creating config, so path is set and exists
        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
-        info!(
-            "Loading public key for verifying JWT tokens from {:#?}",
-            key_path
-        );
-        let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
+        info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
+
+        let jwt_auth = JwtAuth::from_key_path(key_path)?;
+        let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));

        http_auth = match &conf.http_auth_type {
            AuthType::Trust => None,
@@ -410,7 +416,7 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -420,6 +426,7 @@ fn start_pageserver(
        order,
        shutdown_pageserver.clone(),
    ))?;
+    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
        let init_done_rx = init_done_rx;
@@ -548,6 +555,7 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
+                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
@@ -617,6 +625,7 @@ fn start_pageserver(
                    conf.synthetic_size_calculation_interval,
                    conf.id,
                    local_disk_storage,
+                    cancel,
                    metrics_ctx,
                )
                .instrument(info_span!("metrics_collection"))
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -161,7 +161,7 @@ pub struct PageServerConf {
    pub http_auth_type: AuthType,
    /// authentication method for libpq connections from compute
    pub pg_auth_type: AuthType,
-    /// Path to a file containing public key for verifying JWT tokens.
+    /// Path to a file or directory containing public key(s) for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,

@@ -880,13 +880,6 @@ impl PageServerConf {
            );
        }

-        if let Some(compaction_algorithm) = item.get("compaction_algorithm") {
-            t_conf.compaction_algorithm = Some(
-                deserialize_from_item("compaction_algorithm", compaction_algorithm)
-                    .context("parse compaction_algorithm")?,
-            );
-        }
-
        if let Some(gc_horizon) = item.get("gc_horizon") {
            t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
        }
@@ -1321,12 +1314,6 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_syncs: NonZeroUsize::new(
-                        remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
-                    )
-                        .unwrap(),
-                    max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
-                        .unwrap(),
                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
@@ -1387,8 +1374,6 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_syncs,
-                    max_sync_errors,
                    storage: RemoteStorageKind::AwsS3(S3Config {
                        bucket_name: bucket_name.clone(),
                        bucket_region: bucket_region.clone(),
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,7 +3,7 @@
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr, LogicalSizeCalculationCause};
+use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
@@ -12,11 +12,12 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
 use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::NodeId;

 mod metrics;
-use crate::consumption_metrics::metrics::MetricsKey;
+use metrics::MetricsKey;
 mod disk_cache;
 mod upload;

@@ -37,6 +38,7 @@ type RawMetric = (MetricsKey, (EventType, u64));
 type Cache = HashMap<MetricsKey, (EventType, u64)>;

 /// Main thread that serves metrics collection
+#[allow(clippy::too_many_arguments)]
 pub async fn collect_metrics(
    metric_collection_endpoint: &Url,
    metric_collection_interval: Duration,
@@ -44,6 +46,7 @@ pub async fn collect_metrics(
    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
    local_disk_storage: Utf8PathBuf,
+    cancel: CancellationToken,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
    if _cached_metric_collection_interval != Duration::ZERO {
@@ -63,9 +66,13 @@ pub async fn collect_metrics(
        "synthetic size calculation",
        false,
        async move {
-            calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx)
-                .instrument(info_span!("synthetic_size_worker"))
-                .await?;
+            calculate_synthetic_size_worker(
+                synthetic_size_calculation_interval,
+                &cancel,
+                &worker_ctx,
+            )
+            .instrument(info_span!("synthetic_size_worker"))
+            .await?;
            Ok(())
        },
    );
@@ -241,6 +248,7 @@ async fn reschedule(
 /// Caclculate synthetic size for each active tenant
 async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
+    cancel: &CancellationToken,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
@@ -266,13 +274,18 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
                // We can put in some prioritization for consumption metrics.
                // Same for the loop that fetches computed metrics.
                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
                // which turns out is really handy to understand the system.
-                if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
+                if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
+                    if let Some(PageReconstructError::Cancelled) =
+                        e.downcast_ref::<PageReconstructError>()
+                    {
+                        return Ok(());
+                    }
                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                }
            }
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -3,7 +3,6 @@ use anyhow::Context;
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
-use serde_with::serde_as;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
    id::{TenantId, TimelineId},
@@ -42,13 +41,10 @@ pub(super) enum Name {
 ///
 /// This is a denormalization done at the MetricsKey const methods; these should not be constructed
 /// elsewhere.
-#[serde_with::serde_as]
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
 pub(crate) struct MetricsKey {
-    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub(super) tenant_id: TenantId,

-    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub(super) timeline_id: Option<TimelineId>,

@@ -206,7 +202,6 @@ pub(super) async fn collect_all_metrics(
            None
        } else {
            crate::tenant::mgr::get_tenant(id, true)
-                .await
                .ok()
                .map(|tenant| (id, tenant))
        }
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -1,5 +1,4 @@
 use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
-use serde_with::serde_as;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;

@@ -7,12 +6,9 @@ use super::{metrics::Name, Cache, MetricsKey, RawMetric};
 use utils::id::{TenantId, TimelineId};

 /// How the metrics from pageserver are identified.
-#[serde_with::serde_as]
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
 struct Ids {
-    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub(super) tenant_id: TenantId,
-    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub(super) timeline_id: Option<TimelineId>,
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -18,7 +18,6 @@ use hex::FromHex;
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
 use serde::Serialize;
-use serde_with::serde_as;
 use thiserror::Error;
 use tokio;
 use tokio_util::sync::CancellationToken;
@@ -215,7 +214,6 @@ where
 /// during recovery as startup.
 const TEMP_SUFFIX: &str = "tmp";

-#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 struct DeletionList {
    /// Serialization version, for future use
@@ -244,7 +242,6 @@ struct DeletionList {
    validated: bool,
 }

-#[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
 struct DeletionHeader {
    /// Serialization version, for future use
@@ -516,6 +513,7 @@ impl DeletionQueueClient {
    ) -> Result<(), DeletionQueueError> {
        if current_generation.is_none() {
            debug!("Enqueuing deletions in legacy mode, skipping queue");
+
            let mut layer_paths = Vec::new();
            for (layer, generation) in layers {
                layer_paths.push(remote_layer_path(
@@ -896,14 +894,6 @@ mod test {
        std::fs::create_dir_all(remote_fs_dir)?;
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
-            max_concurrent_syncs: std::num::NonZeroUsize::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-            )
-            .unwrap(),
-            max_sync_errors: std::num::NonZeroU32::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-            )
-            .unwrap(),
            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -55,21 +55,24 @@ impl Deleter {

    /// Wrap the remote `delete_objects` with a failpoint
    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
-        fail::fail_point!("deletion-queue-before-execute", |_| {
-            info!("Skipping execution, failpoint set");
-            metrics::DELETION_QUEUE
-                .remote_errors
-                .with_label_values(&["failpoint"])
-                .inc();
-            Err(anyhow::anyhow!("failpoint hit"))
-        });
-
        // A backoff::retry is used here for two reasons:
        // - To provide a backoff rather than busy-polling the API on errors
        // - To absorb transient 429/503 conditions without hitting our error
        //   logging path for issues deleting objects.
        backoff::retry(
-            || async { self.remote_storage.delete_objects(&self.accumulator).await },
+            || async {
+                fail::fail_point!("deletion-queue-before-execute", |_| {
+                    info!("Skipping execution, failpoint set");
+
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["failpoint"])
+                        .inc();
+                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
+                });
+
+                self.remote_storage.delete_objects(&self.accumulator).await
+            },
            |_| false,
            3,
            10,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -403,7 +403,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    return (evicted_bytes, evictions_failed);
                };

-                let results = timeline.evict_layers(&batch, &cancel).await;
+                let results = timeline.evict_layers(&batch).await;

                match results {
                    Ok(results) => {
@@ -545,7 +545,7 @@ async fn collect_eviction_candidates(
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
            Ok(tenant) => tenant,
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
@@ -554,6 +554,11 @@ async fn collect_eviction_candidates(
            }
        };

+        if tenant.cancel.is_cancelled() {
+            info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
+            continue;
+        }
+
        // collect layers from all timelines in this tenant
        //
        // If one of the timelines becomes `!is_active()` during the iteration,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -52,6 +52,31 @@ paths:
              schema:
                type: object

+  /v1/reload_auth_validation_keys:
+    post:
+      description: Reloads the JWT public keys from their pre-configured location on disk.
+      responses:
+        "200":
+          description: The reload completed successfully.
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error (also returned if no keys were found)
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
  /v1/tenant/{tenant_id}:
    parameters:
      - name: tenant_id
@@ -327,7 +352,8 @@ paths:
          in: query
          required: true
          schema:
-            type: integer
+            type: string
+            format: hex
          description: A LSN to get the timestamp
      responses:
        "200":
--- a/Show More
+++ b/Show More