only for console

proxy console force http2
improve proxy code cov (#6141 )
2026-03-15 06:10:36 +00:00 · 2023-12-15 12:28:50 +00:00 · 2023-12-15 12:26:51 +00:00 · 2023-12-15 12:11:50 +00:00 · 2023-12-15 12:11:25 +00:00 · 2023-12-15 11:59:24 +00:00
297 changed files with 22763 additions and 10786 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -1,17 +1,3 @@
-# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
-# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
-# optimizations enabled by "opt-level=1" don't affect debuggability too much.
-#
-# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
-#
-[profile.dev.package."*"]
-# Set the default for dependencies in Development mode.
-opt-level = 3
-
-[profile.dev]
-# Turn on a small amount of optimization in Development mode.
-opt-level = 1
-
 [build]
 # This is only present for local builds, as it will be overridden
 # by the RUSTDOCFLAGS env var in CI.
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -22,5 +22,11 @@ platforms = [
    # "x86_64-pc-windows-msvc",
 ]

+[final-excludes]
+# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
+# from depending on workspace-hack because most of the dependencies are not used.
+workspace-members = ["vm_monitor"]
+
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -17,8 +17,9 @@ assignees: ''
 ## Implementation ideas


-## Tasks
- [ ]
+```[tasklist]
+### Tasks
+```


 ## Other related tasks and Epics
--- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md
+++ b/.github/PULL_REQUEST_TEMPLATE/release-pr.md
@@ -3,7 +3,7 @@
 **NB: this PR must be merged only by 'Create a merge commit'!**

 ### Checklist when preparing for release
- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow)
+- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b)
 - [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers?
 - [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?

--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,5 +1,7 @@
 self-hosted-runner:
  labels:
+    - arm64
+    - dev
    - gen3
    - large
    - small
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -172,10 +172,10 @@ jobs:
      # https://github.com/EmbarkStudios/cargo-deny
      - name: Check rust licenses/bans/advisories/sources
        if: ${{ !cancelled() }}
-        run: cargo deny check
+        run: cargo deny check --hide-inclusion-graph

  build-neon:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -187,6 +187,7 @@ jobs:
    env:
      BUILD_TYPE: ${{ matrix.build_type }}
      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}

    steps:
      - name: Fix git ownership
@@ -198,6 +199,10 @@ jobs:
          #
          git config --global --add safe.directory ${{ github.workspace }}
          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done

      - name: Checkout
        uses: actions/checkout@v3
@@ -403,7 +408,7 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    needs: [ check-permissions, build-neon ]
+    needs: [ check-permissions, build-neon, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -435,6 +440,7 @@ jobs:
        env:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
+          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}

      - name: Merge and upload coverage data
        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
@@ -585,10 +591,13 @@ jobs:
        id: upload-coverage-report-new
        env:
          BUCKET: neon-github-public-dev
+          # A differential coverage report is available only for PRs.
+          # (i.e. for pushes into main/release branches we have a regular coverage report)
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
        run: |
-          BASELINE="$(git merge-base HEAD origin/main)"
          CURRENT="${COMMIT_SHA}"
+          BASELINE="$(git merge-base $BASE_SHA $CURRENT)"

          cp /tmp/coverage/report/lcov.info ./${CURRENT}.info

@@ -723,6 +732,7 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -847,7 +857,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.18.5
+      VM_BUILDER_VERSION: v0.19.0

    steps:
      - name: Checkout
@@ -869,8 +879,7 @@ jobs:
      - name: Build vm image
        run: |
          ./vm-builder \
-            -enable-file-cache \
-            -cgroup-uid=postgres \
+            -spec=vm-image-spec.yaml \
            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

@@ -1092,6 +1101,10 @@ jobs:
          #
          git config --global --add safe.directory ${{ github.workspace }}
          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done

      - name: Checkout
        uses: actions/checkout@v3
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -21,7 +21,10 @@ env:

 jobs:
  check-macos-build:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    timeout-minutes: 90
    runs-on: macos-latest

@@ -112,8 +115,200 @@ jobs:
      - name: Check that no warnings are produced
        run: ./run_clippy.sh

+  check-linux-arm-build:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    env:
+      # Use release build only, to have less debug info around
+      # Hence keeping target/ (and general cache size) smaller
+      BUILD_TYPE: release
+      CARGO_FEATURES: --features testing
+      CARGO_FLAGS: --locked --release
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      - name: Set env variables
+        run: |
+          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+
+      - name: Run cargo test
+        run: |
+          cargo test $CARGO_FLAGS $CARGO_FEATURES
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
+  check-codestyle-rust-arm:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      # Some of our rust modules use FFI and need those to be checked
+      - name: Get postgres headers
+        run: make postgres-headers -j$(nproc)
+
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+      - name: Run cargo clippy (debug)
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
+
+      - name: Check documentation generation
+        run: cargo doc --workspace --no-deps --document-private-items
+        env:
+            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
+
+      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
+      - name: Check formatting
+        if: ${{ !cancelled() }}
+        run: cargo fmt --all -- --check
+
+      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
+      - name: Check rust dependencies
+        if: ${{ !cancelled() }}
+        run: |
+          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
+          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
+
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() }}
+        run: cargo deny check
+
  gather-rust-build-stats:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,7 +2,7 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 7 * * 5'
+    - cron: '0 6 * * 1'
  workflow_dispatch:

 jobs:
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,6 @@ test_output/
 *.o
 *.so
 *.Po
+
+# pgindent typedef lists
+*.list
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -9,6 +9,24 @@ refactoring, additional comments, and so forth. Let's try to raise the
 bar, and clean things up as we go. Try to leave code in a better shape
 than it was before.

+## Pre-commit hook
+
+We have a sample pre-commit hook in `pre-commit.py`.
+To set it up, run:
+
+```bash
+ln -s ../../pre-commit.py .git/hooks/pre-commit
+```
+
+This will run following checks on staged files before each commit:
+- `rustfmt`
+- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
+
+There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
+and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
+
+If you want to skip the hook, run `git commit` with `--no-verify` option.
+
 ## Submitting changes

 1. Get at least one +1 on your PR before you push.
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,19 +36,20 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
-async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
-azure_core = "0.16"
-azure_identity = "0.16"
-azure_storage = "0.16"
-azure_storage_blobs = "0.16"
+arc-swap = "1.6"
+async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
+azure_core = "0.18"
+azure_identity = "0.18"
+azure_storage = "0.18"
+azure_storage_blobs = "0.18"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "0.56", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.29"
-aws-smithy-http = "0.56"
-aws-credential-types = "0.56"
-aws-types = "0.56"
+aws-config = { version = "1.0", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "1.0"
+aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
+aws-smithy-types = "1.0"
+aws-credential-types = "1.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -65,7 +66,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
-dashmap = "5.5.0"
+dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -81,12 +82,13 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
-http-types = "2"
+http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
 hyper-tungstenite = "0.11"
 inotify = "0.10.2"
+ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "8"
 libc = "0.2"
@@ -107,7 +109,7 @@ pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-regex = "1.4"
+regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
 reqwest-middleware = "0.2.0"
@@ -120,20 +122,24 @@ rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
+sd-notify = "0.4.1"
 sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
+serde_path_to_error = "0.1"
 serde_with = "2.0"
 serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
+smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
+task-local-extensions = "0.1.4"
 test-context = "0.1"
 thiserror = "1.0"
 tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
@@ -143,7 +149,7 @@ tokio-postgres-rustls = "0.10.0"
 tokio-rustls = "0.24"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
-tokio-util = { version = "0.7", features = ["io"] }
+tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
@@ -162,11 +168,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -203,7 +209,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ################# Binary contents sections

--- a/5
+++ b/5
@@ -27,6 +27,7 @@ RUN set -e \
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
+ARG BUILD_TAG

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -78,9 +79,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -393,7 +393,9 @@ RUN case "${PG_VERSION}" in \
        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
        ;; \
      *) \
-        echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
+        export TIMESCALEDB_VERSION=2.13.0 \
+        export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
+        ;; \
    esac && \
    apt-get update && \
    apt-get install -y cmake && \
@@ -714,6 +716,23 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control

+#########################################################################################
+#
+# Layer "wal2json-build"
+# Compile "wal2json" extension
+#
+#########################################################################################
+
+FROM build-deps AS wal2json-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
+    echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
+    mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -750,6 +769,7 @@ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
--- a/42
+++ b/42
@@ -72,6 +72,10 @@ neon: postgres-headers walproposer-lib
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	+@echo "Configuring Postgres $* build"
+	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
+		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
+		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
+		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
 	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
@@ -256,6 +260,44 @@ distclean:
 fmt:
 	./pre-commit.py --fix-inplace

+postgres-%-pg-bsd-indent: postgres-%
+	+@echo "Compiling pg_bsd_indent"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
+
+# Create typedef list for the core. Note that generally it should be combined with
+# buildfarm one to cover platform specific stuff.
+# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
+postgres-%-typedefs.list: postgres-%
+	$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
+
+# Indent postgres. See src/tools/pgindent/README for details.
+.PHONY: postgres-%-pgindent
+postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
+	+@echo merge with buildfarm typedef to cover all platforms
+	+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
+		REL_16_STABLE list misses PGSemaphoreData
+	# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
+	# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+	cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
+		cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+	+@echo note: you might want to run it on selected files/dirs instead.
+	INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
+		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
+		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
+		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
+	rm -f pg*.BAK
+
+# Indent pxgn/neon.
+.PHONY: pgindent
+neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
+		INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
+		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
+
+
 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
--- a/README.md
+++ b/README.md
@@ -149,6 +149,9 @@ tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
 Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
 Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one

+# create postgres compute node
+> cargo neon endpoint create main
+
 # start postgres compute node
 > cargo neon endpoint start main
 Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
@@ -185,8 +188,11 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
 (L) main [de200bd42b49cc1814412c7e592dd6e9]
 (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]

+# create postgres on that branch
+> cargo neon endpoint create migration_check --branch-name migration_check
+
 # start postgres on that branch
-> cargo neon endpoint start migration_check --branch-name migration_check
+> cargo neon endpoint start migration_check
 Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
 Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'

--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -38,3 +38,4 @@ toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.12.4"
+bytes = "1.0"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -31,7 +31,7 @@
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
 //!             -b /usr/local/bin/postgres \
-//!             -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
+//!             -r http://pg-ext-s3-gateway
 //! ```
 //!
 use std::collections::HashMap;
@@ -51,7 +51,7 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
-use compute_tools::extension_server::{get_pg_version, init_remote_storage};
+use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
@@ -60,7 +60,7 @@ use compute_tools::spec::*;

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
-const BUILD_TAG_DEFAULT: &str = "5670669815";
+const BUILD_TAG_DEFAULT: &str = "latest";

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -74,10 +74,18 @@ fn main() -> Result<()> {
    let pgbin_default = String::from("postgres");
    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);

-    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
-    let ext_remote_storage = remote_ext_config.map(|x| {
-        init_remote_storage(x).expect("cannot initialize remote extension storage from config")
-    });
+    let ext_remote_storage = matches
+        .get_one::<String>("remote-ext-config")
+        // Compatibility hack: if the control plane specified any remote-ext-config
+        // use the default value for extension storage proxy gateway.
+        // Remove this once the control plane is updated to pass the gateway URL
+        .map(|conf| {
+            if conf.starts_with("http") {
+                conf.trim_end_matches('/')
+            } else {
+                "http://pg-ext-s3-gateway"
+            }
+        });

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -198,7 +206,7 @@ fn main() -> Result<()> {
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
-        ext_remote_storage,
+        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
    };
@@ -266,7 +274,13 @@ fn main() -> Result<()> {
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
-            drop(state);
+            // Notify others that Postgres failed to start. In case of configuring the
+            // empty compute, it's likely that API handler is still waiting for compute
+            // state change. With this we will notify it that compute is in Failed state,
+            // so control plane will know about it earlier and record proper error instead
+            // of timeout.
+            compute.state_changed.notify_all();
+            drop(state); // unlock
            delay_exit = true;
            None
        }
@@ -479,13 +493,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            // DEPRECATED, NO LONGER DOES ANYTHING.
-            // See https://github.com/neondatabase/cloud/issues/7516
-            Arg::new("file-cache-on-disk")
-                .long("file-cache-on-disk")
-                .action(clap::ArgAction::SetTrue),
-        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -22,10 +22,10 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeMode, ComputeSpec};
+use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, RemotePath};

 use crate::checker::create_availability_check_data;
 use crate::pg_helpers::*;
@@ -59,8 +59,8 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
-    ///  the S3 bucket that we search for extensions in
-    pub ext_remote_storage: Option<GenericRemoteStorage>,
+    /// the address of extension storage proxy gateway
+    pub ext_remote_storage: Option<String>,
    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
    pub build_tag: String,
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
                    IF NOT EXISTS (
                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
                    THEN
-                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
+                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
                        IF array_length(roles, 1) IS NOT NULL THEN
                            EXECUTE format('GRANT neon_superuser TO %s',
                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
@@ -277,6 +277,17 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 }

 impl ComputeNode {
+    /// Check that compute node has corresponding feature enabled.
+    pub fn has_feature(&self, feature: ComputeFeature) -> bool {
+        let state = self.state.lock().unwrap();
+
+        if let Some(s) = state.pspec.as_ref() {
+            s.spec.features.contains(&feature)
+        } else {
+            false
+        }
+    }
+
    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
        state.status = status;
@@ -698,6 +709,7 @@ impl ComputeNode {
        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
        handle_grants(spec, &mut client, self.connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
+        handle_extension_neon(&mut client)?;
        create_availability_check_data(&mut client)?;

        // 'Close' connection
@@ -710,8 +722,12 @@ impl ComputeNode {
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
    #[instrument(skip_all)]
-    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
-        client.simple_query("SELECT pg_reload_conf()")?;
+    fn pg_reload_conf(&self) -> Result<()> {
+        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
+        Command::new(pgctl_bin)
+            .args(["reload", "-D", &self.pgdata])
+            .output()
+            .expect("cannot run pg_ctl process");
        Ok(())
    }

@@ -723,10 +739,15 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+        let postgresql_conf_path = pgdata_path.join("postgresql.conf");
+        config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
+        // temporarily reset max_cluster_size in config
+        // to avoid the possibility of hitting the limit, while we are reconfiguring:
+        // creating new extensions, roles, etc...
+        config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
-        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
@@ -738,11 +759,16 @@ impl ComputeNode {
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
            handle_grants(&spec, &mut client, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
+            handle_extension_neon(&mut client)?;
        }

        // 'Close' connection
        drop(client);

+        // reset max_cluster_size in config back to original value and reload config
+        config::compute_ctl_temp_override_remove(pgdata_path)?;
+        self.pg_reload_conf()?;
+
        let unknown_op = "unknown".to_string();
        let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
        info!(
@@ -803,7 +829,17 @@ impl ComputeNode {

        let config_time = Utc::now();
        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
+            let pgdata_path = Path::new(&self.pgdata);
+            // temporarily reset max_cluster_size in config
+            // to avoid the possibility of hitting the limit, while we are applying config:
+            // creating new extensions, roles, etc...
+            config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+            self.pg_reload_conf()?;
+
            self.apply_config(&compute_state)?;
+
+            config::compute_ctl_temp_override_remove(pgdata_path)?;
+            self.pg_reload_conf()?;
        }

        let startup_end_time = Utc::now();
@@ -951,12 +987,12 @@ LIMIT 100",
        real_ext_name: String,
        ext_path: RemotePath,
    ) -> Result<u64, DownloadError> {
-        let remote_storage = self
-            .ext_remote_storage
-            .as_ref()
-            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                "Remote extensions storage is not configured",
-            )))?;
+        let ext_remote_storage =
+            self.ext_remote_storage
+                .as_ref()
+                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                    "Remote extensions storage is not configured",
+                )))?;

        let ext_archive_name = ext_path.object_name().expect("bad path");

@@ -1012,7 +1048,7 @@ LIMIT 100",
        let download_size = extension_server::download_extension(
            &real_ext_name,
            &ext_path,
-            remote_storage,
+            ext_remote_storage,
            &self.pgbin,
        )
        .await
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -93,5 +93,25 @@ pub fn write_postgres_conf(
        writeln!(file, "neon.extension_server_port={}", port)?;
    }

+    // This is essential to keep this line at the end of the file,
+    // because it is intended to override any settings above.
+    writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
+
+    Ok(())
+}
+
+/// create file compute_ctl_temp_override.conf in pgdata_dir
+/// add provided options to this file
+pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
+    let path = pgdata_path.join("compute_ctl_temp_override.conf");
+    let mut file = File::create(path)?;
+    write!(file, "{}", options)?;
+    Ok(())
+}
+
+/// remove file compute_ctl_temp_override.conf in pgdata_dir
+pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
+    let path = pgdata_path.join("compute_ctl_temp_override.conf");
+    std::fs::remove_file(path)?;
    Ok(())
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -71,18 +71,16 @@ More specifically, here is an example ext_index.json
    }
 }
 */
-use anyhow::Context;
 use anyhow::{self, Result};
+use anyhow::{bail, Context};
+use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
 use regex::Regex;
 use remote_storage::*;
-use serde_json;
-use std::io::Read;
-use std::num::{NonZeroU32, NonZeroUsize};
+use reqwest::StatusCode;
 use std::path::Path;
 use std::str;
 use tar::Archive;
-use tokio::io::AsyncReadExt;
 use tracing::info;
 use tracing::log::warn;
 use zstd::stream::read::Decoder;
@@ -133,67 +131,36 @@ fn parse_pg_version(human_version: &str) -> &str {
    panic!("Unsuported postgres version {human_version}");
 }

-#[cfg(test)]
-mod tests {
-    use super::parse_pg_version;
-
-    #[test]
-    fn test_parse_pg_version() {
-        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
-        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
-        assert_eq!(
-            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
-            "v15"
-        );
-
-        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
-        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
-        assert_eq!(
-            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
-            "v14"
-        );
-
-        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
-        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_parse_pg_unsupported_version() {
-        parse_pg_version("PostgreSQL 13.14");
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_parse_pg_incorrect_version_format() {
-        parse_pg_version("PostgreSQL 14");
-    }
-}
-
 // download the archive for a given extension,
 // unzip it, and place files in the appropriate locations (share/lib)
 pub async fn download_extension(
    ext_name: &str,
    ext_path: &RemotePath,
-    remote_storage: &GenericRemoteStorage,
+    ext_remote_storage: &str,
    pgbin: &str,
 ) -> Result<u64> {
    info!("Download extension {:?} from {:?}", ext_name, ext_path);
-    let mut download = remote_storage.download(ext_path).await?;
-    let mut download_buffer = Vec::new();
-    download
-        .download_stream
-        .read_to_end(&mut download_buffer)
-        .await?;
+
+    // TODO add retry logic
+    let download_buffer =
+        match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
+            Ok(buffer) => buffer,
+            Err(error_message) => {
+                return Err(anyhow::anyhow!(
+                    "error downloading extension {:?}: {:?}",
+                    ext_name,
+                    error_message
+                ));
+            }
+        };
+
    let download_size = download_buffer.len() as u64;
+    info!("Download size {:?}", download_size);
    // it's unclear whether it is more performant to decompress into memory or not
    // TODO: decompressing into memory can be avoided
-    let mut decoder = Decoder::new(download_buffer.as_slice())?;
-    let mut decompress_buffer = Vec::new();
-    decoder.read_to_end(&mut decompress_buffer)?;
-    let mut archive = Archive::new(decompress_buffer.as_slice());
+    let decoder = Decoder::new(download_buffer.as_ref())?;
+    let mut archive = Archive::new(decoder);
+
    let unzip_dest = pgbin
        .strip_suffix("/bin/postgres")
        .expect("bad pgbin")
@@ -261,29 +228,69 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
    }
 }

-// This function initializes the necessary structs to use remote storage
-pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
-    #[derive(Debug, serde::Deserialize)]
-    struct RemoteExtJson {
-        bucket: String,
-        region: String,
-        endpoint: Option<String>,
-        prefix: Option<String>,
-    }
-    let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
+// Do request to extension storage proxy, i.e.
+// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
+// using HHTP GET
+// and return the response body as bytes
+//
+async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
+    let uri = format!("{}/{}", ext_remote_storage, ext_path);

-    let config = S3Config {
-        bucket_name: remote_ext_json.bucket,
-        bucket_region: remote_ext_json.region,
-        prefix_in_bucket: remote_ext_json.prefix,
-        endpoint: remote_ext_json.endpoint,
-        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
-        max_keys_per_list_response: None,
-    };
-    let config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
-        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
-        storage: RemoteStorageKind::AwsS3(config),
-    };
-    GenericRemoteStorage::from_config(&config)
+    info!("Download extension {:?} from uri {:?}", ext_path, uri);
+
+    let resp = reqwest::get(uri).await?;
+
+    match resp.status() {
+        StatusCode::OK => match resp.bytes().await {
+            Ok(resp) => {
+                info!("Download extension {:?} completed successfully", ext_path);
+                Ok(resp)
+            }
+            Err(e) => bail!("could not deserialize remote extension response: {}", e),
+        },
+        StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"),
+        _ => bail!(
+            "unexpected remote extension response status code: {}",
+            resp.status()
+        ),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::parse_pg_version;
+
+    #[test]
+    fn test_parse_pg_version() {
+        assert_eq!(parse_pg_version("PostgreSQL 15.4"), "v15");
+        assert_eq!(parse_pg_version("PostgreSQL 15.14"), "v15");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
+            "v15"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 14.15"), "v14");
+        assert_eq!(parse_pg_version("PostgreSQL 14.0"), "v14");
+        assert_eq!(
+            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
+            "v14"
+        );
+
+        assert_eq!(parse_pg_version("PostgreSQL 16devel"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), "v16");
+        assert_eq!(parse_pg_version("PostgreSQL 16extra"), "v16");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_unsupported_version() {
+        parse_pg_version("PostgreSQL 13.14");
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_parse_pg_incorrect_version_format() {
+        parse_pg_version("PostgreSQL 14");
+    }
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -123,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

-        // download extension files from S3 on demand
+        // download extension files from remote extension storage on demand
        (&Method::POST, route) if route.starts_with("/extension_server/") => {
            info!("serving {:?} POST request", route);
            info!("req.uri {:?}", req.uri());
@@ -227,7 +227,7 @@ async fn handle_configure_request(

        let parsed_spec = match ParsedSpec::try_from(spec) {
            Ok(ps) => ps,
-            Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
+            Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
        };

        // XXX: wrap state update under lock in code blocks. Otherwise,
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -156,17 +156,17 @@ paths:
                description: Error text or 'OK' if download succeeded.
                example: "OK"
        400:
-        description: Request is invalid.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"
+          description: Request is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
        500:
-        description: Extension download request failed.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"
+          description: Extension download request failed.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -1,7 +1,7 @@
-//!
 //! Various tools and helpers to handle cluster / compute node (Postgres)
 //! configuration.
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -193,16 +193,11 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
    let postgres_roles = xact
-        .query(
-            "SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
-            &[],
-        )?
+        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
        .iter()
        .map(|row| Role {
            name: row.get("rolname"),
            encrypted_password: row.get("rolpassword"),
-            replication: Some(row.get("rolreplication")),
-            bypassrls: Some(row.get("rolbypassrls")),
            options: None,
        })
        .collect();
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -118,19 +118,6 @@ pub fn get_spec_from_control_plane(
    spec
 }

-/// It takes cluster specification and does the following:
-/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
-/// - Update `pg_hba.conf` to allow external connections.
-pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
-    // File `postgresql.conf` is no longer included into `basebackup`, so just
-    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
-
-    update_pg_hba(pgdata_path)?;
-
-    Ok(())
-}
-
 /// Check `pg_hba.conf` and update if needed to allow external connections.
 pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
    // XXX: consider making it a part of spec.json
@@ -265,8 +252,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        let action = if let Some(r) = pg_role {
            if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
                || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
-                || !r.bypassrls.unwrap_or(false)
-                || !r.replication.unwrap_or(false)
            {
                RoleAction::Update
            } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -298,14 +283,22 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
        match action {
            RoleAction::None => {}
            RoleAction::Update => {
-                let mut query: String =
-                    format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
+                // This can be run on /every/ role! Not just ones created through the console.
+                // This means that if you add some funny ALTER here that adds a permission,
+                // this will get run even on user-created roles! This will result in different
+                // behavior before and after a spec gets reapplied. The below ALTER as it stands
+                // now only grants LOGIN and changes the password. Please do not allow this branch
+                // to do anything silly.
+                let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
            }
            RoleAction::Create => {
+                // This branch only runs when roles are created through the console, so it is
+                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
+                // from neon_superuser.
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
@@ -674,3 +667,33 @@ pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()>

    Ok(())
 }
+
+/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
+#[instrument(skip_all)]
+pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
+    info!("handle extension neon");
+
+    let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
+    client.simple_query(query)?;
+
+    query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
+    info!("create neon extension with query: {}", query);
+    client.simple_query(query)?;
+
+    query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'";
+    client.simple_query(query)?;
+
+    query = "ALTER EXTENSION neon SET SCHEMA neon";
+    info!("alter neon extension schema with query: {}", query);
+    client.simple_query(query)?;
+
+    // this will be a no-op if extension is already up to date,
+    // which may happen in two cases:
+    // - extension was just installed
+    // - extension was already installed and is up to date
+    let query = "ALTER EXTENSION neon UPDATE";
+    info!("update neon extension schema with query: {}", query);
+    client.simple_query(query)?;
+
+    Ok(())
+}
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -9,6 +9,7 @@ pub struct AttachmentService {
    env: LocalEnv,
    listen: String,
    path: PathBuf,
+    client: reqwest::blocking::Client,
 }

 const COMMAND: &str = "attachment_service";
@@ -24,6 +25,16 @@ pub struct AttachHookResponse {
    pub gen: Option<u32>,
 }

+#[derive(Serialize, Deserialize)]
+pub struct InspectRequest {
+    pub tenant_id: TenantId,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct InspectResponse {
+    pub attachment: Option<(u32, NodeId)>,
+}
+
 impl AttachmentService {
    pub fn from_env(env: &LocalEnv) -> Self {
        let path = env.base_data_dir.join("attachments.json");
@@ -42,6 +53,9 @@ impl AttachmentService {
            env: env.clone(),
            path,
            listen,
+            client: reqwest::blocking::ClientBuilder::new()
+                .build()
+                .expect("Failed to construct http client"),
        }
    }

@@ -84,16 +98,13 @@ impl AttachmentService {
            .unwrap()
            .join("attach-hook")
            .unwrap();
-        let client = reqwest::blocking::ClientBuilder::new()
-            .build()
-            .expect("Failed to construct http client");

        let request = AttachHookRequest {
            tenant_id,
            node_id: Some(pageserver_id),
        };

-        let response = client.post(url).json(&request).send()?;
+        let response = self.client.post(url).json(&request).send()?;
        if response.status() != StatusCode::OK {
            return Err(anyhow!("Unexpected status {}", response.status()));
        }
@@ -101,4 +112,26 @@ impl AttachmentService {
        let response = response.json::<AttachHookResponse>()?;
        Ok(response.gen)
    }
+
+    pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
+        use hyper::StatusCode;
+
+        let url = self
+            .env
+            .control_plane_api
+            .clone()
+            .unwrap()
+            .join("inspect")
+            .unwrap();
+
+        let request = InspectRequest { tenant_id };
+
+        let response = self.client.post(url).json(&request).send()?;
+        if response.status() != StatusCode::OK {
+            return Err(anyhow!("Unexpected status {}", response.status()));
+        }
+
+        let response = response.json::<InspectResponse>()?;
+        Ok(response.attachment)
+    }
 }
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -262,7 +262,7 @@ where
    P: Into<Utf8PathBuf>,
 {
    let path: Utf8PathBuf = path.into();
-    // SAFETY
+    // SAFETY:
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
    // Long answer:  https://github.com/rust-lang/rust/issues/39575
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -9,6 +9,7 @@ use clap::Parser;
 use hex::FromHex;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response};
+use pageserver_api::shard::TenantShardId;
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
@@ -32,7 +33,9 @@ use pageserver_api::control_api::{
    ValidateResponseTenant,
 };

-use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
+use control_plane::attachment_service::{
+    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
+};

 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
@@ -171,7 +174,8 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
        if state.pageserver == Some(reattach_req.node_id) {
            state.generation += 1;
            response.tenants.push(ReAttachResponseTenant {
-                id: *t,
+                // TODO(sharding): make this shard-aware
+                id: TenantShardId::unsharded(*t),
                gen: state.generation,
            });
        }
@@ -194,8 +198,15 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
    };

    for req_tenant in validate_req.tenants {
-        if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
+        // TODO(sharding): make this shard-aware
+        if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
            let valid = tenant_state.generation == req_tenant.gen;
+            tracing::info!(
+                "handle_validate: {}(gen {}): valid={valid} (latest {})",
+                req_tenant.id,
+                req_tenant.gen,
+                tenant_state.generation
+            );
            response.tenants.push(ValidateResponseTenant {
                id: req_tenant.id,
                valid,
@@ -245,6 +256,13 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    tenant_state.pageserver = attach_req.node_id;
    let generation = tenant_state.generation;

+    tracing::info!(
+        "handle_attach_hook: tenant {} set generation {}, pageserver {}",
+        attach_req.tenant_id,
+        tenant_state.generation,
+        attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+    );
+
    locked.save().await.map_err(ApiError::InternalServerError)?;

    json_response(
@@ -255,12 +273,28 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
    )
 }

+async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let inspect_req = json_request::<InspectRequest>(&mut req).await?;
+
+    let state = get_state(&req).inner.clone();
+    let locked = state.write().await;
+    let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
+
+    json_response(
+        StatusCode::OK,
+        InspectResponse {
+            attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
+        },
+    )
+}
+
 fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
        .post("/re-attach", |r| request_span(r, handle_re_attach))
        .post("/validate", |r| request_span(r, handle_validate))
        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
+        .post("/inspect", |r| request_span(r, handle_inspect))
 }

 #[tokio::main]
@@ -268,6 +302,7 @@ async fn main() -> anyhow::Result<()> {
    logging::init(
        LogFormat::Plain,
        logging::TracingErrorLayerEnablement::Disabled,
+        logging::Output::Stdout,
    )?;

    let args = Cli::parse();
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -11,13 +11,14 @@ use compute_api::spec::ComputeMode;
 use control_plane::attachment_service::AttachmentService;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::LocalEnv;
-use control_plane::pageserver::PageServerNode;
+use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
+use control_plane::tenant_migration::migrate_tenant;
 use control_plane::{broker, local_env};
 use pageserver_api::models::TimelineInfo;
 use pageserver_api::{
-    DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
-    DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
+    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
+    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
 use postgres_backend::AuthType;
 use safekeeper_api::{
@@ -46,8 +47,8 @@ const DEFAULT_PG_VERSION: &str = "15";

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";

-fn default_conf() -> String {
-    format!(
+fn default_conf(num_pageservers: u16) -> String {
+    let mut template = format!(
        r#"
 # Default built-in configuration, defined in main.rs
 control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
@@ -55,21 +56,33 @@ control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
 [broker]
 listen_addr = '{DEFAULT_BROKER_ADDR}'

-[[pageservers]]
-id = {DEFAULT_PAGESERVER_ID}
-listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
-listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
-pg_auth_type = '{trust_auth}'
-http_auth_type = '{trust_auth}'
-
 [[safekeepers]]
 id = {DEFAULT_SAFEKEEPER_ID}
 pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
 http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}

 "#,
-        trust_auth = AuthType::Trust,
-    )
+    );
+
+    for i in 0..num_pageservers {
+        let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
+        let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
+        let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
+
+        template += &format!(
+            r#"
+[[pageservers]]
+id = {pageserver_id}
+listen_pg_addr = '127.0.0.1:{pg_port}'
+listen_http_addr = '127.0.0.1:{http_port}'
+pg_auth_type = '{trust_auth}'
+http_auth_type = '{trust_auth}'
+"#,
+            trust_auth = AuthType::Trust,
+        )
+    }
+
+    template
 }

 ///
@@ -155,7 +168,7 @@ fn print_timelines_tree(
                    info: t.clone(),
                    children: BTreeSet::new(),
                    name: timeline_name_mappings
-                        .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
+                        .remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)),
                },
            )
        })
@@ -295,6 +308,9 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }

 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
+    let num_pageservers = init_match
+        .get_one::<u16>("num-pageservers")
+        .expect("num-pageservers arg has a default");
    // Create config file
    let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
        // load and parse the file
@@ -306,7 +322,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
        })?
    } else {
        // Built-in default config
-        default_conf()
+        default_conf(*num_pageservers)
    };

    let pg_version = init_match
@@ -320,6 +336,9 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
    env.init(pg_version, force)
        .context("Failed to initialize neon repository")?;

+    // Create remote storage location for default LocalFs remote storage
+    std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
+
    // Initialize pageserver, create initial tenant and timeline.
    for ps_conf in &env.pageservers {
        PageServerNode::from_env(&env, ps_conf)
@@ -396,6 +415,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                None,
                None,
                Some(pg_version),
+                None,
            )?;
            let new_timeline_id = timeline_info.timeline_id;
            let last_record_lsn = timeline_info.last_record_lsn;
@@ -433,6 +453,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
+        Some(("migrate", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+            let new_pageserver = get_pageserver(env, matches)?;
+            let new_pageserver_id = new_pageserver.conf.id;
+
+            migrate_tenant(env, tenant_id, new_pageserver)?;
+            println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
+        }
+
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
    }
@@ -459,8 +488,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

-            let timeline_info =
-                pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
+            let new_timeline_id_opt = parse_timeline_id(create_match)?;
+
+            let timeline_info = pageserver.timeline_create(
+                tenant_id,
+                new_timeline_id_opt,
+                None,
+                None,
+                Some(pg_version),
+                None,
+            )?;
            let new_timeline_id = timeline_info.timeline_id;

            let last_record_lsn = timeline_info.last_record_lsn;
@@ -547,6 +584,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                start_lsn,
                Some(ancestor_timeline_id),
                None,
+                None,
            )?;
            let new_timeline_id = timeline_info.timeline_id;

@@ -573,11 +611,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
    };
    let mut cplane = ComputeControlPlane::load(env.clone())?;

-    // All subcommands take an optional --tenant-id option
-    let tenant_id = get_tenant_id(sub_args, env)?;
-
    match sub_name {
        "list" => {
+            let tenant_id = get_tenant_id(sub_args, env)?;
            let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
                eprintln!("Failed to load timeline info: {}", e);
                HashMap::new()
@@ -637,6 +673,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            println!("{table}");
        }
        "create" => {
+            let tenant_id = get_tenant_id(sub_args, env)?;
            let branch_name = sub_args
                .get_one::<String>("branch-name")
                .map(|s| s.as_str())
@@ -681,6 +718,18 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
            };

+            match (mode, hot_standby) {
+                (ComputeMode::Static(_), true) => {
+                    bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
+                }
+                (ComputeMode::Primary, true) => {
+                    bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
+                }
+                _ => {}
+            }
+
+            cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
+
            cplane.new_endpoint(
                &endpoint_id,
                tenant_id,
@@ -693,8 +742,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            )?;
        }
        "start" => {
-            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
-            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
@@ -723,80 +770,28 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    env.safekeepers.iter().map(|sk| sk.id).collect()
                };

-            let endpoint = cplane.endpoints.get(endpoint_id.as_str());
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
+
+            cplane.check_conflicting_endpoints(
+                endpoint.mode,
+                endpoint.tenant_id,
+                endpoint.timeline_id,
+            )?;

            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
-                let claims = Claims::new(Some(tenant_id), Scope::Tenant);
+                let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);

                Some(env.generate_auth_token(&claims)?)
            } else {
                None
            };

-            let hot_standby = sub_args
-                .get_one::<bool>("hot-standby")
-                .copied()
-                .unwrap_or(false);
-
-            if let Some(endpoint) = endpoint {
-                match (&endpoint.mode, hot_standby) {
-                    (ComputeMode::Static(_), true) => {
-                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
-                    }
-                    (ComputeMode::Primary, true) => {
-                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
-                    }
-                    _ => {}
-                }
-                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
-            } else {
-                let branch_name = sub_args
-                    .get_one::<String>("branch-name")
-                    .map(|s| s.as_str())
-                    .unwrap_or(DEFAULT_BRANCH_NAME);
-                let timeline_id = env
-                    .get_branch_timeline_id(branch_name, tenant_id)
-                    .ok_or_else(|| {
-                        anyhow!("Found no timeline id for branch name '{branch_name}'")
-                    })?;
-                let lsn = sub_args
-                    .get_one::<String>("lsn")
-                    .map(|lsn_str| Lsn::from_str(lsn_str))
-                    .transpose()
-                    .context("Failed to parse Lsn from the request")?;
-                let pg_version = sub_args
-                    .get_one::<u32>("pg-version")
-                    .copied()
-                    .context("Failed to `pg-version` from the argument string")?;
-
-                let mode = match (lsn, hot_standby) {
-                    (Some(lsn), false) => ComputeMode::Static(lsn),
-                    (None, true) => ComputeMode::Replica,
-                    (None, false) => ComputeMode::Primary,
-                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
-                };
-
-                // when used with custom port this results in non obvious behaviour
-                // port is remembered from first start command, i e
-                // start --port X
-                // stop
-                // start <-- will also use port X even without explicit port argument
-                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
-
-                let ep = cplane.new_endpoint(
-                    endpoint_id,
-                    tenant_id,
-                    timeline_id,
-                    pg_port,
-                    http_port,
-                    pg_version,
-                    mode,
-                    pageserver_id,
-                )?;
-                ep.start(&auth_token, safekeepers, remote_ext_config)?;
-            }
+            println!("Starting existing endpoint {endpoint_id}...");
+            endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
        }
        "reconfigure" => {
            let endpoint_id = sub_args
@@ -867,20 +862,20 @@ fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Res
    }
 }

+fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
+    let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
+        NodeId(id_str.parse().context("while parsing pageserver id")?)
+    } else {
+        DEFAULT_PAGESERVER_ID
+    };
+
+    Ok(PageServerNode::from_env(
+        env,
+        env.get_pageserver_conf(node_id)?,
+    ))
+}
+
 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
-        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
-            NodeId(id_str.parse().context("while parsing pageserver id")?)
-        } else {
-            DEFAULT_PAGESERVER_ID
-        };
-
-        Ok(PageServerNode::from_env(
-            env,
-            env.get_pageserver_conf(node_id)?,
-        ))
-    }
-
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
            if let Err(e) = get_pageserver(env, subcommand_args)?
@@ -917,6 +912,20 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
            }
        }

+        Some(("migrate", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            //TODO what shutdown strategy should we use here?
+            if let Err(e) = pageserver.stop(false) {
+                eprintln!("pageserver stop failed: {}", e);
+                exit(1);
+            }
+
+            if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
+                eprintln!("pageserver start failed: {e}");
+                exit(1);
+            }
+        }
+
        Some(("status", subcommand_args)) => {
            match get_pageserver(env, subcommand_args)?.check_status() {
                Ok(_) => println!("Page server is up and running"),
@@ -1203,7 +1212,7 @@ fn cli() -> Command {
    let remote_ext_config_args = Arg::new("remote-ext-config")
        .long("remote-ext-config")
        .num_args(1)
-        .help("Configure the S3 bucket that we search for extensions in.")
+        .help("Configure the remote extensions storage proxy gateway to request for extensions.")
        .required(false);

    let lsn_arg = Arg::new("lsn")
@@ -1224,6 +1233,13 @@ fn cli() -> Command {
        .help("Force initialization even if the repository is not empty")
        .required(false);

+    let num_pageservers_arg = Arg::new("num-pageservers")
+        .value_parser(value_parser!(u16))
+        .long("num-pageservers")
+        .help("How many pageservers to create (default 1)")
+        .required(false)
+        .default_value("1");
+
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1231,6 +1247,7 @@ fn cli() -> Command {
            Command::new("init")
                .about("Initialize a new Neon repository, preparing configs for services to start with")
                .arg(pageserver_config_args.clone())
+                .arg(num_pageservers_arg.clone())
                .arg(
                    Arg::new("config")
                        .long("config")
@@ -1258,6 +1275,7 @@ fn cli() -> Command {
            .subcommand(Command::new("create")
                .about("Create a new blank timeline")
                .arg(tenant_id_arg.clone())
+                .arg(timeline_id_arg.clone())
                .arg(branch_name_arg.clone())
                .arg(pg_version_arg.clone())
            )
@@ -1301,6 +1319,10 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
+            .subcommand(Command::new("migrate")
+                .about("Migrate a tenant from one pageserver to another")
+                .arg(tenant_id_arg.clone())
+                .arg(pageserver_id_arg.clone()))
        )
        .subcommand(
            Command::new("pageserver")
@@ -1375,15 +1397,7 @@ fn cli() -> Command {
                .subcommand(Command::new("start")
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
                    .arg(endpoint_id_arg.clone())
-                    .arg(tenant_id_arg.clone())
-                    .arg(branch_name_arg.clone())
-                    .arg(timeline_id_arg.clone())
-                    .arg(lsn_arg)
-                    .arg(pg_port_arg)
-                    .arg(http_port_arg)
                    .arg(endpoint_pageserver_id_arg.clone())
-                    .arg(pg_version_arg)
-                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
                    .arg(remote_ext_config_args)
                )
@@ -1396,7 +1410,6 @@ fn cli() -> Command {
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
-                    .arg(tenant_id_arg.clone())
                    .arg(
                        Arg::new("destroy")
                            .help("Also delete data directory (now optional, should be default in future)")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -45,6 +45,7 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
+use compute_api::spec::RemoteExtSpec;
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};

@@ -124,6 +125,7 @@ impl ComputeControlPlane {
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
        let pageserver =
            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
@@ -168,6 +170,30 @@ impl ComputeControlPlane {

        Ok(ep)
    }
+
+    pub fn check_conflicting_endpoints(
+        &self,
+        mode: ComputeMode,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<()> {
+        if matches!(mode, ComputeMode::Primary) {
+            // this check is not complete, as you could have a concurrent attempt at
+            // creating another primary, both reading the state before checking it here,
+            // but it's better than nothing.
+            let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
+                v.tenant_id == tenant_id
+                    && v.timeline_id == timeline_id
+                    && v.mode == mode
+                    && v.status() != "stopped"
+            });
+
+            if let Some((key, _)) = duplicates.next() {
+                bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
+            }
+        }
+        Ok(())
+    }
 }

 ///////////////////////////////////////////////////////////////////////////////
@@ -476,11 +502,24 @@ impl Endpoint {
            }
        }

+        // check for file remote_extensions_spec.json
+        // if it is present, read it and pass to compute_ctl
+        let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json");
+        let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path);
+        let remote_extensions: Option<RemoteExtSpec>;
+
+        if let Ok(spec_file) = remote_extensions_spec {
+            remote_extensions = serde_json::from_reader(spec_file).ok();
+        } else {
+            remote_extensions = None;
+        };
+
        // Create spec file
        let spec = ComputeSpec {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
+            features: vec![],
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
@@ -497,7 +536,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            remote_extensions: None,
+            remote_extensions,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -1,11 +1,10 @@
-//
-// Local control plane.
-//
-// Can start, configure and stop postgres instances running as a local processes.
-//
-// Intended to be used in integration tests and in CLI tools for
-// local installations.
-//
+//! Local control plane.
+//!
+//! Can start, configure and stop postgres instances running as a local processes.
+//!
+//! Intended to be used in integration tests and in CLI tools for
+//! local installations.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod attachment_service;
 mod background_process;
@@ -15,3 +14,4 @@ pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
+pub mod tenant_migration;
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -11,11 +11,15 @@ use std::io::{BufReader, Write};
 use std::num::NonZeroU64;
 use std::path::PathBuf;
 use std::process::{Child, Command};
+use std::time::Duration;
 use std::{io, result};

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
-use pageserver_api::models::{self, TenantInfo, TimelineInfo};
+use pageserver_api::models::{
+    self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
+};
+use pageserver_api::shard::TenantShardId;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use reqwest::blocking::{Client, RequestBuilder, Response};
@@ -31,6 +35,9 @@ use utils::{
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

+/// Directory within .neon which will be used by default for LocalFs remote storage.
+pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
+
 #[derive(Error, Debug)]
 pub enum PageserverHttpError {
    #[error("Reqwest error: {0}")]
@@ -98,8 +105,10 @@ impl PageServerNode {
        }
    }

-    // pageserver conf overrides defined by neon_local configuration.
-    fn neon_local_overrides(&self) -> Vec<String> {
+    /// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration.
+    ///
+    /// These all end up on the command line of the `pageserver` binary.
+    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
        let id = format!("id={}", self.conf.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
@@ -132,12 +141,25 @@ impl PageServerNode {
            ));
        }

+        if !cli_overrides
+            .iter()
+            .any(|c| c.starts_with("remote_storage"))
+        {
+            overrides.push(format!(
+                "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
+            ));
+        }
+
        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
        {
            // Keys are generated in the toplevel repo dir, pageservers' workdirs
            // are one level below that, so refer to keys with ../
            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
        }
+
+        // Apply the user-provided overrides
+        overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
+
        overrides
    }

@@ -203,9 +225,6 @@ impl PageServerNode {
    }

    fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
-        let mut overrides = self.neon_local_overrides();
-        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
-
        let datadir = self.repo_path();
        print!(
            "Starting pageserver node {} at '{}' in {:?}",
@@ -248,8 +267,7 @@ impl PageServerNode {
    ) -> Vec<Cow<'a, str>> {
        let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];

-        let mut overrides = self.neon_local_overrides();
-        overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
+        let overrides = self.neon_local_overrides(config_overrides);
        for config_override in overrides {
            args.push(Cow::Borrowed("-c"));
            args.push(Cow::Owned(config_override));
@@ -389,10 +407,11 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'gc_feedback' as bool")?,
+            heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
        };

        let request = models::TenantCreateRequest {
-            new_tenant_id,
+            new_tenant_id: TenantShardId::unsharded(new_tenant_id),
            generation,
            config,
        };
@@ -486,6 +505,7 @@ impl PageServerNode {
                    .map(|x| x.parse::<bool>())
                    .transpose()
                    .context("Failed to parse 'gc_feedback' as bool")?,
+                heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
            }
        };

@@ -501,6 +521,32 @@ impl PageServerNode {
        Ok(())
    }

+    pub fn location_config(
+        &self,
+        tenant_id: TenantId,
+        config: LocationConfig,
+        flush_ms: Option<Duration>,
+    ) -> anyhow::Result<()> {
+        let req_body = TenantLocationConfigRequest { tenant_id, config };
+
+        let path = format!(
+            "{}/tenant/{}/location_config",
+            self.http_base_url, tenant_id
+        );
+        let path = if let Some(flush_ms) = flush_ms {
+            format!("{}?flush_ms={}", path, flush_ms.as_millis())
+        } else {
+            path
+        };
+
+        self.http_request(Method::PUT, path)?
+            .json(&req_body)
+            .send()?
+            .error_from_body()?;
+
+        Ok(())
+    }
+
    pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
        let timeline_infos: Vec<TimelineInfo> = self
            .http_request(
@@ -521,6 +567,7 @@ impl PageServerNode {
        ancestor_start_lsn: Option<Lsn>,
        ancestor_timeline_id: Option<TimelineId>,
        pg_version: Option<u32>,
+        existing_initdb_timeline_id: Option<TimelineId>,
    ) -> anyhow::Result<TimelineInfo> {
        // If timeline ID was not specified, generate one
        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
@@ -534,6 +581,7 @@ impl PageServerNode {
            ancestor_start_lsn,
            ancestor_timeline_id,
            pg_version,
+            existing_initdb_timeline_id,
        })
        .send()?
        .error_from_body()?
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -0,0 +1,197 @@
+//!
+//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
+//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
+//! point to the new pageserver.
+//!
+use crate::local_env::LocalEnv;
+use crate::{
+    attachment_service::AttachmentService, endpoint::ComputeControlPlane,
+    pageserver::PageServerNode,
+};
+use pageserver_api::models::{
+    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
+};
+use std::collections::HashMap;
+use std::time::Duration;
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+/// Given an attached pageserver, retrieve the LSN for all timelines
+fn get_lsns(
+    tenant_id: TenantId,
+    pageserver: &PageServerNode,
+) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
+    let timelines = pageserver.timeline_list(&tenant_id)?;
+    Ok(timelines
+        .into_iter()
+        .map(|t| (t.timeline_id, t.last_record_lsn))
+        .collect())
+}
+
+/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
+/// `baseline`.
+fn await_lsn(
+    tenant_id: TenantId,
+    pageserver: &PageServerNode,
+    baseline: HashMap<TimelineId, Lsn>,
+) -> anyhow::Result<()> {
+    loop {
+        let latest = match get_lsns(tenant_id, pageserver) {
+            Ok(l) => l,
+            Err(e) => {
+                println!(
+                    "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
+                    pageserver.conf.id
+                );
+                std::thread::sleep(Duration::from_millis(500));
+                continue;
+            }
+        };
+
+        let mut any_behind: bool = false;
+        for (timeline_id, baseline_lsn) in &baseline {
+            match latest.get(timeline_id) {
+                Some(latest_lsn) => {
+                    println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
+                    if latest_lsn < baseline_lsn {
+                        any_behind = true;
+                    }
+                }
+                None => {
+                    // Expected timeline isn't yet visible on migration destination.
+                    // (IRL we would have to account for timeline deletion, but this
+                    //  is just test helper)
+                    any_behind = true;
+                }
+            }
+        }
+
+        if !any_behind {
+            println!("✅ LSN caught up.  Proceeding...");
+            break;
+        } else {
+            std::thread::sleep(Duration::from_millis(500));
+        }
+    }
+
+    Ok(())
+}
+
+/// This function spans multiple services, to demonstrate live migration of a tenant
+/// between pageservers:
+///  - Coordinate attach/secondary/detach on pageservers
+///  - call into attachment_service for generations
+///  - reconfigure compute endpoints to point to new attached pageserver
+pub fn migrate_tenant(
+    env: &LocalEnv,
+    tenant_id: TenantId,
+    dest_ps: PageServerNode,
+) -> anyhow::Result<()> {
+    // Get a new generation
+    let attachment_service = AttachmentService::from_env(env);
+
+    fn build_location_config(
+        mode: LocationConfigMode,
+        generation: Option<u32>,
+        secondary_conf: Option<LocationConfigSecondary>,
+    ) -> LocationConfig {
+        LocationConfig {
+            mode,
+            generation,
+            secondary_conf,
+            tenant_conf: TenantConfig::default(),
+            shard_number: 0,
+            shard_count: 0,
+            shard_stripe_size: 0,
+        }
+    }
+
+    let previous = attachment_service.inspect(tenant_id)?;
+    let mut baseline_lsns = None;
+    if let Some((generation, origin_ps_id)) = &previous {
+        let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
+
+        if origin_ps_id == &dest_ps.conf.id {
+            println!("🔁 Already attached to {origin_ps_id}, freshening...");
+            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+            let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
+            dest_ps.location_config(tenant_id, dest_conf, None)?;
+            println!("✅ Migration complete");
+            return Ok(());
+        }
+
+        println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
+
+        let stale_conf =
+            build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
+        origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
+
+        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
+    }
+
+    let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
+    let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
+
+    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
+    dest_ps.location_config(tenant_id, dest_conf, None)?;
+
+    if let Some(baseline) = baseline_lsns {
+        println!("🕑 Waiting for LSN to catch up...");
+        await_lsn(tenant_id, &dest_ps, baseline)?;
+    }
+
+    let cplane = ComputeControlPlane::load(env.clone())?;
+    for (endpoint_name, endpoint) in &cplane.endpoints {
+        if endpoint.tenant_id == tenant_id {
+            println!(
+                "🔁 Reconfiguring endpoint {} to use pageserver {}",
+                endpoint_name, dest_ps.conf.id
+            );
+            endpoint.reconfigure(Some(dest_ps.conf.id))?;
+        }
+    }
+
+    for other_ps_conf in &env.pageservers {
+        if other_ps_conf.id == dest_ps.conf.id {
+            continue;
+        }
+
+        let other_ps = PageServerNode::from_env(env, other_ps_conf);
+        let other_ps_tenants = other_ps.tenant_list()?;
+
+        // Check if this tenant is attached
+        let found = other_ps_tenants
+            .into_iter()
+            .map(|t| t.id)
+            .any(|i| i.tenant_id == tenant_id);
+        if !found {
+            continue;
+        }
+
+        // Downgrade to a secondary location
+        let secondary_conf = build_location_config(
+            LocationConfigMode::Secondary,
+            None,
+            Some(LocationConfigSecondary { warm: true }),
+        );
+
+        println!(
+            "💤 Switching to secondary mode on pageserver {}",
+            other_ps.conf.id
+        );
+        other_ps.location_config(tenant_id, secondary_conf, None)?;
+    }
+
+    println!(
+        "🔁 Switching to AttachedSingle mode on pageserver {}",
+        dest_ps.conf.id
+    );
+    let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
+    dest_ps.location_config(tenant_id, dest_conf, None)?;
+
+    println!("✅ Migration complete");
+
+    Ok(())
+}
--- a/deny.toml
+++ b/deny.toml
@@ -74,10 +74,30 @@ highlight = "all"
 workspace-default-features = "allow"
 external-default-features = "allow"
 allow = []
-deny = []
+
 skip = []
 skip-tree = []

+[[bans.deny]]
+# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol
+# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust
+name = "async-std"
+
+[[bans.deny]]
+name = "async-io"
+
+[[bans.deny]]
+name = "async-waker"
+
+[[bans.deny]]
+name = "async-global-executor"
+
+[[bans.deny]]
+name = "async-executor"
+
+[[bans.deny]]
+name = "smol"
+
 # This section is considered when running `cargo deny check sources`.
 # More documentation about the 'sources' section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
--- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
+++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
@@ -177,7 +177,7 @@ I e during migration create_branch can be called on old pageserver and newly cre

 The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.

-The approach largely follows this guide: <https://github.com/neondatabase/cloud/wiki/Cloud:-Ad-hoc-tenant-relocation>
+The approach largely follows this guide: <https://www.notion.so/neondatabase/Cloud-Ad-hoc-tenant-relocation-f687474f7bfc42269e6214e3acba25c7>

 The happy path sequence:

--- a/docs/rfcs/029-pageserver-wal-disaster-recovery.md
+++ b/docs/rfcs/029-pageserver-wal-disaster-recovery.md
@@ -0,0 +1,205 @@
+# Name
+
+Created on: 2023-09-08
+Author: Arpad Müller
+
+## Summary
+
+Enable the pageserver to recover from data corruption events by implementing
+a feature to re-apply historic WAL records in parallel to the already occurring
+WAL replay.
+
+The feature is outside of the user-visible backup and history story, and only
+serves as a second-level backup for the case that there is a bug in the
+pageservers that corrupted the served pages.
+
+The RFC proposes the addition of two new features:
+* recover a broken branch from WAL (downtime is allowed)
+* a test recovery system to recover random branches to make sure recovery works
+
+## Motivation
+
+The historic WAL is currently stored in S3 even after it has been replayed by
+the pageserver and thus been integrated into the pageserver's storage system.
+This is done to defend from data corruption failures inside the pageservers.
+
+However, application of this WAL in the disaster recovery setting is currently
+very manual and we want to automate this to make it easier.
+
+### Use cases
+
+There are various use cases for this feature, like:
+
+* The main motivation is replaying in the instance of pageservers corrupting
+  data.
+* We might want to, beyond the user-visible history features, through our
+  support channels and upon customer request, in select instances, recover
+  historic versions beyond the range of history that we officially support.
+* Running the recovery process in the background for random tenant timelines
+  to figure out if there was a corruption of data (we would compare with what
+  the pageserver stores for the "official" timeline).
+* Using the WAL to arrive at historic pages we can then back up to S3 so that
+  WAL itself can be discarded, or at least not used for future replays.
+  Again, this sounds a lot like what the pageserver is already doing, but the
+  point is to provide a fallback to the service provided by the pageserver.
+
+## Design
+
+### Design constraints
+
+The main design constraint is that the feature needs to be *simple* enough that
+the number of bugs are as low, and reliability as high as possible: the main
+goal of this endeavour is to achieve higher correctness than the pageserver.
+
+For the background process, we cannot afford a downtime of the timeline that is
+being cloned, as we don't want to restrict ourselves to offline tenants only.
+In the scenario where we want to recover from disasters or roll back to a
+historic lsn through support staff, downtimes are more affordable, and
+inevitable if the original had been subject to the corruption. Ideally, the
+two code paths would share code, so the solution would be designed for not
+requiring downtimes.
+
+### API endpoint changes
+
+This RFC proposes two API endpoint changes in the safekeeper and the
+pageserver.
+
+Remember, the pageserver timeline API creation endpoint is to this URL:
+
+```
+/v1/tenant/{tenant_id}/timeline/
+```
+
+Where `{tenant_id}` is the ID of the tenant the timeline is created for,
+and specified as part of the URL. The timeline ID is passed via the POST
+request body as the only required parameter `new_timeline_id`.
+
+This proposal adds one optional parameter called
+`existing_initdb_timeline_id` to the request's json body. If the parameter
+is not specified, behaviour should be as existing, so the pageserver runs
+initdb.
+If the parameter is specified, it is expected to point to a timeline ID.
+In fact that ID might match `new_timeline_id`, what's important is that
+S3 storage contains a matching initdb under the URL matching the given
+tenant and timeline.
+
+Having both `ancestor_timeline_id` and `existing_initdb_timeline_id`
+specified is illegal and will yield in an HTTP error. This feature is
+only meant for the "main" branch that doesn't have any ancestors
+of its own, as only here initdb is relevant.
+
+For the safekeeper, we propose the addition of the following copy endpoint:
+
+```
+/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy
+```
+it is meant for POST requests with json, and the two URL parameters
+`tenant_id` and `source_timeline_id`. The json request body contains
+the two required parameters `target_timeline_id` and `until_lsn`.
+
+After invoking, the copy endpoint starts a copy process of the WAL from
+the source ID to the target ID. The lsn is updated according to the
+progress of the API call.
+
+### Higher level features
+
+We want the API changes to support the following higher level features:
+
+* recovery-after-corruption DR of the main timeline of a tenant. This
+  feature allows for downtime.
+* test DR of the main timeline into a special copy timeline. this feature
+  is meant to run against selected production tenants in the background,
+  without the user noticing, so it does not allow for downtime.
+
+The recovery-after-corruption DR only needs the pageserver changes.
+It works as follows:
+
+* delete the timeline from the pageservers via timeline deletion API
+* re-create it via timeline creation API (same ID as before) and set
+  `existing_initdb_timeline_id` to the same timeline ID
+
+The test DR requires also the copy primitive and works as follows:
+
+* copy the WAL of the timeline to a new place
+* create a new timeline for the tenant
+
+## Non Goals
+
+At the danger of being repetitive, the main goal of this feature is to be a
+backup method, so reliability is very important. This implies that other
+aspects like performance or space reduction are less important.
+
+### Corrupt WAL
+
+The process suggested by this RFC assumes that the WAL is free of corruption.
+In some instances, corruption can make it into WAL, like for example when
+higher level components like postgres or the application first read corrupt
+data, and then execute a write with data derived from that earlier read. That
+written data might then contain the corruption.
+
+Common use cases can hit this quite easily. For example, an application reads
+some counter, increments it, and then writes the new counter value to the
+database.
+On a lower level, the compute might put FPIs (Full Page Images) into the WAL,
+which have corrupt data for rows unrelated to the write operation at hand.
+
+Separating corrupt writes from non-corrupt ones is a hard problem in general,
+and if the application was involved in making the corrupt write, a recovery
+would also involve the application. Therefore, corruption that has made it into
+the WAL is outside of the scope of this feature. However, the WAL replay can be
+issued to right before the point in time where the corruption occured. Then the
+data loss is isolated to post-corruption writes only.
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+Most changes would happen to the pageservers.
+For the higher level features, maybe other components like the console would
+be involved.
+
+We need to make sure that the shadow timelines are not subject to the usual
+limits and billing we apply to existing timelines.
+
+## Proposed implementation
+
+The first problem to keep in mind is the reproducability of `initdb`.
+So an initial step would be to upload `initdb` snapshots to S3.
+
+After that, we'd have the endpoint spawn a background process which
+performs the replay of the WAL to that new timeline. This process should
+follow the existing workflows as closely as possible, just using the
+WAL records of a different timeline.
+
+The timeline created will be in a special state that solely looks for WAL
+entries of the timeline it is trying to copy. Once the target LSN is reached,
+it turns into a normal timeline that also accepts writes to its own
+timeline ID.
+
+### Scalability
+
+For now we want to run this entire process on a single node, and as
+it is by nature linear, it's hard to parallelize. However, for the
+verification workloads, we can easily start the WAL replay in parallel
+for different points in time. This is valuable especially for tenants
+with large WAL records.
+
+Compare this with the tricks to make addition circuits execute with
+lower latency by making them perform the addition for both possible
+values of the carry bit, and then, in a second step, taking the
+result for the carry bit that was actually obtained.
+
+The other scalability dimension to consider is the WAL length, which
+is a growing question as tenants accumulate changes. There are
+possible approaches to this, including creating snapshots of the
+page files and uploading them to S3, but if we do this for every single
+branch, we lose the cheap branching property.
+
+### Implementation by component
+
+The proposed changes for the various components of the neon architecture
+are written up in this notion page:
+
+https://www.notion.so/neondatabase/Pageserver-disaster-recovery-one-pager-4ecfb5df16ce4f6bbfc3817ed1a6cbb2
+
+### Unresolved questions
+
+none known (outside of the mentioned ones).
--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod requests;
 pub mod responses;
 pub mod spec;
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -26,6 +26,13 @@ pub struct ComputeSpec {
    // but we don't use it for anything. Serde will ignore missing fields when
    // deserializing it.
    pub operation_uuid: Option<String>,
+
+    /// Compute features to enable. These feature flags are provided, when we
+    /// know all the details about client's compute, so they cannot be used
+    /// to change `Empty` compute behavior.
+    #[serde(default)]
+    pub features: Vec<ComputeFeature>,
+
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
@@ -68,6 +75,19 @@ pub struct ComputeSpec {
    pub remote_extensions: Option<RemoteExtSpec>,
 }

+/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ComputeFeature {
+    // XXX: Add more feature flags here.
+
+    // This is a special feature flag that is used to represent unknown feature flags.
+    // Basically all unknown to enum flags are represented as this one. See unit test
+    // `parse_unknown_features()` for more details.
+    #[serde(other)]
+    UnknownFeature,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -187,8 +207,6 @@ pub struct DeltaOp {
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
-    pub replication: Option<bool>,
-    pub bypassrls: Option<bool>,
    pub options: GenericOptions,
 }

@@ -229,7 +247,10 @@ mod tests {
    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
-        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+
+        // Features list defaults to empty vector.
+        assert!(spec.features.is_empty());
    }

    #[test]
@@ -241,4 +262,22 @@ mod tests {
        ob.insert("unknown_field_123123123".into(), "hello".into());
        let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
    }
+
+    #[test]
+    fn parse_unknown_features() {
+        // Test that unknown feature flags do not cause any errors.
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
+        let ob = json.as_object_mut().unwrap();
+
+        // Add unknown feature flags.
+        let features = vec!["foo_bar_feature", "baz_feature"];
+        ob.insert("features".into(), features.into());
+
+        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
+
+        assert!(spec.features.len() == 2);
+        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
+        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
+    }
 }
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -1,6 +1,6 @@
-//!
 //! Shared code for consumption metics collection
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -2,8 +2,12 @@
 //! make sure that we use the same dep version everywhere.
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
+#![deny(clippy::undocumented_unsafe_blocks)]
+
 use once_cell::sync::Lazy;
-use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
+use prometheus::core::{
+    Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
+};
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
@@ -131,3 +135,137 @@ fn get_rusage_stats() -> libc::rusage {
        rusage.assume_init()
    }
 }
+
+/// Create an [`IntCounterPairVec`] and registers to default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_int_counter_pair_vec {
+    ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) => {{
+        match (
+            $crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES),
+            $crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES),
+        ) {
+            (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPairVec::new(inc, dec)),
+            (Err(e), _) | (_, Err(e)) => Err(e),
+        }
+    }};
+}
+/// Create an [`IntCounterPair`] and registers to default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_int_counter_pair {
+    ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{
+        match (
+            $crate::register_int_counter!($NAME1, $HELP1),
+            $crate::register_int_counter!($NAME2, $HELP2),
+        ) {
+            (Ok(inc), Ok(dec)) => Ok($crate::IntCounterPair::new(inc, dec)),
+            (Err(e), _) | (_, Err(e)) => Err(e),
+        }
+    }};
+}
+
+/// A Pair of [`GenericCounterVec`]s. Like an [`GenericGaugeVec`] but will always observe changes
+pub struct GenericCounterPairVec<P: Atomic> {
+    inc: GenericCounterVec<P>,
+    dec: GenericCounterVec<P>,
+}
+
+/// A Pair of [`GenericCounter`]s. Like an [`GenericGauge`] but will always observe changes
+pub struct GenericCounterPair<P: Atomic> {
+    inc: GenericCounter<P>,
+    dec: GenericCounter<P>,
+}
+
+impl<P: Atomic> GenericCounterPairVec<P> {
+    pub fn new(inc: GenericCounterVec<P>, dec: GenericCounterVec<P>) -> Self {
+        Self { inc, dec }
+    }
+
+    /// `get_metric_with_label_values` returns the [`GenericCounterPair<P>`] for the given slice
+    /// of label values (same order as the VariableLabels in Desc). If that combination of
+    /// label values is accessed for the first time, a new [`GenericCounterPair<P>`] is created.
+    ///
+    /// An error is returned if the number of label values is not the same as the
+    /// number of VariableLabels in Desc.
+    pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
+        Ok(GenericCounterPair {
+            inc: self.inc.get_metric_with_label_values(vals)?,
+            dec: self.dec.get_metric_with_label_values(vals)?,
+        })
+    }
+
+    /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
+    /// occurs.
+    pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
+        self.get_metric_with_label_values(vals).unwrap()
+    }
+}
+
+impl<P: Atomic> GenericCounterPair<P> {
+    pub fn new(inc: GenericCounter<P>, dec: GenericCounter<P>) -> Self {
+        Self { inc, dec }
+    }
+
+    /// Increment the gauge by 1, returning a guard that decrements by 1 on drop.
+    pub fn guard(&self) -> GenericCounterPairGuard<P> {
+        self.inc.inc();
+        GenericCounterPairGuard(self.dec.clone())
+    }
+
+    /// Increment the gauge by n, returning a guard that decrements by n on drop.
+    pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy<P> {
+        self.inc.inc_by(n);
+        GenericCounterPairGuardBy(self.dec.clone(), n)
+    }
+
+    /// Increase the gauge by 1.
+    #[inline]
+    pub fn inc(&self) {
+        self.inc.inc();
+    }
+
+    /// Decrease the gauge by 1.
+    #[inline]
+    pub fn dec(&self) {
+        self.dec.inc();
+    }
+
+    /// Add the given value to the gauge. (The value can be
+    /// negative, resulting in a decrement of the gauge.)
+    #[inline]
+    pub fn inc_by(&self, v: P::T) {
+        self.inc.inc_by(v);
+    }
+
+    /// Subtract the given value from the gauge. (The value can be
+    /// negative, resulting in an increment of the gauge.)
+    #[inline]
+    pub fn dec_by(&self, v: P::T) {
+        self.dec.inc_by(v);
+    }
+}
+
+/// Guard returned by [`GenericCounterPair::guard`]
+pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
+
+impl<P: Atomic> Drop for GenericCounterPairGuard<P> {
+    fn drop(&mut self) {
+        self.0.inc();
+    }
+}
+/// Guard returned by [`GenericCounterPair::guard_by`]
+pub struct GenericCounterPairGuardBy<P: Atomic>(GenericCounter<P>, P::T);
+
+impl<P: Atomic> Drop for GenericCounterPairGuardBy<P> {
+    fn drop(&mut self) {
+        self.0.inc_by(self.1);
+    }
+}
+
+/// A Pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes
+pub type IntCounterPairVec = GenericCounterPairVec<AtomicU64>;
+
+/// A Pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes
+pub type IntCounterPair = GenericCounterPair<AtomicU64>;
+
+/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
+pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -17,5 +17,10 @@ postgres_ffi.workspace = true
 enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
+hex.workspace = true
+thiserror.workspace = true

 workspace_hack.workspace = true
+
+[dev-dependencies]
+bincode.workspace = true
--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -4,7 +4,9 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::NodeId;
+
+use crate::shard::TenantShardId;

 #[derive(Serialize, Deserialize)]
 pub struct ReAttachRequest {
@@ -13,7 +15,7 @@ pub struct ReAttachRequest {

 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub gen: u32,
 }

@@ -24,7 +26,7 @@ pub struct ReAttachResponse {

 #[derive(Serialize, Deserialize)]
 pub struct ValidateRequestTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub gen: u32,
 }

@@ -40,6 +42,6 @@ pub struct ValidateResponse {

 #[derive(Serialize, Deserialize)]
 pub struct ValidateResponseTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub valid: bool,
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -0,0 +1,146 @@
+use anyhow::{bail, Result};
+use byteorder::{ByteOrder, BE};
+use serde::{Deserialize, Serialize};
+use std::fmt;
+
+/// Key used in the Repository kv-store.
+///
+/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
+/// for what we actually store in these fields.
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
+pub struct Key {
+    pub field1: u8,
+    pub field2: u32,
+    pub field3: u32,
+    pub field4: u32,
+    pub field5: u8,
+    pub field6: u32,
+}
+
+pub const KEY_SIZE: usize = 18;
+
+impl Key {
+    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
+    /// As long as Neon does not support tablespace (because of lack of access to local file system),
+    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
+    pub fn to_i128(&self) -> i128 {
+        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
+        (((self.field1 & 0xf) as i128) << 120)
+            | (((self.field2 & 0xFFFF) as i128) << 104)
+            | ((self.field3 as i128) << 72)
+            | ((self.field4 as i128) << 40)
+            | ((self.field5 as i128) << 32)
+            | self.field6 as i128
+    }
+
+    pub const fn from_i128(x: i128) -> Self {
+        Key {
+            field1: ((x >> 120) & 0xf) as u8,
+            field2: ((x >> 104) & 0xFFFF) as u32,
+            field3: (x >> 72) as u32,
+            field4: (x >> 40) as u32,
+            field5: (x >> 32) as u8,
+            field6: x as u32,
+        }
+    }
+
+    pub fn next(&self) -> Key {
+        self.add(1)
+    }
+
+    pub fn add(&self, x: u32) -> Key {
+        let mut key = *self;
+
+        let r = key.field6.overflowing_add(x);
+        key.field6 = r.0;
+        if r.1 {
+            let r = key.field5.overflowing_add(1);
+            key.field5 = r.0;
+            if r.1 {
+                let r = key.field4.overflowing_add(1);
+                key.field4 = r.0;
+                if r.1 {
+                    let r = key.field3.overflowing_add(1);
+                    key.field3 = r.0;
+                    if r.1 {
+                        let r = key.field2.overflowing_add(1);
+                        key.field2 = r.0;
+                        if r.1 {
+                            let r = key.field1.overflowing_add(1);
+                            key.field1 = r.0;
+                            assert!(!r.1);
+                        }
+                    }
+                }
+            }
+        }
+        key
+    }
+
+    pub fn from_slice(b: &[u8]) -> Self {
+        Key {
+            field1: b[0],
+            field2: u32::from_be_bytes(b[1..5].try_into().unwrap()),
+            field3: u32::from_be_bytes(b[5..9].try_into().unwrap()),
+            field4: u32::from_be_bytes(b[9..13].try_into().unwrap()),
+            field5: b[13],
+            field6: u32::from_be_bytes(b[14..18].try_into().unwrap()),
+        }
+    }
+
+    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
+        buf[0] = self.field1;
+        BE::write_u32(&mut buf[1..5], self.field2);
+        BE::write_u32(&mut buf[5..9], self.field3);
+        BE::write_u32(&mut buf[9..13], self.field4);
+        buf[13] = self.field5;
+        BE::write_u32(&mut buf[14..18], self.field6);
+    }
+}
+
+impl fmt::Display for Key {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}",
+            self.field1, self.field2, self.field3, self.field4, self.field5, self.field6
+        )
+    }
+}
+
+impl Key {
+    pub const MIN: Key = Key {
+        field1: u8::MIN,
+        field2: u32::MIN,
+        field3: u32::MIN,
+        field4: u32::MIN,
+        field5: u8::MIN,
+        field6: u32::MIN,
+    };
+    pub const MAX: Key = Key {
+        field1: u8::MAX,
+        field2: u32::MAX,
+        field3: u32::MAX,
+        field4: u32::MAX,
+        field5: u8::MAX,
+        field6: u32::MAX,
+    };
+
+    pub fn from_hex(s: &str) -> Result<Self> {
+        if s.len() != 36 {
+            bail!("parse error");
+        }
+        Ok(Key {
+            field1: u8::from_str_radix(&s[0..2], 16)?,
+            field2: u32::from_str_radix(&s[2..10], 16)?,
+            field3: u32::from_str_radix(&s[10..18], 16)?,
+            field4: u32::from_str_radix(&s[18..26], 16)?,
+            field5: u8::from_str_radix(&s[26..28], 16)?,
+            field6: u32::from_str_radix(&s[28..36], 16)?,
+        })
+    }
+}
+
+pub fn is_rel_block_key(key: &Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0
+}
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,9 +1,13 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
 pub mod control_api;
+pub mod key;
 pub mod models;
 pub mod reltag;
+pub mod shard;

 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -10,13 +10,12 @@ use serde_with::serde_as;
 use strum_macros;
 use utils::{
    completion,
-    generation::Generation,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
 };

-use crate::reltag::RelTag;
+use crate::{reltag::RelTag, shard::TenantShardId};
 use anyhow::bail;
 use bytes::{BufMut, Bytes, BytesMut};

@@ -180,6 +179,8 @@ pub struct TimelineCreateRequest {
    #[serde(default)]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
+    pub existing_initdb_timeline_id: Option<TimelineId>,
+    #[serde(default)]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }
@@ -187,7 +188,7 @@ pub struct TimelineCreateRequest {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
-    pub new_tenant_id: TenantId,
+    pub new_tenant_id: TenantShardId,
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generation: Option<u32>,
@@ -236,6 +237,7 @@ pub struct TenantConfig {
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
    pub gc_feedback: Option<bool>,
+    pub heatmap_period: Option<String>,
 }

 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
@@ -262,10 +264,19 @@ pub struct LocationConfig {
    pub mode: LocationConfigMode,
    /// If attaching, in what generation?
    #[serde(default)]
-    pub generation: Option<Generation>,
+    pub generation: Option<u32>,
    #[serde(default)]
    pub secondary_conf: Option<LocationConfigSecondary>,

+    // Shard parameters: if shard_count is nonzero, then other shard_* fields
+    // must be set accurately.
+    #[serde(default)]
+    pub shard_number: u8,
+    #[serde(default)]
+    pub shard_count: u8,
+    #[serde(default)]
+    pub shard_stripe_size: u32,
+
    // If requesting mode `Secondary`, configuration for that.
    // Custom storage configuration for the tenant, if any
    pub tenant_conf: TenantConfig,
@@ -306,31 +317,14 @@ impl std::ops::Deref for TenantConfigRequest {

 impl TenantConfigRequest {
    pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
-        let config = TenantConfig {
-            checkpoint_distance: None,
-            checkpoint_timeout: None,
-            compaction_target_size: None,
-            compaction_period: None,
-            compaction_threshold: None,
-            gc_horizon: None,
-            gc_period: None,
-            image_creation_threshold: None,
-            pitr_interval: None,
-            walreceiver_connect_timeout: None,
-            lagging_wal_timeout: None,
-            max_lsn_wal_lag: None,
-            trace_read_requests: None,
-            eviction_policy: None,
-            min_resident_size_override: None,
-            evictions_low_residence_duration_metric_threshold: None,
-            gc_feedback: None,
-        };
+        let config = TenantConfig::default();
        TenantConfigRequest { tenant_id, config }
    }
 }

 #[derive(Debug, Deserialize)]
 pub struct TenantAttachRequest {
+    #[serde(default)]
    pub config: TenantAttachConfig,
    #[serde(default)]
    pub generation: Option<u32>,
@@ -338,7 +332,7 @@ pub struct TenantAttachRequest {

 /// Newtype to enforce deny_unknown_fields on TenantConfig for
 /// its usage inside `TenantAttachRequest`.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, Default)]
 #[serde(deny_unknown_fields)]
 pub struct TenantAttachConfig {
    #[serde(flatten)]
@@ -364,7 +358,7 @@ pub enum TenantAttachmentStatus {

 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
-    pub id: TenantId,
+    pub id: TenantShardId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
    /// Sum of the size of all layer files.
@@ -376,7 +370,7 @@ pub struct TenantInfo {
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
-    pub tenant_id: TenantId,
+    pub tenant_id: TenantShardId,
    pub timeline_id: TimelineId,

    pub ancestor_timeline_id: Option<TimelineId>,
@@ -392,7 +386,12 @@ pub struct TimelineInfo {
    /// The LSN that we are advertizing to safekeepers
    pub remote_consistent_lsn_visible: Lsn,

-    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
+    /// The LSN from the start of the root timeline (never changes)
+    pub initdb_lsn: Lsn,
+
+    pub current_logical_size: u64,
+    pub current_logical_size_is_accurate: bool,
+
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
@@ -828,7 +827,7 @@ mod tests {
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
        let original_active = TenantInfo {
-            id: TenantId::generate(),
+            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
@@ -845,7 +844,7 @@ mod tests {
        });

        let original_broken = TenantInfo {
-            id: TenantId::generate(),
+            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -0,0 +1,762 @@
+use std::{ops::RangeInclusive, str::FromStr};
+
+use crate::key::{is_rel_block_key, Key};
+use hex::FromHex;
+use serde::{Deserialize, Serialize};
+use thiserror;
+use utils::id::TenantId;
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardNumber(pub u8);
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardCount(pub u8);
+
+impl ShardCount {
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+impl ShardNumber {
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+/// TenantShardId identify the units of work for the Pageserver.
+///
+/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
+///
+///   # The second shard in a two-shard tenant
+///   072f1291a5310026820b2fe4b2968934-0102
+///
+/// Historically, tenants could not have multiple shards, and were identified
+/// by TenantId.  To support this, TenantShardId has a special legacy
+/// mode where `shard_count` is equal to zero: this represents a single-sharded
+/// tenant which should be written as a TenantId with no suffix.
+///
+/// The human-readable encoding of TenantShardId, such as used in API URLs,
+/// is both forward and backward compatible: a legacy TenantId can be
+/// decoded as a TenantShardId, and when re-encoded it will be parseable
+/// as a TenantId.
+///
+/// Note that the binary encoding is _not_ backward compatible, because
+/// at the time sharding is introduced, there are no existing binary structures
+/// containing TenantId that we need to handle.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct TenantShardId {
+    pub tenant_id: TenantId,
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl TenantShardId {
+    pub fn unsharded(tenant_id: TenantId) -> Self {
+        Self {
+            tenant_id,
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
+    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
+    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        RangeInclusive::new(
+            Self {
+                tenant_id,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            },
+            Self {
+                tenant_id,
+                shard_number: ShardNumber::MAX,
+                shard_count: ShardCount::MAX,
+            },
+        )
+    }
+
+    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
+        ShardSlug(self)
+    }
+
+    /// Convenience for code that has special behavior on the 0th shard.
+    pub fn is_zero(&self) -> bool {
+        self.shard_number == ShardNumber(0)
+    }
+}
+
+/// Formatting helper
+struct ShardSlug<'a>(&'a TenantShardId);
+
+impl<'a> std::fmt::Display for ShardSlug<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{:02x}{:02x}",
+            self.0.shard_number.0, self.0.shard_count.0
+        )
+    }
+}
+
+impl std::fmt::Display for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.shard_count != ShardCount(0) {
+            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
+        } else {
+            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
+            // is distinct from the normal single shard case (shard count == 1).
+            self.tenant_id.fmt(f)
+        }
+    }
+}
+
+impl std::fmt::Debug for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for TenantShardId {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
+        if s.len() == 32 {
+            // Legacy case: no shard specified
+            Ok(Self {
+                tenant_id: TenantId::from_str(s)?,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            })
+        } else if s.len() == 37 {
+            let bytes = s.as_bytes();
+            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
+            Ok(Self {
+                tenant_id,
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 18]> for TenantShardId {
+    fn from(b: [u8; 18]) -> Self {
+        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
+
+        Self {
+            tenant_id: TenantId::from(tenant_id_bytes),
+            shard_number: ShardNumber(b[16]),
+            shard_count: ShardCount(b[17]),
+        }
+    }
+}
+
+/// For use within the context of a particular tenant, when we need to know which
+/// shard we're dealing with, but do not need to know the full ShardIdentity (because
+/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
+/// TenantShardId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
+pub struct ShardIndex {
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl ShardIndex {
+    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            shard_number: number,
+            shard_count: count,
+        }
+    }
+    pub fn unsharded() -> Self {
+        Self {
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
+    }
+
+    /// For use in constructing remote storage paths: concatenate this with a TenantId
+    /// to get a fully qualified TenantShardId.
+    ///
+    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
+    /// that the legacy pre-sharding remote key format is preserved.
+    pub fn get_suffix(&self) -> String {
+        if self.is_unsharded() {
+            "".to_string()
+        } else {
+            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+        }
+    }
+}
+
+impl std::fmt::Display for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+    }
+}
+
+impl std::fmt::Debug for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for ShardIndex {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 1 byte shard number, 1 byte shard count
+        if s.len() == 4 {
+            let bytes = s.as_bytes();
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(bytes, &mut shard_parts)?;
+            Ok(Self {
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 2]> for ShardIndex {
+    fn from(b: [u8; 2]) -> Self {
+        Self {
+            shard_number: ShardNumber(b[0]),
+            shard_count: ShardCount(b[1]),
+        }
+    }
+}
+
+impl Serialize for TenantShardId {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            let mut packed: [u8; 18] = [0; 18];
+            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
+            packed[16] = self.shard_number.0;
+            packed[17] = self.shard_count.0;
+
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for TenantShardId {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = TenantShardId;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 18])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 18] = Deserialize::deserialize(s)?;
+                Ok(TenantShardId::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                TenantShardId::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                18,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
+/// Stripe size in number of pages
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+pub struct ShardStripeSize(pub u32);
+
+/// Layout version: for future upgrades where we might change how the key->shard mapping works
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+pub struct ShardLayout(u8);
+
+const LAYOUT_V1: ShardLayout = ShardLayout(1);
+/// ShardIdentity uses a magic layout value to indicate if it is unusable
+const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
+
+/// Default stripe size in pages: 256MiB divided by 8kiB page size.
+const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
+
+/// The ShardIdentity contains the information needed for one member of map
+/// to resolve a key to a shard, and then check whether that shard is ==self.
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+pub struct ShardIdentity {
+    pub number: ShardNumber,
+    pub count: ShardCount,
+    stripe_size: ShardStripeSize,
+    layout: ShardLayout,
+}
+
+#[derive(thiserror::Error, Debug, PartialEq, Eq)]
+pub enum ShardConfigError {
+    #[error("Invalid shard count")]
+    InvalidCount,
+    #[error("Invalid shard number")]
+    InvalidNumber,
+    #[error("Invalid stripe size")]
+    InvalidStripeSize,
+}
+
+impl ShardIdentity {
+    /// An identity with number=0 count=0 is a "none" identity, which represents legacy
+    /// tenants.  Modern single-shard tenants should not use this: they should
+    /// have number=0 count=1.
+    pub fn unsharded() -> Self {
+        Self {
+            number: ShardNumber(0),
+            count: ShardCount(0),
+            layout: LAYOUT_V1,
+            stripe_size: DEFAULT_STRIPE_SIZE,
+        }
+    }
+
+    /// A broken instance of this type is only used for `TenantState::Broken` tenants,
+    /// which are constructed in code paths that don't have access to proper configuration.
+    ///
+    /// A ShardIdentity in this state may not be used for anything, and should not be persisted.
+    /// Enforcement is via assertions, to avoid making our interface fallible for this
+    /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
+    /// state, and by extension to avoid trying to do any page->shard resolution.
+    pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            number,
+            count,
+            layout: LAYOUT_BROKEN,
+            stripe_size: DEFAULT_STRIPE_SIZE,
+        }
+    }
+
+    pub fn is_unsharded(&self) -> bool {
+        self.number == ShardNumber(0) && self.count == ShardCount(0)
+    }
+
+    /// Count must be nonzero, and number must be < count. To construct
+    /// the legacy case (count==0), use Self::unsharded instead.
+    pub fn new(
+        number: ShardNumber,
+        count: ShardCount,
+        stripe_size: ShardStripeSize,
+    ) -> Result<Self, ShardConfigError> {
+        if count.0 == 0 {
+            Err(ShardConfigError::InvalidCount)
+        } else if number.0 > count.0 - 1 {
+            Err(ShardConfigError::InvalidNumber)
+        } else if stripe_size.0 == 0 {
+            Err(ShardConfigError::InvalidStripeSize)
+        } else {
+            Ok(Self {
+                number,
+                count,
+                layout: LAYOUT_V1,
+                stripe_size,
+            })
+        }
+    }
+
+    fn is_broken(&self) -> bool {
+        self.layout == LAYOUT_BROKEN
+    }
+
+    pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
+        assert!(!self.is_broken());
+        key_to_shard_number(self.count, self.stripe_size, key)
+    }
+
+    /// Return true if the key should be ingested by this shard
+    pub fn is_key_local(&self, key: &Key) -> bool {
+        assert!(!self.is_broken());
+        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
+            true
+        } else {
+            key_to_shard_number(self.count, self.stripe_size, key) == self.number
+        }
+    }
+
+    pub fn shard_slug(&self) -> String {
+        if self.count > ShardCount(0) {
+            format!("-{:02x}{:02x}", self.number.0, self.count.0)
+        } else {
+            String::new()
+        }
+    }
+
+    /// Convenience for checking if this identity is the 0th shard in a tenant,
+    /// for special cases on shard 0 such as ingesting relation sizes.
+    pub fn is_zero(&self) -> bool {
+        self.number == ShardNumber(0)
+    }
+}
+
+impl Serialize for ShardIndex {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            // Binary encoding is not used in index_part.json, but is included in anticipation of
+            // switching various structures (e.g. inter-process communication, remote metadata) to more
+            // compact binary encodings in future.
+            let mut packed: [u8; 2] = [0; 2];
+            packed[0] = self.shard_number.0;
+            packed[1] = self.shard_count.0;
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for ShardIndex {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = ShardIndex;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 2])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 2] = Deserialize::deserialize(s)?;
+                Ok(ShardIndex::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                ShardIndex::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                2,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
+/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
+/// in order to be able to serve basebackup requests without peer communication).
+fn key_is_shard0(key: &Key) -> bool {
+    // To decide what to shard out to shards >0, we apply a simple rule that only
+    // relation pages are distributed to shards other than shard zero. Everything else gets
+    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
+    // requests, and any request other than those for particular blocks in relations.
+    //
+    // In this condition:
+    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
+    // all metadata.
+    // - field6 is set to -1 for relation size pages.
+    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
+}
+
+/// Provide the same result as the function in postgres `hashfn.h` with the same name
+fn murmurhash32(mut h: u32) -> u32 {
+    h ^= h >> 16;
+    h = h.wrapping_mul(0x85ebca6b);
+    h ^= h >> 13;
+    h = h.wrapping_mul(0xc2b2ae35);
+    h ^= h >> 16;
+    h
+}
+
+/// Provide the same result as the function in postgres `hashfn.h` with the same name
+fn hash_combine(mut a: u32, mut b: u32) -> u32 {
+    b = b.wrapping_add(0x9e3779b9);
+    b = b.wrapping_add(a << 6);
+    b = b.wrapping_add(a >> 2);
+
+    a ^= b;
+    a
+}
+
+/// Where a Key is to be distributed across shards, select the shard.  This function
+/// does not account for keys that should be broadcast across shards.
+///
+/// The hashing in this function must exactly match what we do in postgres smgr
+/// code.  The resulting distribution of pages is intended to preserve locality within
+/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
+/// distributing data pseudo-randomly.
+///
+/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
+/// and will be handled at higher levels when shards are split.
+fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
+    // Fast path for un-sharded tenants or broadcast keys
+    if count < ShardCount(2) || key_is_shard0(key) {
+        return ShardNumber(0);
+    }
+
+    // relNode
+    let mut hash = murmurhash32(key.field4);
+    // blockNum/stripe size
+    hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));
+
+    ShardNumber((hash % count.0 as u32) as u8)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::str::FromStr;
+
+    use bincode;
+    use utils::{id::TenantId, Hex};
+
+    use super::*;
+
+    const EXAMPLE_TENANT_ID: &str = "1f359dd625e519a1a4e8d7509690f6fc";
+
+    #[test]
+    fn tenant_shard_id_string() -> Result<(), hex::FromHexError> {
+        let example = TenantShardId {
+            tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
+            shard_count: ShardCount(10),
+            shard_number: ShardNumber(7),
+        };
+
+        let encoded = format!("{example}");
+
+        let expected = format!("{EXAMPLE_TENANT_ID}-070a");
+        assert_eq!(&encoded, &expected);
+
+        let decoded = TenantShardId::from_str(&encoded)?;
+
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_binary() -> Result<(), hex::FromHexError> {
+        let example = TenantShardId {
+            tenant_id: TenantId::from_str(EXAMPLE_TENANT_ID).unwrap(),
+            shard_count: ShardCount(10),
+            shard_number: ShardNumber(7),
+        };
+
+        let encoded = bincode::serialize(&example).unwrap();
+        let expected: [u8; 18] = [
+            0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
+            0xf6, 0xfc, 0x07, 0x0a,
+        ];
+        assert_eq!(Hex(&encoded), Hex(&expected));
+
+        let decoded = bincode::deserialize(&encoded).unwrap();
+
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_backward_compat() -> Result<(), hex::FromHexError> {
+        // Test that TenantShardId can decode a TenantId in human
+        // readable form
+        let example = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
+        let encoded = format!("{example}");
+
+        assert_eq!(&encoded, EXAMPLE_TENANT_ID);
+
+        let decoded = TenantShardId::from_str(&encoded)?;
+
+        assert_eq!(example, decoded.tenant_id);
+        assert_eq!(decoded.shard_count, ShardCount(0));
+        assert_eq!(decoded.shard_number, ShardNumber(0));
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_forward_compat() -> Result<(), hex::FromHexError> {
+        // Test that a legacy TenantShardId encodes into a form that
+        // can be decoded as TenantId
+        let example_tenant_id = TenantId::from_str(EXAMPLE_TENANT_ID).unwrap();
+        let example = TenantShardId::unsharded(example_tenant_id);
+        let encoded = format!("{example}");
+
+        assert_eq!(&encoded, EXAMPLE_TENANT_ID);
+
+        let decoded = TenantId::from_str(&encoded)?;
+
+        assert_eq!(example_tenant_id, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn tenant_shard_id_legacy_binary() -> Result<(), hex::FromHexError> {
+        // Unlike in human readable encoding, binary encoding does not
+        // do any special handling of legacy unsharded TenantIds: this test
+        // is equivalent to the main test for binary encoding, just verifying
+        // that the same behavior applies when we have used `unsharded()` to
+        // construct a TenantShardId.
+        let example = TenantShardId::unsharded(TenantId::from_str(EXAMPLE_TENANT_ID).unwrap());
+        let encoded = bincode::serialize(&example).unwrap();
+
+        let expected: [u8; 18] = [
+            0x1f, 0x35, 0x9d, 0xd6, 0x25, 0xe5, 0x19, 0xa1, 0xa4, 0xe8, 0xd7, 0x50, 0x96, 0x90,
+            0xf6, 0xfc, 0x00, 0x00,
+        ];
+        assert_eq!(Hex(&encoded), Hex(&expected));
+
+        let decoded = bincode::deserialize::<TenantShardId>(&encoded).unwrap();
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    #[test]
+    fn shard_identity_validation() -> Result<(), ShardConfigError> {
+        // Happy cases
+        ShardIdentity::new(ShardNumber(0), ShardCount(1), DEFAULT_STRIPE_SIZE)?;
+        ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(1))?;
+        ShardIdentity::new(ShardNumber(254), ShardCount(255), ShardStripeSize(1))?;
+
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(0), ShardCount(0), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidCount)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(10), ShardCount(10), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidNumber)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(11), ShardCount(10), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidNumber)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(255), ShardCount(255), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidNumber)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(0)),
+            Err(ShardConfigError::InvalidStripeSize)
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn shard_index_human_encoding() -> Result<(), hex::FromHexError> {
+        let example = ShardIndex {
+            shard_number: ShardNumber(13),
+            shard_count: ShardCount(17),
+        };
+        let expected: String = "0d11".to_string();
+        let encoded = format!("{example}");
+        assert_eq!(&encoded, &expected);
+
+        let decoded = ShardIndex::from_str(&encoded)?;
+        assert_eq!(example, decoded);
+        Ok(())
+    }
+
+    #[test]
+    fn shard_index_binary_encoding() -> Result<(), hex::FromHexError> {
+        let example = ShardIndex {
+            shard_number: ShardNumber(13),
+            shard_count: ShardCount(17),
+        };
+        let expected: [u8; 2] = [0x0d, 0x11];
+
+        let encoded = bincode::serialize(&example).unwrap();
+        assert_eq!(Hex(&encoded), Hex(&expected));
+        let decoded = bincode::deserialize(&encoded).unwrap();
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    // These are only smoke tests to spot check that our implementation doesn't
+    // deviate from a few examples values: not aiming to validate the overall
+    // hashing algorithm.
+    #[test]
+    fn murmur_hash() {
+        assert_eq!(murmurhash32(0), 0);
+
+        assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
+    }
+
+    #[test]
+    fn shard_mapping() {
+        let key = Key {
+            field1: 0x00,
+            field2: 0x67f,
+            field3: 0x5,
+            field4: 0x400c,
+            field5: 0x00,
+            field6: 0x7d06,
+        };
+
+        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
+        assert_eq!(shard, ShardNumber(8));
+    }
+}
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -2,6 +2,8 @@
 //! To use, create PostgresBackend and run() it, passing the Handler
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
 use futures::pin_mut;
@@ -15,7 +17,7 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
-use tracing::{debug, error, info, trace};
+use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
 use pq_proto::{
@@ -33,6 +35,11 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
+    /// Authentication failure
+    #[error("Unauthorized: {0}")]
+    Unauthorized(std::borrow::Cow<'static, str>),
+    #[error("Simulated Connection Error")]
+    SimulatedConnectionError,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
@@ -47,8 +54,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
+            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -608,7 +616,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                    if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
                        self.write_message_noflush(&BeMessage::ErrorResponse(
-                            &e.to_string(),
+                            &short_error(&e),
                            Some(e.pg_error_code()),
                        ))?;
                        return Err(e);
@@ -728,12 +736,20 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                trace!("got query {query_string:?}");
                if let Err(e) = handler.process_query(self, query_string).await {
-                    log_query_error(query_string, &e);
-                    let short_error = short_error(&e);
-                    self.write_message_noflush(&BeMessage::ErrorResponse(
-                        &short_error,
-                        Some(e.pg_error_code()),
-                    ))?;
+                    match e {
+                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
+                        QueryError::SimulatedConnectionError => {
+                            return Err(QueryError::SimulatedConnectionError)
+                        }
+                        e => {
+                            log_query_error(query_string, &e);
+                            let short_error = short_error(&e);
+                            self.write_message_noflush(&BeMessage::ErrorResponse(
+                                &short_error,
+                                Some(e.pg_error_code()),
+                            ))?;
+                        }
+                    }
                }
                self.write_message_noflush(&BeMessage::ReadyForQuery)?;
            }
@@ -959,6 +975,8 @@ pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
+        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
+        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
    }
 }
@@ -975,9 +993,15 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::Disconnected(other_connection_error) => {
            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
        }
+        QueryError::SimulatedConnectionError => {
+            error!("query handler for query '{query}' failed due to a simulated connection error")
+        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
+        QueryError::Unauthorized(e) => {
+            warn!("query handler for '{query}' failed with authentication error: {e}");
+        }
        QueryError::Other(e) => {
            error!("query handler for '{query}' failed: {e:?}");
        }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::{bail, Context};
 use itertools::Itertools;
 use std::borrow::Cow;
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,6 +8,7 @@
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
 #![allow(clippy::duplicate_mod)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use bytes::Bytes;
 use utils::bin_ser::SerializeError;
@@ -20,6 +21,7 @@ macro_rules! postgres_ffi {
            pub mod bindings {
                // bindgen generates bindings for a lot of stuff we don't need
                #![allow(dead_code)]
+                #![allow(clippy::undocumented_unsafe_blocks)]

                use serde::{Deserialize, Serialize};
                include!(concat!(
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -1,6 +1,7 @@
 //! Postgres protocol messages serialization-deserialization. See
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod framed;

@@ -288,10 +289,10 @@ impl FeStartupPacket {
        // We shouldn't advance `buf` as probably full message is not there yet,
        // so can't directly use Bytes::get_u32 etc.
        let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
-        // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
+        // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
        // which is less readable
        #[allow(clippy::manual_range_contains)]
-        if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
+        if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
            return Err(ProtocolError::Protocol(format!(
                "invalid startup packet message length {}",
                len
@@ -974,4 +975,10 @@ mod tests {
        let params = make_params("foo\\ bar \\ \\\\ baz\\  lol");
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }
+
+    #[test]
+    fn parse_fe_startup_packet_regression() {
+        let data = [0, 0, 0, 7, 0, 0, 0, 0];
+        FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
+    }
 }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -8,18 +8,19 @@ license.workspace = true
 anyhow.workspace = true
 async-trait.workspace = true
 once_cell.workspace = true
-aws-smithy-http.workspace = true
-aws-types.workspace = true
+aws-smithy-async.workspace = true
+aws-smithy-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
 bytes.workspace = true
 camino.workspace = true
 hyper = { workspace = true, features = ["stream"] }
+futures.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
-tokio-util.workspace = true
+tokio-util = { workspace = true, features = ["compat"] }
 toml_edit.workspace = true
 tracing.workspace = true
 scopeguard.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -1,24 +1,24 @@
 //! Azure Blob Storage wrapper

+use std::borrow::Cow;
+use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
+use std::pin::Pin;
 use std::sync::Arc;
-use std::{borrow::Cow, collections::HashMap, io::Cursor};

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
-use azure_core::Header;
+use azure_core::RetryOptions;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::prelude::ClientBuilder;
-use azure_storage_blobs::{
-    blob::operations::GetBlobBuilder,
-    prelude::{BlobClient, ContainerClient},
-};
+use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
+use bytes::Bytes;
+use futures::stream::Stream;
 use futures_util::StreamExt;
 use http_types::StatusCode;
-use tokio::io::AsyncRead;
 use tracing::debug;

 use crate::s3_bucket::RequestKind;
@@ -52,7 +52,8 @@ impl AzureBlobStorage {
            StorageCredentials::token_credential(Arc::new(token_credential))
        };

-        let builder = ClientBuilder::new(account, credentials);
+        // we have an outer retry
+        let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());

        let client = builder.container_client(azure_config.container_name.to_owned());

@@ -112,47 +113,32 @@ impl AzureBlobStorage {

    async fn download_for_builder(
        &self,
-        metadata: StorageMetadata,
        builder: GetBlobBuilder,
    ) -> Result<Download, DownloadError> {
        let mut response = builder.into_stream();

+        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::new();
+
+        let mut bufs = Vec::new();
        while let Some(part) = response.next().await {
            let part = part.map_err(to_download_error)?;
+            if let Some(blob_meta) = part.blob.metadata {
+                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
+            }
            let data = part
                .data
                .collect()
                .await
                .map_err(|e| DownloadError::Other(e.into()))?;
-            buf.extend_from_slice(&data.slice(..));
+            bufs.push(data);
        }
        Ok(Download {
-            download_stream: Box::pin(Cursor::new(buf)),
-            metadata: Some(metadata),
+            download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
+            metadata: Some(StorageMetadata(metadata)),
        })
    }
-    // TODO get rid of this function once we have metadata included in the response
-    // https://github.com/Azure/azure-sdk-for-rust/issues/1439
-    async fn get_metadata(
-        &self,
-        blob_client: &BlobClient,
-    ) -> Result<StorageMetadata, DownloadError> {
-        let builder = blob_client.get_metadata();
-
-        let response = builder.into_future().await.map_err(to_download_error)?;
-        let mut map = HashMap::new();
-
-        for md in response.metadata.iter() {
-            map.insert(
-                md.name().as_str().to_string(),
-                md.value().as_str().to_string(),
-            );
-        }
-        Ok(StorageMetadata(map))
-    }

    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
        self.concurrency_limiter
@@ -236,9 +222,10 @@ impl RemoteStorage for AzureBlobStorage {
        }
        Ok(res)
    }
+
    async fn upload(
        &self,
-        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -246,13 +233,12 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Put).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(to));

-        // TODO FIX THIS UGLY HACK and don't buffer the entire object
-        // into RAM here, but use the streaming interface. For that,
-        // we'd have to change the interface though...
-        // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::with_capacity(data_size_bytes);
-        tokio::io::copy(&mut from, &mut buf).await?;
-        let body = azure_core::Body::Bytes(buf.into());
+        let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
+            Box::pin(from);
+
+        let from = NonSeekableStream::new(from, data_size_bytes);
+
+        let body = azure_core::Body::SeekableStream(Box::new(from));

        let mut builder = blob_client.put_block_blob(body);

@@ -269,11 +255,9 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

-        let metadata = self.get_metadata(&blob_client).await?;
-
        let builder = blob_client.get();

-        self.download_for_builder(metadata, builder).await
+        self.download_for_builder(builder).await
    }

    async fn download_byte_range(
@@ -285,23 +269,16 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

-        let metadata = self.get_metadata(&blob_client).await?;
-
        let mut builder = blob_client.get();

-        if let Some(end_exclusive) = end_exclusive {
-            builder = builder.range(Range::new(start_inclusive, end_exclusive));
+        let range: Range = if let Some(end_exclusive) = end_exclusive {
+            (start_inclusive..end_exclusive).into()
        } else {
-            // Open ranges are not supported by the SDK so we work around
-            // by setting the upper limit extremely high (but high enough
-            // to still be representable by signed 64 bit integers).
-            // TODO remove workaround once the SDK adds open range support
-            // https://github.com/Azure/azure-sdk-for-rust/issues/1438
-            let end_exclusive = u64::MAX / 4;
-            builder = builder.range(Range::new(start_inclusive, end_exclusive));
-        }
+            (start_inclusive..).into()
+        };
+        builder = builder.range(range);

-        self.download_for_builder(metadata, builder).await
+        self.download_for_builder(builder).await
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
@@ -335,3 +312,153 @@ impl RemoteStorage for AzureBlobStorage {
        Ok(())
    }
 }
+
+pin_project_lite::pin_project! {
+    /// Hack to work around not being able to stream once with azure sdk.
+    ///
+    /// Azure sdk clones streams around with the assumption that they are like
+    /// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
+    /// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`]
+    /// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
+    /// seekable, but we can also just re-try the request easier.
+    #[project = NonSeekableStreamProj]
+    enum NonSeekableStream<S> {
+        /// A stream wrappers initial form.
+        ///
+        /// Mutex exists to allow moving when cloning. If the sdk changes to do less than 1
+        /// clone before first request, then this must be changed.
+        Initial {
+            inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
+            len: usize,
+        },
+        /// The actually readable variant, produced by cloning the Initial variant.
+        ///
+        /// The sdk currently always clones once, even without retry policy.
+        Actual {
+            #[pin]
+            inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
+            len: usize,
+            read_any: bool,
+        },
+        /// Most likely unneeded, but left to make life easier, in case more clones are added.
+        Cloned {
+            len_was: usize,
+        }
+    }
+}
+
+impl<S> NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+{
+    fn new(inner: S, len: usize) -> NonSeekableStream<S> {
+        use tokio_util::compat::TokioAsyncReadCompatExt;
+
+        let inner = tokio_util::io::StreamReader::new(inner).compat();
+        let inner = Some(inner);
+        let inner = std::sync::Mutex::new(inner);
+        NonSeekableStream::Initial { inner, len }
+    }
+}
+
+impl<S> std::fmt::Debug for NonSeekableStream<S> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(),
+            Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(),
+            Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(),
+        }
+    }
+}
+
+impl<S> futures::io::AsyncRead for NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>>,
+{
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut [u8],
+    ) -> std::task::Poll<std::io::Result<usize>> {
+        match self.project() {
+            NonSeekableStreamProj::Actual {
+                inner, read_any, ..
+            } => {
+                *read_any = true;
+                inner.poll_read(cx, buf)
+            }
+            // NonSeekableStream::Initial does not support reading because it is just much easier
+            // to have the mutex in place where one does not poll the contents, or that's how it
+            // seemed originally. If there is a version upgrade which changes the cloning, then
+            // that support needs to be hacked in.
+            //
+            // including {self:?} into the message would be useful, but unsure how to unproject.
+            _ => std::task::Poll::Ready(Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "cloned or initial values cannot be read",
+            ))),
+        }
+    }
+}
+
+impl<S> Clone for NonSeekableStream<S> {
+    /// Weird clone implementation exists to support the sdk doing cloning before issuing the first
+    /// request, see type documentation.
+    fn clone(&self) -> Self {
+        use NonSeekableStream::*;
+
+        match self {
+            Initial { inner, len } => {
+                if let Some(inner) = inner.lock().unwrap().take() {
+                    Actual {
+                        inner,
+                        len: *len,
+                        read_any: false,
+                    }
+                } else {
+                    Self::Cloned { len_was: *len }
+                }
+            }
+            Actual { len, .. } => Cloned { len_was: *len },
+            Cloned { len_was } => Cloned { len_was: *len_was },
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl<S> azure_core::SeekableStream for NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
+{
+    async fn reset(&mut self) -> azure_core::error::Result<()> {
+        use NonSeekableStream::*;
+
+        let msg = match self {
+            Initial { inner, .. } => {
+                if inner.get_mut().unwrap().is_some() {
+                    return Ok(());
+                } else {
+                    "reset after first clone is not supported"
+                }
+            }
+            Actual { read_any, .. } if !*read_any => return Ok(()),
+            Actual { .. } => "reset after reading is not supported",
+            Cloned { .. } => "reset after second clone is not supported",
+        };
+        Err(azure_core::error::Error::new(
+            azure_core::error::ErrorKind::Io,
+            std::io::Error::new(std::io::ErrorKind::Other, msg),
+        ))
+    }
+
+    // Note: it is not documented if this should be the total or remaining length, total passes the
+    // tests.
+    fn len(&self) -> usize {
+        use NonSeekableStream::*;
+        match self {
+            Initial { len, .. } => *len,
+            Actual { len, .. } => *len,
+            Cloned { len_was, .. } => *len_was,
+        }
+    }
+}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -6,25 +6,23 @@
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
 //!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;

-use std::{
-    collections::HashMap,
-    fmt::Debug,
-    num::{NonZeroU32, NonZeroUsize},
-    pin::Pin,
-    sync::Arc,
-};
+use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};

 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};

+use bytes::Bytes;
+use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
-use tokio::{io, sync::Semaphore};
+use tokio::sync::Semaphore;
 use toml_edit::Item;
 use tracing::info;

@@ -34,12 +32,6 @@ pub use self::{
 };
 use s3_bucket::RequestKind;

-/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
-/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
-/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
-/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
-pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
-pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -189,7 +181,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        data_size_bytes: usize,
@@ -216,7 +208,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
 }

 pub struct Download {
-    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
+    pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
 }
@@ -310,7 +302,7 @@ impl GenericRemoteStorage {

    pub async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -408,7 +400,7 @@ impl GenericRemoteStorage {
    /// this path is used for the remote object id conversion only.
    pub async fn upload_storage_object(
        &self,
-        from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
    ) -> anyhow::Result<()> {
@@ -441,10 +433,6 @@ pub struct StorageMetadata(HashMap<String, String>);
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
-    /// Max allowed number of concurrent sync operations between the API user and the remote storage.
-    pub max_concurrent_syncs: NonZeroUsize,
-    /// Max allowed errors before the sync task is considered failed and evicted.
-    pub max_sync_errors: NonZeroU32,
    /// The storage connection configuration.
    pub storage: RemoteStorageKind,
 }
@@ -540,18 +528,6 @@ impl RemoteStorageConfig {

        let use_azure = container_name.is_some() && container_region.is_some();

-        let max_concurrent_syncs = NonZeroUsize::new(
-            parse_optional_integer("max_concurrent_syncs", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
-        )
-        .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
-
-        let max_sync_errors = NonZeroU32::new(
-            parse_optional_integer("max_sync_errors", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
-        )
-        .context("Failed to parse 'max_sync_errors' as a positive integer")?;
-
        let default_concurrency_limit = if use_azure {
            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
        } else {
@@ -633,11 +609,7 @@ impl RemoteStorageConfig {
            }
        };

-        Ok(Some(RemoteStorageConfig {
-            max_concurrent_syncs,
-            max_sync_errors,
-            storage,
-        }))
+        Ok(Some(RemoteStorageConfig { storage }))
    }
 }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -7,11 +7,14 @@
 use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};

 use anyhow::{bail, ensure, Context};
+use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
+use futures::stream::Stream;
 use tokio::{
    fs,
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
+use tokio_util::io::ReaderStream;
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

@@ -99,27 +102,35 @@ impl LocalFs {
        };

        // If we were given a directory, we may use it as our starting point.
-        // Otherwise, we must go up to the parent directory.  This is because
+        // Otherwise, we must go up to the first ancestor dir that exists.  This is because
        // S3 object list prefixes can be arbitrary strings, but when reading
        // the local filesystem we need a directory to start calling read_dir on.
        let mut initial_dir = full_path.clone();
-        match fs::metadata(full_path.clone()).await {
-            Ok(meta) => {
-                if !meta.is_dir() {
+        loop {
+            // Did we make it to the root?
+            if initial_dir.parent().is_none() {
+                anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
+            }
+
+            match fs::metadata(initial_dir.clone()).await {
+                Ok(meta) if meta.is_dir() => {
+                    // We found a directory, break
+                    break;
+                }
+                Ok(_meta) => {
                    // It's not a directory: strip back to the parent
                    initial_dir.pop();
                }
-            }
-            Err(e) if e.kind() == ErrorKind::NotFound => {
-                // It's not a file that exists: strip the prefix back to the parent directory
-                initial_dir.pop();
-            }
-            Err(e) => {
-                // Unexpected I/O error
-                anyhow::bail!(e)
+                Err(e) if e.kind() == ErrorKind::NotFound => {
+                    // It's not a file that exists: strip the prefix back to the parent directory
+                    initial_dir.pop();
+                }
+                Err(e) => {
+                    // Unexpected I/O error
+                    anyhow::bail!(e)
+                }
            }
        }
-
        // Note that Utf8PathBuf starts_with only considers full path segments, but
        // object prefixes are arbitrary strings, so we need the strings for doing
        // starts_with later.
@@ -211,7 +222,7 @@ impl RemoteStorage for LocalFs {

    async fn upload(
        &self,
-        data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -244,9 +255,12 @@ impl RemoteStorage for LocalFs {
        );

        let from_size_bytes = data_size_bytes as u64;
+        let data = tokio_util::io::StreamReader::new(data);
+        let data = std::pin::pin!(data);
        let mut buffer_to_read = data.take(from_size_bytes);

-        let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
+        // alternatively we could just write the bytes to a file, but local_fs is a testing utility
+        let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
            .await
            .with_context(|| {
                format!(
@@ -300,7 +314,7 @@ impl RemoteStorage for LocalFs {
    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let source = io::BufReader::new(
+            let source = ReaderStream::new(
                fs::OpenOptions::new()
                    .read(true)
                    .open(&target_path)
@@ -340,16 +354,14 @@ impl RemoteStorage for LocalFs {
        }
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let mut source = io::BufReader::new(
-                fs::OpenOptions::new()
-                    .read(true)
-                    .open(&target_path)
-                    .await
-                    .with_context(|| {
-                        format!("Failed to open source file {target_path:?} to use in the download")
-                    })
-                    .map_err(DownloadError::Other)?,
-            );
+            let mut source = tokio::fs::OpenOptions::new()
+                .read(true)
+                .open(&target_path)
+                .await
+                .with_context(|| {
+                    format!("Failed to open source file {target_path:?} to use in the download")
+                })
+                .map_err(DownloadError::Other)?;
            source
                .seek(io::SeekFrom::Start(start_inclusive))
                .await
@@ -363,11 +375,13 @@ impl RemoteStorage for LocalFs {
            Ok(match end_exclusive {
                Some(end_exclusive) => Download {
                    metadata,
-                    download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
+                    download_stream: Box::pin(ReaderStream::new(
+                        source.take(end_exclusive - start_inclusive),
+                    )),
                },
                None => Download {
                    metadata,
-                    download_stream: Box::pin(source),
+                    download_stream: Box::pin(ReaderStream::new(source)),
                },
            })
        } else {
@@ -467,7 +481,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
 mod fs_tests {
    use super::*;

+    use bytes::Bytes;
    use camino_tempfile::tempdir;
+    use futures_util::Stream;
    use std::{collections::HashMap, io::Write};

    async fn read_and_assert_remote_file_contents(
@@ -477,7 +493,7 @@ mod fs_tests {
        remote_storage_path: &RemotePath,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
-        let mut download = storage
+        let download = storage
            .download(remote_storage_path)
            .await
            .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
@@ -486,13 +502,9 @@ mod fs_tests {
            "Unexpected metadata returned for the downloaded file"
        );

-        let mut contents = String::new();
-        download
-            .download_stream
-            .read_to_string(&mut contents)
-            .await
-            .context("Failed to read remote file contents into string")?;
-        Ok(contents)
+        let contents = aggregate(download.download_stream).await?;
+
+        String::from_utf8(contents).map_err(anyhow::Error::new)
    }

    #[tokio::test]
@@ -521,25 +533,26 @@ mod fs_tests {
        let storage = create_storage()?;

        let id = RemotePath::new(Utf8Path::new("dummy"))?;
-        let content = std::io::Cursor::new(b"12345");
+        let content = Bytes::from_static(b"12345");
+        let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));

        // Check that you get an error if the size parameter doesn't match the actual
        // size of the stream.
        storage
-            .upload(Box::new(content.clone()), 0, &id, None)
+            .upload(content(), 0, &id, None)
            .await
            .expect_err("upload with zero size succeeded");
        storage
-            .upload(Box::new(content.clone()), 4, &id, None)
+            .upload(content(), 4, &id, None)
            .await
            .expect_err("upload with too short size succeeded");
        storage
-            .upload(Box::new(content.clone()), 6, &id, None)
+            .upload(content(), 6, &id, None)
            .await
            .expect_err("upload with too large size succeeded");

        // Correct size is 5, this should succeed.
-        storage.upload(Box::new(content), 5, &id, None).await?;
+        storage.upload(content(), 5, &id, None).await?;

        Ok(())
    }
@@ -587,7 +600,7 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);

-        let mut first_part_download = storage
+        let first_part_download = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
        assert!(
@@ -595,21 +608,13 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut first_part_download.download_stream,
-            &mut first_part_remote,
-        )
-        .await?;
-        first_part_remote.flush().await?;
-        let first_part_remote = first_part_remote.into_inner().into_inner();
+        let first_part_remote = aggregate(first_part_download.download_stream).await?;
        assert_eq!(
-            first_part_local,
-            first_part_remote.as_slice(),
+            first_part_local, first_part_remote,
            "First part bytes should be returned when requested"
        );

-        let mut second_part_download = storage
+        let second_part_download = storage
            .download_byte_range(
                &upload_target,
                first_part_local.len() as u64,
@@ -621,17 +626,9 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut second_part_download.download_stream,
-            &mut second_part_remote,
-        )
-        .await?;
-        second_part_remote.flush().await?;
-        let second_part_remote = second_part_remote.into_inner().into_inner();
+        let second_part_remote = aggregate(second_part_download.download_stream).await?;
        assert_eq!(
-            second_part_local,
-            second_part_remote.as_slice(),
+            second_part_local, second_part_remote,
            "Second part bytes should be returned when requested"
        );

@@ -721,17 +718,10 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, _) = uploaded_bytes.split_at(3);

-        let mut partial_download_with_metadata = storage
+        let partial_download_with_metadata = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut partial_download_with_metadata.download_stream,
-            &mut first_part_remote,
-        )
-        .await?;
-        first_part_remote.flush().await?;
-        let first_part_remote = first_part_remote.into_inner().into_inner();
+        let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
        assert_eq!(
            first_part_local,
            first_part_remote.as_slice(),
@@ -807,16 +797,16 @@ mod fs_tests {
                )
            })?;

-        storage
-            .upload(Box::new(file), size, &relative_path, metadata)
-            .await?;
+        let file = tokio_util::io::ReaderStream::new(file);
+
+        storage.upload(file, size, &relative_path, metadata).await?;
        Ok(relative_path)
    }

    async fn create_file_for_upload(
        path: &Utf8Path,
        contents: &str,
-    ) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
+    ) -> anyhow::Result<(fs::File, usize)> {
        std::fs::create_dir_all(path.parent().unwrap())?;
        let mut file_for_writing = std::fs::OpenOptions::new()
            .write(true)
@@ -826,7 +816,7 @@ mod fs_tests {
        drop(file_for_writing);
        let file_size = path.metadata()?.len() as usize;
        Ok((
-            io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
+            fs::OpenOptions::new().read(true).open(&path).await?,
            file_size,
        ))
    }
@@ -840,4 +830,16 @@ mod fs_tests {
        files.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(files)
    }
+
+    async fn aggregate(
+        stream: impl Stream<Item = std::io::Result<Bytes>>,
+    ) -> anyhow::Result<Vec<u8>> {
+        use futures::stream::StreamExt;
+        let mut out = Vec::new();
+        let mut stream = std::pin::pin!(stream);
+        while let Some(res) = stream.next().await {
+            out.extend_from_slice(&res?[..]);
+        }
+        Ok(out)
+    }
 }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,29 +4,39 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::borrow::Cow;
+use std::{
+    borrow::Cow,
+    pin::Pin,
+    sync::Arc,
+    task::{Context, Poll},
+};

-use anyhow::Context;
+use anyhow::Context as _;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
-    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
-    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider,
+    meta::credentials::CredentialsProviderChain,
+    provider_config::ProviderConfig,
+    retry::{RetryConfigBuilder, RetryMode},
+    web_identity_token::WebIdentityTokenCredentialsProvider,
+    BehaviorVersion,
 };
-use aws_credential_types::cache::CredentialsCache;
+use aws_credential_types::provider::SharedCredentialsProvider;
 use aws_sdk_s3::{
-    config::{Config, Region},
+    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
-    primitives::ByteStream,
    types::{Delete, ObjectIdentifier},
    Client,
 };
-use aws_smithy_http::body::SdkBody;
+use aws_smithy_async::rt::sleep::TokioSleep;
+
+use aws_smithy_types::body::SdkBody;
+use aws_smithy_types::byte_stream::ByteStream;
+use bytes::Bytes;
+use futures::stream::Stream;
 use hyper::Body;
 use scopeguard::ScopeGuard;
-use tokio::io::{self, AsyncRead};
-use tokio_util::io::ReaderStream;
-use tracing::debug;

 use super::StorageMetadata;
 use crate::{
@@ -57,7 +67,7 @@ struct GetObjectRequest {
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
    pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
-        debug!(
+        tracing::debug!(
            "Creating s3 remote storage for S3 bucket {}",
            aws_config.bucket_name
        );
@@ -74,7 +84,6 @@ impl S3Bucket {
            // needed to access remote extensions bucket
            .or_else("token", {
                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
-
                WebIdentityTokenCredentialsProvider::builder()
                    .configure(&provider_conf)
                    .build()
@@ -83,16 +92,31 @@ impl S3Bucket {
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

-        let mut config_builder = Config::builder()
+        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
+        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
+
+        // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
+        // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
+        // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
+        let mut retry_config = RetryConfigBuilder::new();
+        retry_config
+            .set_max_attempts(Some(1))
+            .set_mode(Some(RetryMode::Adaptive));
+
+        let mut config_builder = Builder::default()
+            .behavior_version(BehaviorVersion::v2023_11_09())
            .region(region)
-            .credentials_cache(CredentialsCache::lazy())
-            .credentials_provider(credentials_provider);
+            .identity_cache(IdentityCache::lazy().build())
+            .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
+            .retry_config(retry_config.build())
+            .sleep_impl(SharedAsyncSleep::from(sleep_impl));

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
            config_builder = config_builder
                .endpoint_url(custom_endpoint)
                .force_path_style(true);
        }
+
        let client = Client::from_conf(config_builder.build());

        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
@@ -205,12 +229,15 @@ impl S3Bucket {
        match get_object {
            Ok(object_output) => {
                let metadata = object_output.metadata().cloned().map(StorageMetadata);
+
+                let body = object_output.body;
+                let body = ByteStreamAsStream::from(body);
+                let body = PermitCarrying::new(permit, body);
+                let body = TimedDownload::new(started_at, body);
+
                Ok(Download {
                    metadata,
-                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
-                        started_at,
-                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
-                    ))),
+                    download_stream: Box::pin(body),
                })
            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -223,29 +250,55 @@ impl S3Bucket {
    }
 }

+pin_project_lite::pin_project! {
+    struct ByteStreamAsStream {
+        #[pin]
+        inner: aws_smithy_types::byte_stream::ByteStream
+    }
+}
+
+impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
+    fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
+        ByteStreamAsStream { inner }
+    }
+}
+
+impl Stream for ByteStreamAsStream {
+    type Item = std::io::Result<Bytes>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        // this does the std::io::ErrorKind::Other conversion
+        self.project().inner.poll_next(cx).map_err(|x| x.into())
+    }
+
+    // cannot implement size_hint because inner.size_hint is remaining size in bytes, which makes
+    // sense and Stream::size_hint does not really
+}
+
 pin_project_lite::pin_project! {
    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    struct RatelimitedAsyncRead<S> {
+    struct PermitCarrying<S> {
        permit: tokio::sync::OwnedSemaphorePermit,
        #[pin]
        inner: S,
    }
 }

-impl<S: AsyncRead> RatelimitedAsyncRead<S> {
+impl<S> PermitCarrying<S> {
    fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        RatelimitedAsyncRead { permit, inner }
+        Self { permit, inner }
    }
 }

-impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
-        let this = self.project();
-        this.inner.poll_read(cx, buf)
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.project().inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
    }
 }

@@ -265,7 +318,7 @@ pin_project_lite::pin_project! {
    }
 }

-impl<S: AsyncRead> TimedDownload<S> {
+impl<S> TimedDownload<S> {
    fn new(started_at: std::time::Instant, inner: S) -> Self {
        TimedDownload {
            started_at,
@@ -275,25 +328,26 @@ impl<S: AsyncRead> TimedDownload<S> {
    }
 }

-impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        use std::task::ready;
+
        let this = self.project();
-        let before = buf.filled().len();
-        let read = std::task::ready!(this.inner.poll_read(cx, buf));

-        let read_eof = buf.filled().len() == before;
-
-        match read {
-            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
-            Ok(()) => { /* still in progress */ }
-            Err(_) => *this.outcome = AttemptOutcome::Err,
+        let res = ready!(this.inner.poll_next(cx));
+        match &res {
+            Some(Ok(_)) => {}
+            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
+            None => *this.outcome = metrics::AttemptOutcome::Ok,
        }

-        std::task::Poll::Ready(read)
+        Poll::Ready(res)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
    }
 }

@@ -354,11 +408,11 @@ impl RemoteStorage for S3Bucket {

            let response = response?;

-            let keys = response.contents().unwrap_or_default();
+            let keys = response.contents();
            let empty = Vec::new();
            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);

-            tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+            tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());

            for object in keys {
                let object_path = object.key().expect("response does not contain a key");
@@ -383,7 +437,7 @@ impl RemoteStorage for S3Bucket {

    async fn upload(
        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -393,8 +447,8 @@ impl RemoteStorage for S3Bucket {

        let started_at = start_measuring_requests(kind);

-        let body = Body::wrap_stream(ReaderStream::new(from));
-        let bytes_stream = ByteStream::new(SdkBody::from(body));
+        let body = Body::wrap_stream(from);
+        let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));

        let res = self
            .client
@@ -457,7 +511,7 @@ impl RemoteStorage for S3Bucket {
        for path in paths {
            let obj_id = ObjectIdentifier::builder()
                .set_key(Some(self.relative_path_to_s3_object(path)))
-                .build();
+                .build()?;
            delete_objects.push(obj_id);
        }

@@ -468,7 +522,11 @@ impl RemoteStorage for S3Bucket {
                .client
                .delete_objects()
                .bucket(self.bucket_name.clone())
-                .delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
+                .delete(
+                    Delete::builder()
+                        .set_objects(Some(chunk.to_vec()))
+                        .build()?,
+                )
                .send()
                .await;

--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -1,6 +1,8 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
+use bytes::Bytes;
+use futures::stream::Stream;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
@@ -108,7 +110,7 @@ impl RemoteStorage for UnreliableWrapper {

    async fn upload(
        &self,
-        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        data_size_bytes: usize,
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -1,13 +1,15 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
    AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
@@ -180,23 +182,14 @@ async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Resu
    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;

    ctx.client.delete_objects(&[path1, path2]).await?;

@@ -219,53 +212,56 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let data = "remote blob data here".as_bytes();
-    let data_len = data.len() as u64;
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());

-    ctx.client
-        .upload(std::io::Cursor::new(data), data.len(), &path, None)
-        .await?;
+    let (data, len) = wrap_stream(orig.clone());

-    async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
+    ctx.client.upload(data, len, &path, None).await?;
+
+    async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
        let mut buf = Vec::new();
-        tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
+        tokio::io::copy_buf(
+            &mut tokio_util::io::StreamReader::new(dl.download_stream),
+            &mut buf,
+        )
+        .await?;
        Ok(buf)
    }
    // Normal download request
    let dl = ctx.client.download(&path).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);

    // Full range (end specified)
    let dl = ctx
        .client
-        .download_byte_range(&path, 0, Some(data_len))
+        .download_byte_range(&path, 0, Some(len as u64))
        .await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);

    // partial range (end specified)
    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..10]);
+    assert_eq!(&buf, &orig[4..10]);

    // partial range (end beyond real end)
    let dl = ctx
        .client
-        .download_byte_range(&path, 8, Some(data_len * 100))
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
        .await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[8..]);
+    assert_eq!(&buf, &orig[8..]);

    // Partial range (end unspecified)
    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..]);
+    assert_eq!(&buf, &orig[4..]);

    // Full range (end unspecified)
    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
    let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);

    debug!("Cleanup: deleting file at path {path:?}");
    ctx.client
@@ -281,6 +277,7 @@ fn ensure_logging_ready() {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
        )
        .expect("logging init failed");
    });
@@ -469,8 +466,6 @@ fn create_azure_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AzureContainer(AzureConfig {
            container_name: remote_storage_azure_container,
            container_region: remote_storage_azure_region,
@@ -505,11 +500,8 @@ async fn upload_azure_data(
            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
@@ -590,11 +582,8 @@ async fn upload_simple_azure_data(
            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;

            Ok::<_, anyhow::Error>(blob_path)
        });
@@ -623,3 +612,32 @@ async fn upload_simple_azure_data(
        ControlFlow::Continue(uploaded_blobs)
    }
 }
+
+// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled
+// to binary
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,13 +1,15 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -176,23 +178,14 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;

    ctx.client.delete_objects(&[path1, path2]).await?;

@@ -210,6 +203,7 @@ fn ensure_logging_ready() {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
        )
        .expect("logging init failed");
    });
@@ -396,8 +390,6 @@ fn create_s3_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
@@ -433,11 +425,9 @@ async fn upload_s3_data(
            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
@@ -518,11 +508,9 @@ async fn upload_simple_s3_data(
            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;

            Ok::<_, anyhow::Error>(blob_path)
        });
@@ -551,3 +539,30 @@ async fn upload_simple_s3_data(
        ControlFlow::Continue(uploaded_blobs)
    }
 }
+
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,4 +1,6 @@
 //! Synthetic size calculation
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod calculation;
 pub mod svg;
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -32,6 +32,8 @@
 //!         .init();
 //! }
 //! ```
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use opentelemetry::sdk::Resource;
 use opentelemetry::KeyValue;
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,6 +5,7 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+arc-swap.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
@@ -49,6 +50,8 @@ const_format.workspace = true
 # why is it only here? no other crate should use it, streams are rarely needed.
 tokio-stream = { version = "0.1.14" }

+serde_path_to_error.workspace = true
+
 [dev-dependencies]
 byteorder.workspace = true
 bytes.workspace = true
--- a/libs/utils/scripts/restore_from_wal_initdb.sh
+++ b/libs/utils/scripts/restore_from_wal_initdb.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# like restore_from_wal.sh, but takes existing initdb.tar.zst
+
+set -euxo pipefail
+
+PG_BIN=$1
+WAL_PATH=$2
+DATA_DIR=$3
+PORT=$4
+echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
+echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
+REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
+declare -i WAL_SIZE=$REDO_POS+114
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
+for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
+dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+rm -f 000000010000000000000001
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,8 @@
 // For details about authentication see docs/authentication.md

+use arc_swap::ArcSwap;
 use serde;
-use std::fs;
+use std::{borrow::Cow, fmt::Display, fs, sync::Arc};

 use anyhow::Result;
 use camino::Utf8Path;
@@ -10,7 +11,7 @@ use jsonwebtoken::{
 };
 use serde::{Deserialize, Serialize};

-use crate::id::TenantId;
+use crate::{http::error::ApiError, id::TenantId};

 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -44,31 +45,106 @@ impl Claims {
    }
 }

+pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
+
+impl SwappableJwtAuth {
+    pub fn new(jwt_auth: JwtAuth) -> Self {
+        SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
+    }
+    pub fn swap(&self, jwt_auth: JwtAuth) {
+        self.0.swap(Arc::new(jwt_auth));
+    }
+    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+        self.0.load().decode(token)
+    }
+}
+
+impl std::fmt::Debug for SwappableJwtAuth {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Swappable({:?})", self.0.load())
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+pub struct AuthError(pub Cow<'static, str>);
+
+impl Display for AuthError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+impl From<AuthError> for ApiError {
+    fn from(_value: AuthError) -> Self {
+        // Don't pass on the value of the AuthError as a precautionary measure.
+        // Being intentionally vague in public error communication hurts debugability
+        // but it is more secure.
+        ApiError::Forbidden("JWT authentication error".to_string())
+    }
+}
+
 pub struct JwtAuth {
-    decoding_key: DecodingKey,
+    decoding_keys: Vec<DecodingKey>,
    validation: Validation,
 }

 impl JwtAuth {
-    pub fn new(decoding_key: DecodingKey) -> Self {
+    pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
        let mut validation = Validation::default();
        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
        // The default 'required_spec_claims' is 'exp'. But we don't want to require
        // expiration.
        validation.required_spec_claims = [].into();
        Self {
-            decoding_key,
+            decoding_keys,
            validation,
        }
    }

    pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
-        let public_key = fs::read(key_path)?;
-        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
+        let metadata = key_path.metadata()?;
+        let decoding_keys = if metadata.is_dir() {
+            let mut keys = Vec::new();
+            for entry in fs::read_dir(key_path)? {
+                let path = entry?.path();
+                if !path.is_file() {
+                    // Ignore directories (don't recurse)
+                    continue;
+                }
+                let public_key = fs::read(path)?;
+                keys.push(DecodingKey::from_ed_pem(&public_key)?);
+            }
+            keys
+        } else if metadata.is_file() {
+            let public_key = fs::read(key_path)?;
+            vec![DecodingKey::from_ed_pem(&public_key)?]
+        } else {
+            anyhow::bail!("path is neither a directory or a file")
+        };
+        if decoding_keys.is_empty() {
+            anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
+        }
+        Ok(Self::new(decoding_keys))
    }

-    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
-        Ok(decode(token, &self.decoding_key, &self.validation)?)
+    /// Attempt to decode the token with the internal decoding keys.
+    ///
+    /// The function tries the stored decoding keys in succession,
+    /// and returns the first yielding a successful result.
+    /// If there is no working decoding key, it returns the last error.
+    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+        let mut res = None;
+        for decoding_key in &self.decoding_keys {
+            res = Some(decode(token, decoding_key, &self.validation));
+            if let Some(Ok(res)) = res {
+                return Ok(res);
+            }
+        }
+        if let Some(res) = res {
+            res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
+        } else {
+            Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
+        }
    }
 }

@@ -108,9 +184,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
 "#;

    #[test]
-    fn test_decode() -> Result<(), anyhow::Error> {
+    fn test_decode() {
        let expected_claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
        };

@@ -129,28 +205,24 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";

        // Check it can be validated with the public key
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
-        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
+        let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
        assert_eq!(claims_from_token, expected_claims);
-
-        Ok(())
    }

    #[test]
-    fn test_encode() -> Result<(), anyhow::Error> {
+    fn test_encode() {
        let claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
        };

-        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
+        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();

        // decode it back
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
-        let decoded = auth.decode(&encoded)?;
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
+        let decoded = auth.decode(&encoded).unwrap();

        assert_eq!(decoded.claims, claims);
-
-        Ok(())
    }
 }
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -1,16 +1,14 @@
-use std::sync::Arc;
-
-use tokio::sync::{mpsc, Mutex};
+use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};

 /// While a reference is kept around, the associated [`Barrier::wait`] will wait.
 ///
 /// Can be cloned, moved and kept around in futures as "guard objects".
 #[derive(Clone)]
-pub struct Completion(mpsc::Sender<()>);
+pub struct Completion(TaskTrackerToken);

 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
-pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
+pub struct Barrier(TaskTracker);

 impl Default for Barrier {
    fn default() -> Self {
@@ -21,7 +19,7 @@ impl Default for Barrier {

 impl Barrier {
    pub async fn wait(self) {
-        self.0.lock().await.recv().await;
+        self.0.wait().await;
    }

    pub async fn maybe_wait(barrier: Option<Barrier>) {
@@ -33,8 +31,7 @@ impl Barrier {

 impl PartialEq for Barrier {
    fn eq(&self, other: &Self) -> bool {
-        // we don't use dyn so this is good
-        Arc::ptr_eq(&self.0, &other.0)
+        TaskTracker::ptr_eq(&self.0, &other.0)
    }
 }

@@ -42,8 +39,10 @@ impl Eq for Barrier {}

 /// Create new Guard and Barrier pair.
 pub fn channel() -> (Completion, Barrier) {
-    let (tx, rx) = mpsc::channel::<()>(1);
-    let rx = Mutex::new(rx);
-    let rx = Arc::new(rx);
-    (Completion(tx), Barrier(rx))
+    let tracker = TaskTracker::new();
+    // otherwise wait never exits
+    tracker.close();
+
+    let token = tracker.token();
+    (Completion(token), Barrier(tracker))
 }
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -152,3 +152,16 @@ impl Debug for Generation {
        }
    }
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn generation_gt() {
+        // Important that a None generation compares less than a valid one, during upgrades from
+        // pre-generation systems.
+        assert!(Generation::none() < Generation::new(0));
+        assert!(Generation::none() < Generation::new(1));
+    }
+}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,4 +1,4 @@
-use crate::auth::{Claims, JwtAuth};
+use crate::auth::{AuthError, Claims, SwappableJwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
 }

 pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-    provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
+    provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
        if let Some(auth) = provide_auth(&req) {
@@ -400,9 +400,11 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
                    })?;
                    let token = parse_token(header_value)?;

-                    let data = auth
-                        .decode(token)
-                        .map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
+                    let data = auth.decode(token).map_err(|err| {
+                        warn!("Authentication error: {err}");
+                        // Rely on From<AuthError> for ApiError impl
+                        err
+                    })?;
                    req.set_context(data.claims);
                }
                None => {
@@ -450,12 +452,11 @@ where

 pub fn check_permission_with(
    req: &Request<Body>,
-    check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
+    check_permission: impl Fn(&Claims) -> Result<(), AuthError>,
 ) -> Result<(), ApiError> {
    match req.context::<Claims>() {
-        Some(claims) => {
-            Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
-        }
+        Some(claims) => Ok(check_permission(&claims)
+            .map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?),
        None => Ok(()), // claims is None because auth is disabled
    }
 }
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::{error, info};
+use tracing::{error, info, warn};

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -118,6 +118,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors

    match api_error {
+        ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
+            warn!("Error processing HTTP request: {api_error:#}")
+        }
        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -25,8 +25,12 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
    if body.remaining() == 0 {
        return Ok(None);
    }
-    serde_json::from_reader(body.reader())
-        .context("Failed to parse json request")
+
+    let mut deser = serde_json::de::Deserializer::from_reader(body.reader());
+
+    serde_path_to_error::deserialize(&mut deser)
+        // intentionally stringify because the debug version is not helpful in python logs
+        .map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
        .map(Some)
        .map_err(ApiError::BadRequest)
 }
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -120,6 +120,8 @@ impl Id {
            chunk[0] = HEX[((b >> 4) & 0xf) as usize];
            chunk[1] = HEX[(b & 0xf) as usize];
        }
+
+        // SAFETY: vec constructed out of `HEX`, it can only be ascii
        unsafe { String::from_utf8_unchecked(buf) }
    }
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,5 +1,6 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod backoff;

@@ -77,6 +78,9 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

+/// async timeout helper
+pub mod timeout;
+
 pub mod sync;

 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,6 +1,7 @@
 use std::str::FromStr;

 use anyhow::Context;
+use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, EnumVariantNames};

@@ -24,16 +25,48 @@ impl LogFormat {
    }
 }

-static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
-    metrics::register_int_counter_vec!(
+struct TracingEventCountMetric {
+    error: IntCounter,
+    warn: IntCounter,
+    info: IntCounter,
+    debug: IntCounter,
+    trace: IntCounter,
+}
+
+static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
+    let vec = metrics::register_int_counter_vec!(
        "libmetrics_tracing_event_count",
        "Number of tracing events, by level",
        &["level"]
    )
-    .expect("failed to define metric")
+    .expect("failed to define metric");
+    TracingEventCountMetric::new(vec)
 });

-struct TracingEventCountLayer(&'static metrics::IntCounterVec);
+impl TracingEventCountMetric {
+    fn new(vec: IntCounterVec) -> Self {
+        Self {
+            error: vec.with_label_values(&["error"]),
+            warn: vec.with_label_values(&["warn"]),
+            info: vec.with_label_values(&["info"]),
+            debug: vec.with_label_values(&["debug"]),
+            trace: vec.with_label_values(&["trace"]),
+        }
+    }
+
+    fn inc_for_level(&self, level: tracing::Level) {
+        let counter = match level {
+            tracing::Level::ERROR => &self.error,
+            tracing::Level::WARN => &self.warn,
+            tracing::Level::INFO => &self.info,
+            tracing::Level::DEBUG => &self.debug,
+            tracing::Level::TRACE => &self.trace,
+        };
+        counter.inc();
+    }
+}
+
+struct TracingEventCountLayer(&'static TracingEventCountMetric);

 impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
 where
@@ -44,15 +77,7 @@ where
        event: &tracing::Event<'_>,
        _ctx: tracing_subscriber::layer::Context<'_, S>,
    ) {
-        let level = event.metadata().level();
-        let level = match *level {
-            tracing::Level::ERROR => "error",
-            tracing::Level::WARN => "warn",
-            tracing::Level::INFO => "info",
-            tracing::Level::DEBUG => "debug",
-            tracing::Level::TRACE => "trace",
-        };
-        self.0.with_label_values(&[level]).inc();
+        self.0.inc_for_level(*event.metadata().level());
    }
 }

@@ -66,9 +91,17 @@ pub enum TracingErrorLayerEnablement {
    EnableWithRustLogFilter,
 }

+/// Where the logging should output to.
+#[derive(Clone, Copy)]
+pub enum Output {
+    Stdout,
+    Stderr,
+}
+
 pub fn init(
    log_format: LogFormat,
    tracing_error_layer_enablement: TracingErrorLayerEnablement,
+    output: Output,
 ) -> anyhow::Result<()> {
    // We fall back to printing all spans at info-level or above if
    // the RUST_LOG environment variable is not set.
@@ -85,7 +118,12 @@ pub fn init(
        let log_layer = tracing_subscriber::fmt::layer()
            .with_target(false)
            .with_ansi(false)
-            .with_writer(std::io::stdout);
+            .with_writer(move || -> Box<dyn std::io::Write> {
+                match output {
+                    Output::Stdout => Box::new(std::io::stdout()),
+                    Output::Stderr => Box::new(std::io::stderr()),
+                }
+            });
        let log_layer = match log_format {
            LogFormat::Json => log_layer.json().boxed(),
            LogFormat::Plain => log_layer.boxed(),
@@ -93,7 +131,9 @@ pub fn init(
        };
        log_layer.with_filter(rust_log_env_filter())
    });
-    let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
+    let r = r.with(
+        TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
+    );
    match tracing_error_layer_enablement {
        TracingErrorLayerEnablement::EnableWithRustLogFilter => r
            .with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
@@ -244,14 +284,14 @@ impl std::fmt::Debug for SecretString {
 mod tests {
    use metrics::{core::Opts, IntCounterVec};

-    use super::TracingEventCountLayer;
+    use crate::logging::{TracingEventCountLayer, TracingEventCountMetric};

    #[test]
    fn tracing_event_count_metric() {
        let counter_vec =
            IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
-        let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
-        let layer = TracingEventCountLayer(counter_vec);
+        let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone())));
+        let layer = TracingEventCountLayer(metric);
        use tracing_subscriber::prelude::*;

        tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -125,6 +125,9 @@ where
            // Wake everyone with an error.
            let mut internal = self.internal.lock().unwrap();

+            // Block any future waiters from starting
+            internal.shutdown = true;
+
            // This will steal the entire waiters map.
            // When we drop it all waiters will be woken.
            mem::take(&mut internal.waiters)
--- a/libs/utils/src/shutdown.rs
+++ b/libs/utils/src/shutdown.rs
@@ -1,6 +1,7 @@
 /// Immediately terminate the calling process without calling
 /// atexit callbacks, C runtime destructors etc. We mainly use
 /// this to protect coverage data from concurrent writes.
-pub fn exit_now(code: u8) {
+pub fn exit_now(code: u8) -> ! {
+    // SAFETY: exiting is safe, the ffi is not safe
    unsafe { nix::libc::_exit(code as _) };
 }
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -1,10 +1,10 @@
 //!
 //! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
 //! similar to a lock, but it allows readers to "hold on" to an old value of RCU
-//! without blocking writers, and allows writing a new values without blocking
-//! readers. When you update the new value, the new value is immediately visible
+//! without blocking writers, and allows writing a new value without blocking
+//! readers. When you update the value, the new value is immediately visible
 //! to new readers, but the update waits until all existing readers have
-//! finishe, so that no one sees the old value anymore.
+//! finished, so that on return, no one sees the old value anymore.
 //!
 //! This implementation isn't wait-free; it uses an RwLock that is held for a
 //! short duration when the value is read or updated.
@@ -26,6 +26,7 @@
 //! Increment the value by one, and wait for old readers to finish:
 //!
 //! ```
+//! # async fn dox() {
 //! # let rcu = utils::simple_rcu::Rcu::new(1);
 //! let write_guard = rcu.lock_for_write();
 //!
@@ -36,15 +37,17 @@
 //!
 //! // Concurrent reads and writes are now possible again. Wait for all the readers
 //! // that still observe the old value to finish.
-//! waitlist.wait();
+//! waitlist.wait().await;
+//! # }
 //! ```
 //!
 #![warn(missing_docs)]

 use std::ops::Deref;
-use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
 use std::sync::{Arc, Weak};
-use std::sync::{Mutex, RwLock, RwLockWriteGuard};
+use std::sync::{RwLock, RwLockWriteGuard};
+
+use tokio::sync::watch;

 ///
 /// Rcu allows multiple readers to read and hold onto a value without blocking
@@ -68,22 +71,21 @@ struct RcuCell<V> {
    value: V,

    /// A dummy channel. We never send anything to this channel. The point is
-    /// that when the RcuCell is dropped, any cloned Senders will be notified
+    /// that when the RcuCell is dropped, any subscribed Receivers will be notified
    /// that the channel is closed. Updaters can use this to wait out until the
    /// RcuCell has been dropped, i.e. until the old value is no longer in use.
    ///
-    /// We never do anything with the receiver, we just need to hold onto it so
-    /// that the Senders will be notified when it's dropped. But because it's
-    /// not Sync, we need a Mutex on it.
-    watch: (SyncSender<()>, Mutex<Receiver<()>>),
+    /// We never send anything to this, we just need to hold onto it so that the
+    /// Receivers will be notified when it's dropped.
+    watch: watch::Sender<()>,
 }

 impl<V> RcuCell<V> {
    fn new(value: V) -> Self {
-        let (watch_sender, watch_receiver) = sync_channel(0);
+        let (watch_sender, _) = watch::channel(());
        RcuCell {
            value,
-            watch: (watch_sender, Mutex::new(watch_receiver)),
+            watch: watch_sender,
        }
    }
 }
@@ -141,10 +143,10 @@ impl<V> Deref for RcuReadGuard<V> {
 ///
 /// Write guard returned by `write`
 ///
-/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
-/// it should only be held for a short duration!
+/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be
+/// held for a short duration!
 ///
-/// Calling `store` consumes the guard, making new reads and new writes possible
+/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible
 /// again.
 ///
 pub struct RcuWriteGuard<'a, V> {
@@ -179,7 +181,7 @@ impl<'a, V> RcuWriteGuard<'a, V> {
            // the watches for any that do.
            self.inner.old_cells.retain(|weak| {
                if let Some(cell) = weak.upgrade() {
-                    watches.push(cell.watch.0.clone());
+                    watches.push(cell.watch.subscribe());
                    true
                } else {
                    false
@@ -193,20 +195,20 @@ impl<'a, V> RcuWriteGuard<'a, V> {
 ///
 /// List of readers who can still see old values.
 ///
-pub struct RcuWaitList(Vec<SyncSender<()>>);
+pub struct RcuWaitList(Vec<watch::Receiver<()>>);

 impl RcuWaitList {
    ///
    /// Wait for old readers to finish.
    ///
-    pub fn wait(mut self) {
+    pub async fn wait(mut self) {
        // after all the old_cells are no longer in use, we're done
        for w in self.0.iter_mut() {
            // This will block until the Receiver is closed. That happens when
            // the RcuCell is dropped.
            #[allow(clippy::single_match)]
-            match w.send(()) {
-                Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
+            match w.changed().await {
+                Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"),
                Err(_) => {
                    // closed, which means that the cell has been dropped, and
                    // its value is no longer in use
@@ -220,11 +222,10 @@ impl RcuWaitList {
 mod tests {
    use super::*;
    use std::sync::{Arc, Mutex};
-    use std::thread::{sleep, spawn};
    use std::time::Duration;

-    #[test]
-    fn two_writers() {
+    #[tokio::test]
+    async fn two_writers() {
        let rcu = Rcu::new(1);

        let read1 = rcu.read();
@@ -248,33 +249,35 @@ mod tests {
        assert_eq!(*read1, 1);

        let log = Arc::new(Mutex::new(Vec::new()));
-        // Wait for the old readers to finish in separate threads.
+        // Wait for the old readers to finish in separate tasks.
        let log_clone = Arc::clone(&log);
-        let thread2 = spawn(move || {
-            wait2.wait();
+        let task2 = tokio::spawn(async move {
+            wait2.wait().await;
            log_clone.lock().unwrap().push("wait2 done");
        });
        let log_clone = Arc::clone(&log);
-        let thread3 = spawn(move || {
-            wait3.wait();
+        let task3 = tokio::spawn(async move {
+            wait3.wait().await;
            log_clone.lock().unwrap().push("wait3 done");
        });

        // without this sleep the test can pass on accident if the writer is slow
-        sleep(Duration::from_millis(500));
+        tokio::time::sleep(Duration::from_millis(100)).await;

        // Release first reader. This allows first write to finish, but calling
-        // wait() on the second one would still block.
+        // wait() on the 'task3' would still block.
        log.lock().unwrap().push("dropping read1");
        drop(read1);
-        thread2.join().unwrap();
+        task2.await.unwrap();

-        sleep(Duration::from_millis(500));
+        assert!(!task3.is_finished());
+
+        tokio::time::sleep(Duration::from_millis(100)).await;

        // Release second reader, and finish second writer.
        log.lock().unwrap().push("dropping read2");
        drop(read2);
-        thread3.join().unwrap();
+        task3.await.unwrap();

        assert_eq!(
            log.lock().unwrap().as_slice(),
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -1 +1,3 @@
 pub mod heavier_once_cell;
+
+pub mod gate;
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -0,0 +1,172 @@
+use std::{sync::Arc, time::Duration};
+
+/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
+///
+/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
+/// the resource calls `close()` when they want to ensure that all holders of guards
+/// have released them, and that no future guards will be issued.
+pub struct Gate {
+    /// Each caller of enter() takes one unit from the semaphore. In close(), we
+    /// take all the units to ensure all GateGuards are destroyed.
+    sem: Arc<tokio::sync::Semaphore>,
+
+    /// For observability only: a name that will be used to log warnings if a particular
+    /// gate is holding up shutdown
+    name: String,
+}
+
+/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
+/// not complete.
+#[derive(Debug)]
+pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
+
+/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
+async fn warn_if_stuck<Fut: std::future::Future>(
+    fut: Fut,
+    name: &str,
+    warn_period: std::time::Duration,
+) -> <Fut as std::future::Future>::Output {
+    let started = std::time::Instant::now();
+
+    let mut fut = std::pin::pin!(fut);
+
+    let mut warned = false;
+    let ret = loop {
+        match tokio::time::timeout(warn_period, &mut fut).await {
+            Ok(ret) => break ret,
+            Err(_) => {
+                tracing::warn!(
+                    gate = name,
+                    elapsed_ms = started.elapsed().as_millis(),
+                    "still waiting, taking longer than expected..."
+                );
+                warned = true;
+            }
+        }
+    };
+
+    // If we emitted a warning for slowness, also emit a message when we complete, so that
+    // someone debugging a shutdown can know for sure whether we have moved past this operation.
+    if warned {
+        tracing::info!(
+            gate = name,
+            elapsed_ms = started.elapsed().as_millis(),
+            "completed, after taking longer than expected"
+        )
+    }
+
+    ret
+}
+
+#[derive(Debug)]
+pub enum GateError {
+    GateClosed,
+}
+
+impl Gate {
+    const MAX_UNITS: u32 = u32::MAX;
+
+    pub fn new(name: String) -> Self {
+        Self {
+            sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
+            name,
+        }
+    }
+
+    /// Acquire a guard that will prevent close() calls from completing. If close()
+    /// was already called, this will return an error which should be interpreted
+    /// as "shutting down".
+    ///
+    /// This function would typically be used from e.g. request handlers. While holding
+    /// the guard returned from this function, it is important to respect a CancellationToken
+    /// to avoid blocking close() indefinitely: typically types that contain a Gate will
+    /// also contain a CancellationToken.
+    pub fn enter(&self) -> Result<GateGuard, GateError> {
+        self.sem
+            .clone()
+            .try_acquire_owned()
+            .map(GateGuard)
+            .map_err(|_| GateError::GateClosed)
+    }
+
+    /// Types with a shutdown() method and a gate should call this method at the
+    /// end of shutdown, to ensure that all GateGuard holders are done.
+    ///
+    /// This will wait for all guards to be destroyed.  For this to complete promptly, it is
+    /// important that the holders of such guards are respecting a CancellationToken which has
+    /// been cancelled before entering this function.
+    pub async fn close(&self) {
+        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
+    }
+
+    /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish.  This
+    /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
+    /// the CancellationToken on such types is analogous to "Did shutdown start?"
+    pub fn close_complete(&self) -> bool {
+        self.sem.is_closed()
+    }
+
+    async fn do_close(&self) {
+        tracing::debug!(gate = self.name, "Closing Gate...");
+        match self.sem.acquire_many(Self::MAX_UNITS).await {
+            Ok(_units) => {
+                // While holding all units, close the semaphore.  All subsequent calls to enter() will fail.
+                self.sem.close();
+            }
+            Err(_) => {
+                // Semaphore closed: we are the only function that can do this, so it indicates a double-call.
+                // This is legal.  Timeline::shutdown for example is not protected from being called more than
+                // once.
+                tracing::debug!(gate = self.name, "Double close")
+            }
+        }
+        tracing::debug!(gate = self.name, "Closed Gate.")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use futures::FutureExt;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn test_idle_gate() {
+        // Having taken no gates, we should not be blocked in close
+        let gate = Gate::new("test".to_string());
+        gate.close().await;
+
+        // If a guard is dropped before entering, close should not be blocked
+        let gate = Gate::new("test".to_string());
+        let guard = gate.enter().unwrap();
+        drop(guard);
+        gate.close().await;
+
+        // Entering a closed guard fails
+        gate.enter().expect_err("enter should fail after close");
+    }
+
+    #[tokio::test]
+    async fn test_busy_gate() {
+        let gate = Gate::new("test".to_string());
+
+        let guard = gate.enter().unwrap();
+
+        let mut close_fut = std::pin::pin!(gate.close());
+
+        // Close should be blocked
+        assert!(close_fut.as_mut().now_or_never().is_none());
+
+        // Attempting to enter() should fail, even though close isn't done yet.
+        gate.enter()
+            .expect_err("enter should fail after entering close");
+
+        drop(guard);
+
+        // Guard is gone, close should finish
+        assert!(close_fut.as_mut().now_or_never().is_some());
+
+        // Attempting to enter() is still forbidden
+        gate.enter().expect_err("enter should fail finishing close");
+    }
+}
--- a/libs/utils/src/timeout.rs
+++ b/libs/utils/src/timeout.rs
@@ -0,0 +1,37 @@
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+
+pub enum TimeoutCancellableError {
+    Timeout,
+    Cancelled,
+}
+
+/// Wrap [`tokio::time::timeout`] with a CancellationToken.
+///
+/// This wrapper is appropriate for any long running operation in a task
+/// that ought to respect a CancellationToken (which means most tasks).
+///
+/// The only time you should use a bare tokio::timeout is when the future `F`
+/// itself respects a CancellationToken: otherwise, always use this wrapper
+/// with your CancellationToken to ensure that your task does not hold up
+/// graceful shutdown.
+pub async fn timeout_cancellable<F>(
+    duration: Duration,
+    cancel: &CancellationToken,
+    future: F,
+) -> Result<F::Output, TimeoutCancellableError>
+where
+    F: std::future::Future,
+{
+    tokio::select!(
+        r = tokio::time::timeout(duration, future) => {
+            r.map_err(|_| TimeoutCancellableError::Timeout)
+
+        },
+        _ = cancel.cancelled() => {
+            Err(TimeoutCancellableError::Cancelled)
+
+        }
+    )
+}
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -19,13 +19,12 @@ inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [target.'cfg(target_os = "linux")'.dependencies]
 cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 #![cfg(target_os = "linux")]

 use anyhow::Context;
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -188,6 +188,7 @@ extern "C" fn recovery_download(
    }
 }

+#[allow(clippy::unnecessary_cast)]
 extern "C" fn wal_read(
    sk: *mut Safekeeper,
    buf: *mut ::std::os::raw::c_char,
@@ -421,6 +422,7 @@ impl std::fmt::Display for Level {
 }

 /// Take ownership of `Vec<u8>` from StringInfoData.
+#[allow(clippy::unnecessary_cast)]
 pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
    if pg.data.is_null() {
        return None;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -186,7 +186,7 @@ impl Wrapper {
            .unwrap()
            .into_bytes_with_nul();
        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
-        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
+        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;

        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

@@ -436,9 +436,9 @@ mod tests {
                event_mask: 0,
            }),
            expected_messages: vec![
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
@@ -478,7 +478,7 @@ mod tests {
        // walproposer will panic when it finishes sync_safekeepers
        std::panic::catch_unwind(|| wp.start()).unwrap_err();
        // validate the resulting LSN
-        assert_eq!(receiver.recv()?, 1337);
+        assert_eq!(receiver.try_recv(), Ok(1337));
        Ok(())
        // drop() will free up resources here
    }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -36,6 +36,7 @@ humantime.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
+md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
 num_cpus = { version = "1.15" }
@@ -51,6 +52,7 @@ regex.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
+serde_path_to_error.workspace = true
 serde_with.workspace = true
 signal-hook.workspace = true
 smallvec = { workspace = true, features = ["write"] }
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -3,6 +3,7 @@ use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::storage_layer::PersistentLayerDesc;
+use pageserver_api::shard::TenantShardId;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
 use std::fs::File;
@@ -211,7 +212,7 @@ fn bench_sequential(c: &mut Criterion) {
        let i32 = (i as u32) % 100;
        let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
        let layer = PersistentLayerDesc::new_img(
-            TenantId::generate(),
+            TenantShardId::unsharded(TenantId::generate()),
            TimelineId::generate(),
            zero.add(10 * i32)..zero.add(10 * i32 + 1),
            Lsn(i),
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -18,3 +18,5 @@ tokio.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
+serde.workspace = true
+serde_json.workspace = true
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -0,0 +1,38 @@
+use std::collections::HashMap;
+
+use anyhow::Context;
+use camino::Utf8PathBuf;
+use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
+use utils::lsn::Lsn;
+
+#[derive(clap::Subcommand)]
+pub(crate) enum IndexPartCmd {
+    Dump { path: Utf8PathBuf },
+}
+
+pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
+    match cmd {
+        IndexPartCmd::Dump { path } => {
+            let bytes = tokio::fs::read(path).await.context("read file")?;
+            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
+            #[derive(serde::Serialize)]
+            struct Output<'a> {
+                layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
+                disk_consistent_lsn: Lsn,
+                timeline_metadata: &'a TimelineMetadata,
+            }
+
+            let output = Output {
+                layer_metadata: &des.layer_metadata,
+                disk_consistent_lsn: des.get_disk_consistent_lsn(),
+                timeline_metadata: &des.metadata,
+            };
+
+            let output = serde_json::to_string_pretty(&output).context("serialize output")?;
+            println!("{output}");
+            Ok(())
+        }
+    }
+}
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -1,13 +1,15 @@
 use std::path::{Path, PathBuf};

 use anyhow::Result;
-use camino::Utf8Path;
+use camino::{Utf8Path, Utf8PathBuf};
 use clap::Subcommand;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
 use pageserver::tenant::disk_btree::DiskBtreeReader;
 use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
+use pageserver::tenant::storage_layer::{delta_layer, image_layer};
+use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use pageserver::{page_cache, virtual_file};
 use pageserver::{
@@ -20,6 +22,7 @@ use pageserver::{
 };
 use std::fs;
 use utils::bin_ser::BeSer;
+use utils::id::{TenantId, TimelineId};

 use crate::layer_map_analyzer::parse_filename;

@@ -45,6 +48,13 @@ pub(crate) enum LayerCmd {
        /// The id from list-layer command
        id: usize,
    },
+    RewriteSummary {
+        layer_file_path: Utf8PathBuf,
+        #[clap(long)]
+        new_tenant_id: Option<TenantId>,
+        #[clap(long)]
+        new_timeline_id: Option<TimelineId>,
+    },
 }

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
@@ -100,6 +110,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                    println!("- timeline {}", timeline.file_name().to_string_lossy());
                }
            }
+            Ok(())
        }
        LayerCmd::ListLayer {
            path,
@@ -128,6 +139,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                    idx += 1;
                }
            }
+            Ok(())
        }
        LayerCmd::DumpLayer {
            path,
@@ -168,7 +180,63 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                    idx += 1;
                }
            }
+            Ok(())
+        }
+        LayerCmd::RewriteSummary {
+            layer_file_path,
+            new_tenant_id,
+            new_timeline_id,
+        } => {
+            pageserver::virtual_file::init(10);
+            pageserver::page_cache::init(100);
+
+            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
+
+            macro_rules! rewrite_closure {
+                ($($summary_ty:tt)*) => {{
+                    |summary| $($summary_ty)* {
+                        tenant_id: new_tenant_id.unwrap_or(summary.tenant_id),
+                        timeline_id: new_timeline_id.unwrap_or(summary.timeline_id),
+                        ..summary
+                    }
+                }};
+            }
+
+            let res = ImageLayer::rewrite_summary(
+                layer_file_path,
+                rewrite_closure!(image_layer::Summary),
+                &ctx,
+            )
+            .await;
+            match res {
+                Ok(()) => {
+                    println!("Successfully rewrote summary of image layer {layer_file_path}");
+                    return Ok(());
+                }
+                Err(image_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
+                Err(image_layer::RewriteSummaryError::Other(e)) => {
+                    return Err(e);
+                }
+            }
+
+            let res = DeltaLayer::rewrite_summary(
+                layer_file_path,
+                rewrite_closure!(delta_layer::Summary),
+                &ctx,
+            )
+            .await;
+            match res {
+                Ok(()) => {
+                    println!("Successfully rewrote summary of delta layer {layer_file_path}");
+                    return Ok(());
+                }
+                Err(delta_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
+                Err(delta_layer::RewriteSummaryError::Other(e)) => {
+                    return Err(e);
+                }
+            }
+
+            anyhow::bail!("not an image or delta layer: {layer_file_path}");
        }
    }
-    Ok(())
 }
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -5,11 +5,13 @@
 //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.

 mod draw_timeline_dir;
+mod index_part;
 mod layer_map_analyzer;
 mod layers;

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
+use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use pageserver::{
    context::{DownloadBehavior, RequestContext},
@@ -38,6 +40,8 @@ struct CliOpts {
 #[derive(Subcommand)]
 enum Commands {
    Metadata(MetadataCmd),
+    #[command(subcommand)]
+    IndexPart(IndexPartCmd),
    PrintLayerFile(PrintLayerFileCmd),
    DrawTimeline {},
    AnalyzeLayerMap(AnalyzeLayerMapCmd),
@@ -83,6 +87,9 @@ async fn main() -> anyhow::Result<()> {
        Commands::Metadata(cmd) => {
            handle_metadata(&cmd)?;
        }
+        Commands::IndexPart(cmd) => {
+            index_part::main(&cmd).await?;
+        }
        Commands::DrawTimeline {} => {
            draw_timeline_dir::main()?;
        }
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -1,22 +1,21 @@
-use anyhow::{bail, Result};
-use utils::auth::{Claims, Scope};
+use utils::auth::{AuthError, Claims, Scope};
 use utils::id::TenantId;

-pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
+pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
    match (&claims.scope, tenant_id) {
-        (Scope::Tenant, None) => {
-            bail!("Attempt to access management api with tenant scope. Permission denied")
-        }
+        (Scope::Tenant, None) => Err(AuthError(
+            "Attempt to access management api with tenant scope. Permission denied".into(),
+        )),
        (Scope::Tenant, Some(tenant_id)) => {
            if claims.tenant_id.unwrap() != tenant_id {
-                bail!("Tenant id mismatch. Permission denied")
+                return Err(AuthError("Tenant id mismatch. Permission denied".into()));
            }
            Ok(())
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData, _) => {
-            bail!("SafekeeperData scope makes no sense for Pageserver")
-        }
+        (Scope::SafekeeperData, _) => Err(AuthError(
+            "SafekeeperData scope makes no sense for Pageserver".into(),
+        )),
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
-use pageserver::tenant::TenantSharedResources;
+use pageserver::tenant::{secondary, TenantSharedResources};
 use remote_storage::GenericRemoteStorage;
 use tokio::time::Instant;
 use tracing::*;
@@ -34,8 +34,11 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
-    signals::Signal, tcp_listener,
+    auth::{JwtAuth, SwappableJwtAuth},
+    logging, project_build_tag, project_git_version,
+    sentry_init::init_sentry,
+    signals::Signal,
+    tcp_listener,
 };

 project_git_version!(GIT_VERSION);
@@ -100,7 +103,11 @@ fn main() -> anyhow::Result<()> {
    } else {
        TracingErrorLayerEnablement::Disabled
    };
-    logging::init(conf.log_format, tracing_error_layer_enablement)?;
+    logging::init(
+        conf.log_format,
+        tracing_error_layer_enablement,
+        logging::Output::Stdout,
+    )?;

    // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
    // disarming this hook on pageserver, because we never tear down tracing.
@@ -321,13 +328,12 @@ fn start_pageserver(
    let http_auth;
    let pg_auth;
    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
-        // unwrap is ok because check is performed when creating config, so path is set and file exists
+        // unwrap is ok because check is performed when creating config, so path is set and exists
        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
-        info!(
-            "Loading public key for verifying JWT tokens from {:#?}",
-            key_path
-        );
-        let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
+        info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
+
+        let jwt_auth = JwtAuth::from_key_path(key_path)?;
+        let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));

        http_auth = match &conf.http_auth_type {
            AuthType::Trust => None,
@@ -396,21 +402,17 @@ fn start_pageserver(
    let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
    let (init_done_tx, init_done_rx) = utils::completion::channel();

-    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
-
    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();

    let order = pageserver::InitializationOrder {
        initial_tenant_load_remote: Some(init_done_tx),
        initial_tenant_load: Some(init_remote_done_tx),
-        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -420,9 +422,9 @@ fn start_pageserver(
        order,
        shutdown_pageserver.clone(),
    ))?;
+    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
-        let init_done_rx = init_done_rx;
        let shutdown_pageserver = shutdown_pageserver.clone();
        let drive_init = async move {
            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
@@ -457,7 +459,7 @@ fn start_pageserver(
            });

            let WaitForPhaseResult {
-                timeout_remaining: timeout,
+                timeout_remaining: _timeout,
                skipped: init_load_skipped,
            } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;

@@ -465,26 +467,6 @@ fn start_pageserver(

            scopeguard::ScopeGuard::into_inner(guard);

-            let guard = scopeguard::guard_on_success((), |_| {
-                tracing::info!("Cancelled before initial logical sizes completed")
-            });
-
-            let logical_sizes_done = std::pin::pin!(async {
-                init_logical_size_done_rx.wait().await;
-                startup_checkpoint(
-                    started_startup_at,
-                    "initial_logical_sizes",
-                    "Initial logical sizes completed",
-                );
-            });
-
-            let WaitForPhaseResult {
-                timeout_remaining: _,
-                skipped: logical_sizes_skipped,
-            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
-
-            scopeguard::ScopeGuard::into_inner(guard);
-
            // allow background jobs to start: we either completed prior stages, or they reached timeout
            // and were skipped.  It is important that we do not let them block background jobs indefinitely,
            // because things like consumption metrics for billing are blocked by this barrier.
@@ -507,9 +489,6 @@ fn start_pageserver(
            if let Some(f) = init_load_skipped {
                f.await;
            }
-            if let Some(f) = logical_sizes_skipped {
-                f.await;
-            }
            scopeguard::ScopeGuard::into_inner(guard);

            startup_checkpoint(started_startup_at, "complete", "Startup complete");
@@ -525,6 +504,17 @@ fn start_pageserver(
        }
    });

+    let secondary_controller = if let Some(remote_storage) = &remote_storage {
+        secondary::spawn_tasks(
+            tenant_manager.clone(),
+            remote_storage.clone(),
+            background_jobs_barrier.clone(),
+            shutdown_pageserver.clone(),
+        )
+    } else {
+        secondary::null_controller()
+    };
+
    // shared state between the disk-usage backed eviction background task and the http endpoint
    // that allows triggering disk-usage based eviction manually. note that the http endpoint
    // is still accessible even if background task is not configured as long as remote storage has
@@ -548,11 +538,13 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
+                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
                disk_usage_eviction_state,
                deletion_queue.new_client(),
+                secondary_controller,
            )
            .context("Failed to initialize router state")?,
        );
@@ -579,7 +571,6 @@ fn start_pageserver(
    }

    if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
-        let background_jobs_barrier = background_jobs_barrier;
        let metrics_ctx = RequestContext::todo_child(
            TaskKind::MetricsCollection,
            // This task itself shouldn't download anything.
@@ -617,6 +608,7 @@ fn start_pageserver(
                    conf.synthetic_size_calculation_interval,
                    conf.id,
                    local_disk_storage,
+                    cancel,
                    metrics_ctx,
                )
                .instrument(info_span!("metrics_collection"))
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -5,6 +5,7 @@
 //! See also `settings.md` for better description on every parameter.

 use anyhow::{anyhow, bail, ensure, Context, Result};
+use pageserver_api::shard::TenantShardId;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use serde::de::IntoDeserializer;
 use std::env;
@@ -25,7 +26,7 @@ use toml_edit::{Document, Item};
 use camino::{Utf8Path, Utf8PathBuf};
 use postgres_backend::AuthType;
 use utils::{
-    id::{NodeId, TenantId, TimelineId},
+    id::{NodeId, TimelineId},
    logging::LogFormat,
 };

@@ -69,6 +70,8 @@ pub mod defaults {
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

+    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
+
    ///
    /// Default built-in configuration file.
    ///
@@ -116,6 +119,8 @@ pub mod defaults {
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
 #gc_feedback = false

+#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
+
 [remote_storage]

 "#
@@ -161,7 +166,7 @@ pub struct PageServerConf {
    pub http_auth_type: AuthType,
    /// authentication method for libpq connections from compute
    pub pg_auth_type: AuthType,
-    /// Path to a file containing public key for verifying JWT tokens.
+    /// Path to a file or directory containing public key(s) for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,

@@ -214,6 +219,10 @@ pub struct PageServerConf {
    /// If true, pageserver will make best-effort to operate without a control plane: only
    /// for use in major incidents.
    pub control_plane_emergency_mode: bool,
+
+    /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
+    /// heatmap uploads vs. other remote storage operations.
+    pub heatmap_upload_concurrency: usize,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -292,6 +301,8 @@ struct PageServerConfigBuilder {
    control_plane_api: BuilderValue<Option<Url>>,
    control_plane_api_token: BuilderValue<Option<SecretString>>,
    control_plane_emergency_mode: BuilderValue<bool>,
+
+    heatmap_upload_concurrency: BuilderValue<usize>,
 }

 impl Default for PageServerConfigBuilder {
@@ -360,6 +371,8 @@ impl Default for PageServerConfigBuilder {
            control_plane_api: Set(None),
            control_plane_api_token: Set(None),
            control_plane_emergency_mode: Set(false),
+
+            heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
        }
    }
 }
@@ -500,6 +513,10 @@ impl PageServerConfigBuilder {
        self.control_plane_emergency_mode = BuilderValue::Set(enabled)
    }

+    pub fn heatmap_upload_concurrency(&mut self, value: usize) {
+        self.heatmap_upload_concurrency = BuilderValue::Set(value)
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -594,6 +611,10 @@ impl PageServerConfigBuilder {
            control_plane_emergency_mode: self
                .control_plane_emergency_mode
                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
+
+            heatmap_upload_concurrency: self
+                .heatmap_upload_concurrency
+                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
        })
    }
 }
@@ -628,12 +649,13 @@ impl PageServerConf {
        self.deletion_prefix().join(format!("header-{VERSION:02x}"))
    }

-    pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenants_path().join(tenant_id.to_string())
+    pub fn tenant_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenants_path().join(tenant_shard_id.to_string())
    }

-    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
+    pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
+            .join(IGNORED_TENANT_FILE_NAME)
    }

    /// Points to a place in pageserver's local directory,
@@ -641,47 +663,53 @@ impl PageServerConf {
    ///
    /// Legacy: superseded by tenant_location_config_path.  Eventually
    /// remove this function.
-    pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
+    pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME)
    }

-    pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id)
+    pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
            .join(TENANT_LOCATION_CONFIG_NAME)
    }

-    pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
+    pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
+            .join(TIMELINES_SEGMENT_NAME)
    }

-    pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
-        self.timelines_path(tenant_id).join(timeline_id.to_string())
+    pub fn timeline_path(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Utf8PathBuf {
+        self.timelines_path(tenant_shard_id)
+            .join(timeline_id.to_string())
    }

    pub fn timeline_uninit_mark_file_path(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
    ) -> Utf8PathBuf {
        path_with_suffix_extension(
-            self.timeline_path(&tenant_id, &timeline_id),
+            self.timeline_path(&tenant_shard_id, &timeline_id),
            TIMELINE_UNINIT_MARK_SUFFIX,
        )
    }

    pub fn timeline_delete_mark_file_path(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
    ) -> Utf8PathBuf {
        path_with_suffix_extension(
-            self.timeline_path(&tenant_id, &timeline_id),
+            self.timeline_path(&tenant_shard_id, &timeline_id),
            TIMELINE_DELETE_MARK_SUFFIX,
        )
    }

-    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id)
+    pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
            .join(TENANT_DELETED_MARKER_FILE_NAME)
    }

@@ -691,20 +719,24 @@ impl PageServerConf {

    pub fn trace_path(
        &self,
-        tenant_id: &TenantId,
+        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
        connection_id: &ConnectionId,
    ) -> Utf8PathBuf {
        self.traces_path()
-            .join(tenant_id.to_string())
+            .join(tenant_shard_id.to_string())
            .join(timeline_id.to_string())
            .join(connection_id.to_string())
    }

    /// Points to a place in pageserver's local directory,
    /// where certain timeline's metadata file should be located.
-    pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
-        self.timeline_path(tenant_id, timeline_id)
+    pub fn metadata_path(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Utf8PathBuf {
+        self.timeline_path(tenant_shard_id, timeline_id)
            .join(METADATA_FILE_NAME)
    }

@@ -767,7 +799,7 @@ impl PageServerConf {
                    builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
                }
                "tenant_config" => {
-                    t_conf = Self::parse_toml_tenant_conf(item)?;
+                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
                }
                "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
@@ -816,7 +848,9 @@ impl PageServerConf {
                },
                "control_plane_emergency_mode" => {
                    builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
-
+                },
+                "heatmap_upload_concurrency" => {
+                    builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
                },
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
@@ -841,114 +875,10 @@ impl PageServerConf {
        Ok(conf)
    }

-    // subroutine of parse_and_validate to parse `[tenant_conf]` section
-
-    pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result<TenantConfOpt> {
-        let mut t_conf: TenantConfOpt = Default::default();
-        if let Some(checkpoint_distance) = item.get("checkpoint_distance") {
-            t_conf.checkpoint_distance =
-                Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
-        }
-
-        if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") {
-            t_conf.checkpoint_timeout = Some(parse_toml_duration(
-                "checkpoint_timeout",
-                checkpoint_timeout,
-            )?);
-        }
-
-        if let Some(compaction_target_size) = item.get("compaction_target_size") {
-            t_conf.compaction_target_size = Some(parse_toml_u64(
-                "compaction_target_size",
-                compaction_target_size,
-            )?);
-        }
-
-        if let Some(compaction_period) = item.get("compaction_period") {
-            t_conf.compaction_period =
-                Some(parse_toml_duration("compaction_period", compaction_period)?);
-        }
-
-        if let Some(compaction_threshold) = item.get("compaction_threshold") {
-            t_conf.compaction_threshold =
-                Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
-        }
-
-        if let Some(image_creation_threshold) = item.get("image_creation_threshold") {
-            t_conf.image_creation_threshold = Some(
-                parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?,
-            );
-        }
-
-        if let Some(gc_horizon) = item.get("gc_horizon") {
-            t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
-        }
-
-        if let Some(gc_period) = item.get("gc_period") {
-            t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?);
-        }
-
-        if let Some(pitr_interval) = item.get("pitr_interval") {
-            t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?);
-        }
-        if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") {
-            t_conf.walreceiver_connect_timeout = Some(parse_toml_duration(
-                "walreceiver_connect_timeout",
-                walreceiver_connect_timeout,
-            )?);
-        }
-        if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") {
-            t_conf.lagging_wal_timeout = Some(parse_toml_duration(
-                "lagging_wal_timeout",
-                lagging_wal_timeout,
-            )?);
-        }
-        if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
-            t_conf.max_lsn_wal_lag =
-                Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
-        }
-        if let Some(trace_read_requests) = item.get("trace_read_requests") {
-            t_conf.trace_read_requests =
-                Some(trace_read_requests.as_bool().with_context(|| {
-                    "configure option trace_read_requests is not a bool".to_string()
-                })?);
-        }
-
-        if let Some(eviction_policy) = item.get("eviction_policy") {
-            t_conf.eviction_policy = Some(
-                deserialize_from_item("eviction_policy", eviction_policy)
-                    .context("parse eviction_policy")?,
-            );
-        }
-
-        if let Some(item) = item.get("min_resident_size_override") {
-            t_conf.min_resident_size_override = Some(
-                deserialize_from_item("min_resident_size_override", item)
-                    .context("parse min_resident_size_override")?,
-            );
-        }
-
-        if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") {
-            t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration(
-                "evictions_low_residence_duration_metric_threshold",
-                item,
-            )?);
-        }
-
-        if let Some(gc_feedback) = item.get("gc_feedback") {
-            t_conf.gc_feedback = Some(
-                gc_feedback
-                    .as_bool()
-                    .with_context(|| "configure option gc_feedback is not a bool".to_string())?,
-            );
-        }
-
-        Ok(t_conf)
-    }
-
    #[cfg(test)]
    pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
-        Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
+        let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
+        Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
    }

    pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
@@ -988,6 +918,7 @@ impl PageServerConf {
            control_plane_api: None,
            control_plane_api_token: None,
            control_plane_emergency_mode: false,
+            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
        }
    }
 }
@@ -1212,7 +1143,8 @@ background_task_maximum_delay = '334 s'
                )?,
                control_plane_api: None,
                control_plane_api_token: None,
-                control_plane_emergency_mode: false
+                control_plane_emergency_mode: false,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1269,7 +1201,8 @@ background_task_maximum_delay = '334 s'
                background_task_maximum_delay: Duration::from_secs(334),
                control_plane_api: None,
                control_plane_api_token: None,
-                control_plane_emergency_mode: false
+                control_plane_emergency_mode: false,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1314,12 +1247,6 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_syncs: NonZeroUsize::new(
-                        remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
-                    )
-                        .unwrap(),
-                    max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
-                        .unwrap(),
                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
@@ -1380,8 +1307,6 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_syncs,
-                    max_sync_errors,
                    storage: RemoteStorageKind::AwsS3(S3Config {
                        bucket_name: bucket_name.clone(),
                        bucket_region: bucket_region.clone(),
@@ -1425,6 +1350,37 @@ trace_read_requests = {trace_read_requests}"#,
        Ok(())
    }

+    #[test]
+    fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
+        let config_string = r#"
+            [tenant_config]
+            checkpoint_distance = -1 # supposed to be an u64
+        "#
+        .to_string();
+
+        let toml: Document = config_string.parse()?;
+        let item = toml.get("tenant_config").unwrap();
+        let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err();
+
+        let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64";
+        assert_eq!(error.to_string(), expected_error_str);
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_override_tenant_config() -> anyhow::Result<()> {
+        let config_string = r#"tenant_config={ min_resident_size_override =  400 }"#.to_string();
+
+        let toml: Document = config_string.parse()?;
+        let item = toml.get("tenant_config").unwrap();
+        let conf = TenantConfOpt::try_from(item.to_owned()).unwrap();
+
+        assert_eq!(conf.min_resident_size_override, Some(400));
+
+        Ok(())
+    }
+
    #[test]
    fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
        let tempdir = tempdir()?;
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,7 +3,7 @@
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr, LogicalSizeCalculationCause};
+use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
@@ -12,6 +12,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
 use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::NodeId;

@@ -37,6 +38,7 @@ type RawMetric = (MetricsKey, (EventType, u64));
 type Cache = HashMap<MetricsKey, (EventType, u64)>;

 /// Main thread that serves metrics collection
+#[allow(clippy::too_many_arguments)]
 pub async fn collect_metrics(
    metric_collection_endpoint: &Url,
    metric_collection_interval: Duration,
@@ -44,6 +46,7 @@ pub async fn collect_metrics(
    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
    local_disk_storage: Utf8PathBuf,
+    cancel: CancellationToken,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
    if _cached_metric_collection_interval != Duration::ZERO {
@@ -63,9 +66,13 @@ pub async fn collect_metrics(
        "synthetic size calculation",
        false,
        async move {
-            calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx)
-                .instrument(info_span!("synthetic_size_worker"))
-                .await?;
+            calculate_synthetic_size_worker(
+                synthetic_size_calculation_interval,
+                &cancel,
+                &worker_ctx,
+            )
+            .instrument(info_span!("synthetic_size_worker"))
+            .await?;
            Ok(())
        },
    );
@@ -241,6 +248,7 @@ async fn reschedule(
 /// Caclculate synthetic size for each active tenant
 async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
+    cancel: &CancellationToken,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    info!("starting calculate_synthetic_size_worker");
@@ -248,8 +256,6 @@ async fn calculate_synthetic_size_worker(
        info!("calculate_synthetic_size_worker stopped");
    };

-    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
-
    loop {
        let started_at = Instant::now();

@@ -261,21 +267,25 @@ async fn calculate_synthetic_size_worker(
            }
        };

-        for (tenant_id, tenant_state) in tenants {
+        for (tenant_shard_id, tenant_state) in tenants {
            if tenant_state != TenantState::Active {
                continue;
            }

-            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
-                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
-                // We can put in some prioritization for consumption metrics.
-                // Same for the loop that fetches computed metrics.
-                // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
-                // which turns out is really handy to understand the system.
-                if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
-                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
-                }
+            if !tenant_shard_id.is_zero() {
+                // We only send consumption metrics from shard 0, so don't waste time calculating
+                // synthetic size on other shards.
+                continue;
            }
+
+            let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
+                continue;
+            };
+
+            // there is never any reason to exit calculate_synthetic_size_worker following any
+            // return value -- we don't need to care about shutdown because no tenant is found when
+            // pageserver is shut down.
+            calculate_and_log(&tenant, cancel, ctx).await;
        }

        crate::tenant::tasks::warn_when_period_overrun(
@@ -286,7 +296,7 @@ async fn calculate_synthetic_size_worker(

        let res = tokio::time::timeout_at(
            started_at + synthetic_size_calculation_interval,
-            task_mgr::shutdown_token().cancelled(),
+            cancel.cancelled(),
        )
        .await;
        if res.is_ok() {
@@ -294,3 +304,31 @@ async fn calculate_synthetic_size_worker(
        }
    }
 }
+
+async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
+    const CAUSE: LogicalSizeCalculationCause =
+        LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
+
+    // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
+    // We can put in some prioritization for consumption metrics.
+    // Same for the loop that fetches computed metrics.
+    // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
+    // which turns out is really handy to understand the system.
+    let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
+        return;
+    };
+
+    // this error can be returned if timeline is shutting down, but it does not
+    // mean the synthetic size worker should terminate. we do not need any checks
+    // in this function because `mgr::get_tenant` will error out after shutdown has
+    // progressed to shutting down tenants.
+    let shutting_down = matches!(
+        e.downcast_ref::<PageReconstructError>(),
+        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
+    );
+
+    if !shutting_down {
+        let tenant_shard_id = tenant.tenant_shard_id();
+        error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
+    }
+}
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -1,5 +1,4 @@
-use crate::context::RequestContext;
-use anyhow::Context;
+use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
@@ -198,13 +197,12 @@ pub(super) async fn collect_all_metrics(
    };

    let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
-        if state != TenantState::Active {
+        if state != TenantState::Active || !id.is_zero() {
            None
        } else {
            crate::tenant::mgr::get_tenant(id, true)
-                .await
                .ok()
-                .map(|tenant| (id, tenant))
+                .map(|tenant| (id.tenant_id, tenant))
        }
    });

@@ -352,14 +350,17 @@ impl TimelineSnapshot {
            let last_record_lsn = t.get_last_record_lsn();

            let current_exact_logical_size = {
-                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
-                let res = span
-                    .in_scope(|| t.get_current_logical_size(ctx))
-                    .context("get_current_logical_size");
-                match res? {
+                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
+                let size = span.in_scope(|| {
+                    t.get_current_logical_size(
+                        crate::tenant::timeline::GetLogicalSizePriority::Background,
+                        ctx,
+                    )
+                });
+                match size {
                    // Only send timeline logical size when it is fully calculated.
-                    (size, is_exact) if is_exact => Some(size),
-                    (_, _) => None,
+                    CurrentLogicalSize::Exact(ref size) => Some(size.into()),
+                    CurrentLogicalSize::Approximate(_) => None,
                }
            };

--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -1,16 +1,15 @@
 use std::collections::HashMap;

-use pageserver_api::control_api::{
-    ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+use pageserver_api::{
+    control_api::{
+        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+    },
+    shard::TenantShardId,
 };
 use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
 use url::Url;
-use utils::{
-    backoff,
-    generation::Generation,
-    id::{NodeId, TenantId},
-};
+use utils::{backoff, generation::Generation, id::NodeId};

 use crate::config::PageServerConf;

@@ -31,11 +30,11 @@ pub enum RetryForeverError {

 #[async_trait::async_trait]
 pub trait ControlPlaneGenerationsApi {
-    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
+    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
    async fn validate(
        &self,
-        tenants: Vec<(TenantId, Generation)>,
-    ) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
+        tenants: Vec<(TenantShardId, Generation)>,
+    ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
 }

 impl ControlPlaneClient {
@@ -127,7 +126,7 @@ impl ControlPlaneClient {
 #[async_trait::async_trait]
 impl ControlPlaneGenerationsApi for ControlPlaneClient {
    /// Block until we get a successful response, or error out if we are shut down
-    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
        let re_attach_path = self
            .base_url
            .join("re-attach")
@@ -154,8 +153,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
    /// Block until we get a successful response, or error out if we are shut down
    async fn validate(
        &self,
-        tenants: Vec<(TenantId, Generation)>,
-    ) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+        tenants: Vec<(TenantShardId, Generation)>,
+    ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
        let re_attach_path = self
            .base_url
            .join("validate")
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -10,11 +10,12 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
+use crate::tenant::remote_timeline_client::LayerFileMetadata;
 use crate::virtual_file::MaybeFatalIo;
 use crate::virtual_file::VirtualFile;
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use hex::FromHex;
+use pageserver_api::shard::TenantShardId;
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
 use serde::Serialize;
@@ -25,7 +26,7 @@ use tracing::Instrument;
 use tracing::{self, debug, error};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::generation::Generation;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;
 use utils::lsn::AtomicLsn;
 use utils::lsn::Lsn;

@@ -159,11 +160,10 @@ pub struct DeletionQueueClient {
    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
 }

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
 struct TenantDeletionList {
    /// For each Timeline, a list of key fragments to append to the timeline remote path
    /// when reconstructing a full key
-    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
    timelines: HashMap<TimelineId, Vec<String>>,

    /// The generation in which this deletion was emitted: note that this may not be the
@@ -178,43 +178,11 @@ impl TenantDeletionList {
    }
 }

-/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string
-fn to_hex_map<S, V, I>(input: &HashMap<I, V>, serializer: S) -> Result<S::Ok, S::Error>
-where
-    S: serde::Serializer,
-    V: Serialize,
-    I: AsRef<[u8]>,
-{
-    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));
-
-    transformed
-        .collect::<HashMap<String, &V>>()
-        .serialize(serializer)
-}
-
-/// For HashMaps using a FromHex key, where we would like to decode the key
-fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result<HashMap<I, V>, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-    V: Deserialize<'de>,
-    I: FromHex + std::hash::Hash + Eq,
-{
-    let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
-    hex_map
-        .into_iter()
-        .map(|(k, v)| {
-            I::from_hex(k)
-                .map(|k| (k, v))
-                .map_err(|_| serde::de::Error::custom("Invalid hex ID"))
-        })
-        .collect()
-}
-
 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
 const TEMP_SUFFIX: &str = "tmp";

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
 struct DeletionList {
    /// Serialization version, for future use
    version: u8,
@@ -226,8 +194,7 @@ struct DeletionList {
    /// nested HashMaps by TenantTimelineID.  Each Tenant only appears once
    /// with one unique generation ID: if someone tries to push a second generation
    /// ID for the same tenant, we will start a new DeletionList.
-    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
-    tenants: HashMap<TenantId, TenantDeletionList>,
+    tenants: HashMap<TenantShardId, TenantDeletionList>,

    /// Avoid having to walk `tenants` to calculate the number of keys in
    /// the nested deletion lists
@@ -299,7 +266,7 @@ impl DeletionList {
    /// deletion list.
    fn push(
        &mut self,
-        tenant: &TenantId,
+        tenant: &TenantShardId,
        timeline: &TimelineId,
        generation: Generation,
        objects: &mut Vec<RemotePath>,
@@ -391,7 +358,7 @@ struct TenantLsnState {

 #[derive(Default)]
 struct VisibleLsnUpdates {
-    tenants: HashMap<TenantId, TenantLsnState>,
+    tenants: HashMap<TenantShardId, TenantLsnState>,
 }

 impl VisibleLsnUpdates {
@@ -448,7 +415,7 @@ impl DeletionQueueClient {

    pub(crate) fn recover(
        &self,
-        attached_tenants: HashMap<TenantId, Generation>,
+        attached_tenants: HashMap<TenantShardId, Generation>,
    ) -> Result<(), DeletionQueueError> {
        self.do_push(
            &self.tx,
@@ -465,7 +432,7 @@ impl DeletionQueueClient {
    /// backend will later wake up and notice that the tenant's generation requires validation.
    pub(crate) async fn update_remote_consistent_lsn(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
        lsn: Lsn,
@@ -476,10 +443,13 @@ impl DeletionQueueClient {
            .write()
            .expect("Lock should never be poisoned");

-        let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState {
-            timelines: HashMap::new(),
-            generation: current_generation,
-        });
+        let tenant_entry = locked
+            .tenants
+            .entry(tenant_shard_id)
+            .or_insert(TenantLsnState {
+                timelines: HashMap::new(),
+                generation: current_generation,
+            });

        if tenant_entry.generation != current_generation {
            // Generation might have changed if we were detached and then re-attached: in this case,
@@ -506,27 +476,29 @@ impl DeletionQueueClient {
    /// generations in `layers` are the generations in which those layers were written.
    pub(crate) async fn push_layers(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
-        layers: Vec<(LayerFileName, Generation)>,
+        layers: Vec<(LayerFileName, LayerFileMetadata)>,
    ) -> Result<(), DeletionQueueError> {
        if current_generation.is_none() {
            debug!("Enqueuing deletions in legacy mode, skipping queue");
+
            let mut layer_paths = Vec::new();
-            for (layer, generation) in layers {
+            for (layer, meta) in layers {
                layer_paths.push(remote_layer_path(
-                    &tenant_id,
+                    &tenant_shard_id.tenant_id,
                    &timeline_id,
+                    meta.shard,
                    &layer,
-                    generation,
+                    meta.generation,
                ));
            }
            self.push_immediate(layer_paths).await?;
            return self.flush_immediate().await;
        }

-        self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
+        self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
    }

    /// When a Tenant has a generation, push_layers is always synchronous because
@@ -536,10 +508,10 @@ impl DeletionQueueClient {
    /// support (`<https://github.com/neondatabase/neon/issues/5395>`)
    pub(crate) fn push_layers_sync(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
-        layers: Vec<(LayerFileName, Generation)>,
+        layers: Vec<(LayerFileName, LayerFileMetadata)>,
    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
@@ -547,7 +519,7 @@ impl DeletionQueueClient {
        self.do_push(
            &self.tx,
            ListWriterQueueMessage::Delete(DeletionOp {
-                tenant_id,
+                tenant_shard_id,
                timeline_id,
                layers,
                generation: current_generation,
@@ -750,6 +722,7 @@ impl DeletionQueue {
 mod test {
    use camino::Utf8Path;
    use hex_literal::hex;
+    use pageserver_api::shard::ShardIndex;
    use std::{io::ErrorKind, time::Duration};
    use tracing::info;

@@ -814,12 +787,12 @@ mod test {
        }

        fn set_latest_generation(&self, gen: Generation) {
-            let tenant_id = self.harness.tenant_id;
+            let tenant_shard_id = self.harness.tenant_shard_id;
            self.mock_control_plane
                .latest_generation
                .lock()
                .unwrap()
-                .insert(tenant_id, gen);
+                .insert(tenant_shard_id, gen);
        }

        /// Returns remote layer file name, suitable for use in assert_remote_files
@@ -828,8 +801,8 @@ mod test {
            file_name: LayerFileName,
            gen: Generation,
        ) -> anyhow::Result<String> {
-            let tenant_id = self.harness.tenant_id;
-            let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+            let tenant_shard_id = self.harness.tenant_shard_id;
+            let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
            let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
            std::fs::create_dir_all(&remote_timeline_path)?;
            let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix());
@@ -847,7 +820,7 @@ mod test {

    #[derive(Debug, Clone)]
    struct MockControlPlane {
-        pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantId, Generation>>>,
+        pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantShardId, Generation>>>,
    }

    impl MockControlPlane {
@@ -861,20 +834,20 @@ mod test {
    #[async_trait::async_trait]
    impl ControlPlaneGenerationsApi for MockControlPlane {
        #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
-        async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+        async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
            unimplemented!()
        }
        async fn validate(
            &self,
-            tenants: Vec<(TenantId, Generation)>,
-        ) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+            tenants: Vec<(TenantShardId, Generation)>,
+        ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
            let mut result = HashMap::new();

            let latest_generation = self.latest_generation.lock().unwrap();

-            for (tenant_id, generation) in tenants {
-                if let Some(latest) = latest_generation.get(&tenant_id) {
-                    result.insert(tenant_id, *latest == generation);
+            for (tenant_shard_id, generation) in tenants {
+                if let Some(latest) = latest_generation.get(&tenant_shard_id) {
+                    result.insert(tenant_shard_id, *latest == generation);
                }
            }

@@ -893,14 +866,6 @@ mod test {
        std::fs::create_dir_all(remote_fs_dir)?;
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
-            max_concurrent_syncs: std::num::NonZeroUsize::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-            )
-            .unwrap(),
-            max_sync_errors: std::num::NonZeroU32::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-            )
-            .unwrap(),
            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
@@ -986,10 +951,10 @@ mod test {
        client.recover(HashMap::new())?;

        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-        let tenant_id = ctx.harness.tenant_id;
+        let tenant_shard_id = ctx.harness.tenant_shard_id;

        let content: Vec<u8> = "victim1 contents".into();
-        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
        let deletion_prefix = ctx.harness.conf.deletion_prefix();

@@ -997,6 +962,8 @@ mod test {
        // we delete, and the generation of the running Tenant.
        let layer_generation = Generation::new(0xdeadbeef);
        let now_generation = Generation::new(0xfeedbeef);
+        let layer_metadata =
+            LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

        let remote_layer_file_name_1 =
            format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
@@ -1017,10 +984,10 @@ mod test {
        info!("Pushing");
        client
            .push_layers(
-                tenant_id,
+                tenant_shard_id,
                TIMELINE_ID,
                now_generation,
-                [(layer_file_name_1.clone(), layer_generation)].to_vec(),
+                [(layer_file_name_1.clone(), layer_metadata)].to_vec(),
            )
            .await?;
        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
@@ -1059,11 +1026,13 @@ mod test {
        let stale_generation = latest_generation.previous();
        // Generation that our example layer file was written with
        let layer_generation = stale_generation.previous();
+        let layer_metadata =
+            LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

        ctx.set_latest_generation(latest_generation);

-        let tenant_id = ctx.harness.tenant_id;
-        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let tenant_shard_id = ctx.harness.tenant_shard_id;
+        let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());

        // Initial state: a remote layer exists
@@ -1073,10 +1042,10 @@ mod test {
        tracing::debug!("Pushing...");
        client
            .push_layers(
-                tenant_id,
+                tenant_shard_id,
                TIMELINE_ID,
                stale_generation,
-                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
            )
            .await?;

@@ -1088,10 +1057,10 @@ mod test {
        tracing::debug!("Pushing...");
        client
            .push_layers(
-                tenant_id,
+                tenant_shard_id,
                TIMELINE_ID,
                latest_generation,
-                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
            )
            .await?;

@@ -1110,14 +1079,16 @@ mod test {
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

-        let tenant_id = ctx.harness.tenant_id;
+        let tenant_shard_id = ctx.harness.tenant_shard_id;

-        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
        let deletion_prefix = ctx.harness.conf.deletion_prefix();

        let layer_generation = Generation::new(0xdeadbeef);
        let now_generation = Generation::new(0xfeedbeef);
+        let layer_metadata =
+            LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

        // Inject a deletion in the generation before generation_now: after restart,
        // this deletion should _not_ get executed (only the immediately previous
@@ -1126,10 +1097,10 @@ mod test {
            ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
        client
            .push_layers(
-                tenant_id,
+                tenant_shard_id,
                TIMELINE_ID,
                now_generation.previous(),
-                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
            )
            .await?;

@@ -1140,10 +1111,10 @@ mod test {
            ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
        client
            .push_layers(
-                tenant_id,
+                tenant_shard_id,
                TIMELINE_ID,
                now_generation,
-                [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(),
+                [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
            )
            .await?;

@@ -1171,7 +1142,7 @@ mod test {
        drop(client);
        ctx.restart().await;
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::from([(tenant_id, now_generation)]))?;
+        client.recover(HashMap::from([(tenant_shard_id, now_generation)]))?;

        info!("Flush-executing");
        client.flush_execute().await?;
@@ -1233,12 +1204,13 @@ pub(crate) mod mock {
                match msg {
                    ListWriterQueueMessage::Delete(op) => {
                        let mut objects = op.objects;
-                        for (layer, generation) in op.layers {
+                        for (layer, meta) in op.layers {
                            objects.push(remote_layer_path(
-                                &op.tenant_id,
+                                &op.tenant_shard_id.tenant_id,
                                &op.timeline_id,
+                                meta.shard,
                                &layer,
-                                generation,
+                                meta.generation,
                            ));
                        }

@@ -1318,4 +1290,34 @@ pub(crate) mod mock {
            }
        }
    }
+
+    /// Test round-trip serialization/deserialization, and test stability of the format
+    /// vs. a static expected string for the serialized version.
+    #[test]
+    fn deletion_list_serialization() -> anyhow::Result<()> {
+        let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c"
+            .to_string()
+            .parse::<TenantShardId>()?;
+        let timeline_id = "be322c834ed9e709e63b5c9698691910"
+            .to_string()
+            .parse::<TimelineId>()?;
+        let generation = Generation::new(123);
+
+        let object =
+            RemotePath::from_string(&format!("tenants/{tenant_id}/timelines/{timeline_id}/foo"))?;
+        let mut objects = [object].to_vec();
+
+        let mut example = DeletionList::new(1);
+        example.push(&tenant_id, &timeline_id, generation, &mut objects);
+
+        let encoded = serde_json::to_string(&example)?;
+
+        let expected = "{\"version\":1,\"sequence\":1,\"tenants\":{\"ad6c1a56f5680419d3a16ff55d97ec3c\":{\"timelines\":{\"be322c834ed9e709e63b5c9698691910\":[\"foo\"]},\"generation\":123}},\"size\":1}".to_string();
+        assert_eq!(encoded, expected);
+
+        let decoded = serde_json::from_str::<DeletionList>(&encoded)?;
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
 }
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -55,21 +55,24 @@ impl Deleter {

    /// Wrap the remote `delete_objects` with a failpoint
    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
-        fail::fail_point!("deletion-queue-before-execute", |_| {
-            info!("Skipping execution, failpoint set");
-            metrics::DELETION_QUEUE
-                .remote_errors
-                .with_label_values(&["failpoint"])
-                .inc();
-            Err(anyhow::anyhow!("failpoint hit"))
-        });
-
        // A backoff::retry is used here for two reasons:
        // - To provide a backoff rather than busy-polling the API on errors
        // - To absorb transient 429/503 conditions without hitting our error
        //   logging path for issues deleting objects.
        backoff::retry(
-            || async { self.remote_storage.delete_objects(&self.accumulator).await },
+            || async {
+                fail::fail_point!("deletion-queue-before-execute", |_| {
+                    info!("Skipping execution, failpoint set");
+
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["failpoint"])
+                        .inc();
+                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
+                });
+
+                self.remote_storage.delete_objects(&self.accumulator).await
+            },
            |_| false,
            3,
            10,
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -19,6 +19,7 @@ use std::collections::HashMap;
 use std::fs::create_dir_all;
 use std::time::Duration;

+use pageserver_api::shard::TenantShardId;
 use regex::Regex;
 use remote_storage::RemotePath;
 use tokio_util::sync::CancellationToken;
@@ -26,13 +27,13 @@ use tracing::debug;
 use tracing::info;
 use tracing::warn;
 use utils::generation::Generation;
-use utils::id::TenantId;
 use utils::id::TimelineId;

 use crate::config::PageServerConf;
 use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::remote_timeline_client::LayerFileMetadata;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::virtual_file::on_fatal_io_error;
 use crate::virtual_file::MaybeFatalIo;
@@ -53,22 +54,22 @@ const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);

 #[derive(Debug)]
 pub(super) struct DeletionOp {
-    pub(super) tenant_id: TenantId,
+    pub(super) tenant_shard_id: TenantShardId,
    pub(super) timeline_id: TimelineId,
    // `layers` and `objects` are both just lists of objects.  `layers` is used if you do not
    // have a config object handy to project it to a remote key, and need the consuming worker
    // to do it for you.
-    pub(super) layers: Vec<(LayerFileName, Generation)>,
+    pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
    pub(super) objects: Vec<RemotePath>,

-    /// The _current_ generation of the Tenant attachment in which we are enqueuing
+    /// The _current_ generation of the Tenant shard attachment in which we are enqueuing
    /// this deletion.
    pub(super) generation: Generation,
 }

 #[derive(Debug)]
 pub(super) struct RecoverOp {
-    pub(super) attached_tenants: HashMap<TenantId, Generation>,
+    pub(super) attached_tenants: HashMap<TenantShardId, Generation>,
 }

 #[derive(Debug)]
@@ -205,7 +206,7 @@ impl ListWriter {

    async fn recover(
        &mut self,
-        attached_tenants: HashMap<TenantId, Generation>,
+        attached_tenants: HashMap<TenantShardId, Generation>,
    ) -> Result<(), anyhow::Error> {
        debug!(
            "recovering with {} attached tenants",
@@ -308,10 +309,21 @@ impl ListWriter {
                // generation was issued to another node in the interval while we restarted,
                // then we may treat deletion lists from the previous generation as if they
                // belong to our currently attached generation, and proceed to validate & execute.
-                for (tenant_id, tenant_list) in &mut deletion_list.tenants {
-                    if let Some(attached_gen) = attached_tenants.get(tenant_id) {
+                for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
+                    if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
                        if attached_gen.previous() == tenant_list.generation {
+                            info!(
+                                seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+                                shard_id=%tenant_shard_id.shard_slug(),
+                                old_gen=?tenant_list.generation, new_gen=?attached_gen,
+                                "Updating gen on recovered list");
                            tenant_list.generation = *attached_gen;
+                        } else {
+                            info!(
+                                seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+                                shard_id=%tenant_shard_id.shard_slug(),
+                                old_gen=?tenant_list.generation, new_gen=?attached_gen,
+                                "Encountered stale generation on recovered list");
                        }
                    }
                }
@@ -387,25 +399,26 @@ impl ListWriter {
                    );

                    let mut layer_paths = Vec::new();
-                    for (layer, generation) in op.layers {
+                    for (layer, meta) in op.layers {
                        layer_paths.push(remote_layer_path(
-                            &op.tenant_id,
+                            &op.tenant_shard_id.tenant_id,
                            &op.timeline_id,
+                            meta.shard,
                            &layer,
-                            generation,
+                            meta.generation,
                        ));
                    }
                    layer_paths.extend(op.objects);

                    if !self.pending.push(
-                        &op.tenant_id,
+                        &op.tenant_shard_id,
                        &op.timeline_id,
                        op.generation,
                        &mut layer_paths,
                    ) {
                        self.flush().await;
                        let retry_succeeded = self.pending.push(
-                            &op.tenant_id,
+                            &op.tenant_shard_id,
                            &op.timeline_id,
                            op.generation,
                            &mut layer_paths,
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -178,7 +178,14 @@ where
                .unwrap_or(false);

            if valid && *validated_generation == tenant_lsn_state.generation {
-                for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+                for (timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+                    tracing::debug!(
+                        %tenant_id,
+                        %timeline_id,
+                        current = %pending_lsn.result_slot.load(),
+                        projected = %pending_lsn.projected,
+                        "advancing validated remote_consistent_lsn",
+                    );
                    pending_lsn.result_slot.store(pending_lsn.projected);
                }
            } else {
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -42,7 +42,6 @@
 //   reading these fields. We use the Debug impl for semi-structured logging, though.

 use std::{
-    collections::HashMap,
    sync::Arc,
    time::{Duration, SystemTime},
 };
@@ -125,7 +124,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
-    _storage: &GenericRemoteStorage,
+    storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
 ) {
@@ -149,8 +148,14 @@ async fn disk_usage_eviction_task(
        let start = Instant::now();

        async {
-            let res =
-                disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
+            let res = disk_usage_eviction_task_iteration(
+                state,
+                task_config,
+                storage,
+                tenants_dir,
+                &cancel,
+            )
+            .await;

            match res {
                Ok(()) => {}
@@ -181,12 +186,13 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
+    storage: &GenericRemoteStorage,
    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -268,8 +274,9 @@ struct LayerCount {
    count: usize,
 }

-pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
+pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
+    _storage: &GenericRemoteStorage,
    usage_pre: U,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -310,7 +317,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                .unwrap()
                .as_micros(),
            partition,
-            desc.tenant_id,
+            desc.tenant_shard_id,
            desc.timeline_id,
            candidate.layer,
        );
@@ -321,16 +328,16 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // Walk through the list of candidates, until we have accumulated enough layers to get
    // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
    // how much disk space would be used after evicting all the layers up to the current
-    // point in the list. The layers are collected in 'batched', grouped per timeline.
+    // point in the list.
    //
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut batched: HashMap<_, Vec<_>> = HashMap::new();
    let mut warned = None;
    let mut usage_planned = usage_pre;
-    let mut max_batch_size = 0;
-    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
+    let mut evicted_amount = 0;
+
+    for (i, (partition, candidate)) in candidates.iter().enumerate() {
        if !usage_planned.has_pressure() {
            debug!(
                no_candidates_evicted = i,
@@ -339,25 +346,13 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            break;
        }

-        if partition == MinResidentSizePartition::Below && warned.is_none() {
+        if partition == &MinResidentSizePartition::Below && warned.is_none() {
            warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
            warned = Some(usage_planned);
        }

        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
-
-        // FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
-        // tasks to evict all seen layers until we have evicted enough
-
-        let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
-
-        // semaphore will later be used to limit eviction concurrency, and we can express at
-        // most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
-        // but fail gracefully by not making batches larger.
-        if batch.len() < u32::MAX as usize {
-            batch.push(candidate.layer);
-            max_batch_size = max_batch_size.max(batch.len());
-        }
+        evicted_amount += 1;
    }

    let usage_planned = match warned {
@@ -372,100 +367,79 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    };
    debug!(?usage_planned, "usage planned");

-    // phase2: evict victims batched by timeline
+    // phase2: evict layers

    let mut js = tokio::task::JoinSet::new();
+    let limit = 1000;

-    // ratelimit to 1k files or any higher max batch size
-    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+    let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
+    let mut consumed_all = false;

-    for (timeline, batch) in batched {
-        let tenant_id = timeline.tenant_id;
-        let timeline_id = timeline.timeline_id;
-        let batch_size =
-            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+    // After the evictions, `usage_assumed` is the post-eviction usage,
+    // according to internal accounting.
+    let mut usage_assumed = usage_pre;
+    let mut evictions_failed = LayerCount::default();

-        // I dislike naming of `available_permits` but it means current total amount of permits
-        // because permits can be added
-        assert!(batch_size as usize <= limit.available_permits());
+    let evict_layers = async move {
+        loop {
+            let next = if js.len() >= limit || consumed_all {
+                js.join_next().await
+            } else if !js.is_empty() {
+                // opportunistically consume ready result, one per each new evicted
+                futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
+            } else {
+                None
+            };

-        debug!(%timeline_id, "evicting batch for timeline");
-
-        let evict = {
-            let limit = limit.clone();
-            let cancel = cancel.clone();
-            async move {
-                let mut evicted_bytes = 0;
-                let mut evictions_failed = LayerCount::default();
-
-                let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
-                    // semaphore closing means cancelled
-                    return (evicted_bytes, evictions_failed);
-                };
-
-                let results = timeline.evict_layers(&batch, &cancel).await;
-
-                match results {
-                    Ok(results) => {
-                        assert_eq!(results.len(), batch.len());
-                        for (result, layer) in results.into_iter().zip(batch.iter()) {
-                            let file_size = layer.layer_desc().file_size;
-                            match result {
-                                Some(Ok(())) => {
-                                    evicted_bytes += file_size;
-                                }
-                                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-                                    evictions_failed.file_sizes += file_size;
-                                    evictions_failed.count += 1;
-                                }
-                                None => {
-                                    assert!(cancel.is_cancelled());
-                                }
-                            }
-                        }
+            if let Some(next) = next {
+                match next {
+                    Ok(Ok(file_size)) => {
+                        usage_assumed.add_available_bytes(file_size);
                    }
-                    Err(e) => {
-                        warn!("failed to evict batch: {:#}", e);
+                    Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
+                        evictions_failed.file_sizes += file_size;
+                        evictions_failed.count += 1;
                    }
+                    Err(je) if je.is_cancelled() => unreachable!("not used"),
+                    Err(je) if je.is_panic() => { /* already logged */ }
+                    Err(je) => tracing::error!("unknown JoinError: {je:?}"),
                }
-                (evicted_bytes, evictions_failed)
            }
-        }
-        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));

-        js.spawn(evict);
-
-        // spwaning multiple thousands of these is essentially blocking, so give already spawned a
-        // chance of making progress
-        tokio::task::yield_now().await;
-    }
-
-    let join_all = async move {
-        // After the evictions, `usage_assumed` is the post-eviction usage,
-        // according to internal accounting.
-        let mut usage_assumed = usage_pre;
-        let mut evictions_failed = LayerCount::default();
-
-        while let Some(res) = js.join_next().await {
-            match res {
-                Ok((evicted_bytes, failed)) => {
-                    usage_assumed.add_available_bytes(evicted_bytes);
-                    evictions_failed.file_sizes += failed.file_sizes;
-                    evictions_failed.count += failed.count;
-                }
-                Err(je) if je.is_cancelled() => unreachable!("not used"),
-                Err(je) if je.is_panic() => { /* already logged */ }
-                Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+            if consumed_all && js.is_empty() {
+                break;
            }
+
+            // calling again when consumed_all is fine as evicted is fused.
+            let Some((_partition, candidate)) = evicted.next() else {
+                consumed_all = true;
+                continue;
+            };
+
+            js.spawn(async move {
+                let rtc = candidate.timeline.remote_client.as_ref().expect(
+                    "holding the witness, all timelines must have a remote timeline client",
+                );
+                let file_size = candidate.layer.layer_desc().file_size;
+                candidate
+                    .layer
+                    .evict_and_wait(rtc)
+                    .await
+                    .map(|()| file_size)
+                    .map_err(|e| (file_size, e))
+            });
+
+            tokio::task::yield_now().await;
        }
+
        (usage_assumed, evictions_failed)
    };

    let (usage_assumed, evictions_failed) = tokio::select! {
-        tuple = join_all => { tuple },
+        tuple = evict_layers => { tuple },
        _ = cancel.cancelled() => {
-            // close the semaphore to stop any pending acquires
-            limit.close();
+            // dropping joinset will abort all pending evict_and_waits and that is fine, our
+            // requests will still stand
            return Ok(IterationOutcome::Cancelled);
        }
    };
@@ -545,7 +519,7 @@ async fn collect_eviction_candidates(
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
            Ok(tenant) => tenant,
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
@@ -554,6 +528,11 @@ async fn collect_eviction_candidates(
            }
        };

+        if tenant.cancel.is_cancelled() {
+            info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
+            continue;
+        }
+
        // collect layers from all timelines in this tenant
        //
        // If one of the timelines becomes `!is_active()` during the iteration,
@@ -567,7 +546,7 @@ async fn collect_eviction_candidates(
                continue;
            }
            let info = tl.get_local_layers_for_disk_usage_eviction().await;
-            debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
+            debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
            tenant_candidates.extend(
                info.resident_layers
                    .into_iter()
--- a/Show More
+++ b/Show More