Retroactively fix the nextXid on a known broken timeline (v3)

This one particular timeline in production hit the nextXid bug. Add a one-off hack that will fix the nextXid on that particular timeline.
Patch safekeeper control file on HTTP request (#6455 )
2026-07-13 09:00:37 +00:00 · 2024-01-30 11:01:33 +02:00 · 2024-01-29 18:20:57 +00:00 · 2024-01-29 17:38:03 +00:00 · 2024-01-29 17:59:26 +01:00 · 2024-01-29 16:16:37 +00:00
124 changed files with 5666 additions and 1857 deletions
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -69,7 +69,15 @@ jobs:
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

      - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
+        run: |
+          /kaniko/executor \
+            --reproducible \
+            --snapshotMode=redo \
+            --skip-unused-stages \
+            --dockerfile ${{ inputs.dockerfile-path }} \
+            --cache=true \
+            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
+            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64

  kaniko-arm:
    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -85,7 +93,15 @@ jobs:
        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json

      - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+        run: |
+          /kaniko/executor \
+            --reproducible \
+            --snapshotMode=redo \
+            --skip-unused-stages \
+            --dockerfile ${{ inputs.dockerfile-path }} \
+            --cache=true \
+            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
+            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64

  manifest:
    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -99,7 +115,10 @@ jobs:

    steps:
      - name: Create manifest
-        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+        run: |
+          docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
+                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
+                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64

      - name: Push manifest
        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -21,6 +21,8 @@ env:
  COPT: '-Werror'
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
+  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}

 jobs:
  check-permissions:
@@ -44,6 +46,20 @@ jobs:

        exit 1

+  cancel-previous-e2e-tests:
+    needs: [ check-permissions ]
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Cancel previous e2e-tests runs for this PR
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          gh workflow --repo neondatabase/cloud \
+            run cancel-previous-in-concurrency-group.yml \
+              --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}"
+
  tag:
    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
@@ -186,7 +202,11 @@ jobs:
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      options: --init
+      # Raise locked memory limit for tokio-epoll-uring.
+      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
+      # io_uring will account the memory of the CQ and SQ as locked.
+      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    strategy:
      fail-fast: false
      matrix:
@@ -340,8 +360,12 @@ jobs:
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

      - name: Run rust tests
+        env:
+          NEXTEST_RETRIES: 3
        run: |
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          for io_engine in std-fs tokio-epoll-uring ; do
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          done

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
@@ -419,8 +443,8 @@ jobs:
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      # Default shared memory is 64mb
-      options: --init --shm-size=512mb
+      # for changed limits, see comments on `options:` earlier in this file
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    strategy:
      fail-fast: false
      matrix:
@@ -448,6 +472,7 @@ jobs:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs

      - name: Merge and upload coverage data
        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
@@ -458,12 +483,13 @@ jobs:
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      # Default shared memory is 64mb
-      options: --init --shm-size=512mb
+      # for changed limits, see comments on `options:` earlier in this file
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
    strategy:
      fail-fast: false
      matrix:
+        # the amount of groups (N) should be reflected in `extra_params: --splits N ...`
        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:
@@ -477,11 +503,12 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
+          extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -695,7 +722,8 @@ jobs:
                \"commit_hash\": \"$COMMIT_SHA\",
                \"remote_repo\": \"${{ github.repository }}\",
                \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
-                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
              }
            }"

--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -124,12 +124,12 @@ jobs:
      # Hence keeping target/ (and general cache size) smaller
      BUILD_TYPE: release
      CARGO_FEATURES: --features testing
-      CARGO_FLAGS: --locked --release
+      CARGO_FLAGS: --release
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}

    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
@@ -210,18 +210,20 @@ jobs:

      - name: Run cargo build
        run: |
-          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests

      - name: Run cargo test
+        env:
+          NEXTEST_RETRIES: 3
        run: |
-          cargo test $CARGO_FLAGS $CARGO_FEATURES
+          cargo nextest run $CARGO_FEATURES

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+          cargo nextest run --package remote_storage --test test_real_s3

          # Run separate tests for real Azure Blob Storage
          # XXX: replace region with `eu-central-1`-like region
@@ -231,7 +233,7 @@ jobs:
          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+          cargo nextest run --package remote_storage --test test_real_azure

  check-codestyle-rust-arm:
    timeout-minutes: 90
--- a/.github/workflows/update_build_tools_image.yml
+++ b/.github/workflows/update_build_tools_image.yml
@@ -20,111 +20,51 @@ defaults:
  run:
    shell: bash -euo pipefail {0}

-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
 permissions: {}

 jobs:
  tag-image:
    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye

    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-    outputs:
-      next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
-      prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Get source image digest
-        id: next-digest
-        run: |
-          NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
-            exit 1
-          fi
-
-          echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
-          echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
-
-      - name: Get destination image digest (if already exists)
-        id: prev-digest
-        run: |
-          PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
-          if [ -z "${PREV_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
-          else
-            echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
-
-            echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Tag image
-        run: |
-          crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
-
-  rollback-tag-image:
-    needs:  tag-image
-    if: ${{ !success() }}
-
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
      FROM_TAG: ${{ inputs.from-tag }}
      TO_TAG: ${{ inputs.to-tag }}

    steps:
-      - name: Install Crane & ECR helper
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV

-      - name: Configure ECR login
+      - uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/login-action@v2
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install crane
        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+          go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0

-      - name: Restore previous tag if needed
+      - name: Copy images
        run: |
-          NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
-          PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
+          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
+          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"

-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
-            exit 0
-          fi
-
-          if [ -z "${PREV_DIGEST}" ]; then
-            # I guess we should delete the tag here/untag the image, but crane does not support it
-            # - https://github.com/google/go-containerregistry/issues/999
-
-            echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
-
-            exit 0
-          fi
-
-          CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
-          if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
-            crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
-
-            echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
-          else
-            echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
-          fi
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -278,6 +278,7 @@ dependencies = [
 "camino",
 "clap",
 "control_plane",
+ "diesel",
 "futures",
 "git-version",
 "hyper",
@@ -286,7 +287,6 @@ dependencies = [
 "pageserver_client",
 "postgres_backend",
 "postgres_connection",
- "scopeguard",
 "serde",
 "serde_json",
 "thiserror",
@@ -1328,6 +1328,8 @@ dependencies = [
 "clap",
 "comfy-table",
 "compute_api",
+ "diesel",
+ "diesel_migrations",
 "futures",
 "git-version",
 "hex",
@@ -1342,6 +1344,7 @@ dependencies = [
 "regex",
 "reqwest",
 "safekeeper_api",
+ "scopeguard",
 "serde",
 "serde_json",
 "serde_with",
@@ -1637,6 +1640,52 @@ dependencies = [
 "rusticata-macros",
 ]

+[[package]]
+name = "diesel"
+version = "2.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8"
+dependencies = [
+ "bitflags 2.4.1",
+ "byteorder",
+ "diesel_derives",
+ "itoa",
+ "pq-sys",
+ "serde_json",
+]
+
+[[package]]
+name = "diesel_derives"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44"
+dependencies = [
+ "diesel_table_macro_syntax",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.32",
+]
+
+[[package]]
+name = "diesel_migrations"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac"
+dependencies = [
+ "diesel",
+ "migrations_internals",
+ "migrations_macros",
+]
+
+[[package]]
+name = "diesel_table_macro_syntax"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
+dependencies = [
+ "syn 2.0.32",
+]
+
 [[package]]
 name = "digest"
 version = "0.10.7"
@@ -2563,6 +2612,16 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

+[[package]]
+name = "io-uring"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460648e47a07a43110fbfa2e0b14afb2be920093c31e5dccc50e49568e099762"
+dependencies = [
+ "bitflags 1.3.2",
+ "libc",
+]
+
 [[package]]
 name = "ipnet"
 version = "2.9.0"
@@ -2677,6 +2736,12 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "libm"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
+
 [[package]]
 name = "linux-raw-sys"
 version = "0.1.4"
@@ -2773,9 +2838,33 @@ dependencies = [
 "libc",
 "once_cell",
 "prometheus",
+ "rand 0.8.5",
+ "rand_distr",
+ "twox-hash",
 "workspace_hack",
 ]

+[[package]]
+name = "migrations_internals"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada"
+dependencies = [
+ "serde",
+ "toml",
+]
+
+[[package]]
+name = "migrations_macros"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08"
+dependencies = [
+ "migrations_internals",
+ "proc-macro2",
+ "quote",
+]
+
 [[package]]
 name = "mime"
 version = "0.3.17"
@@ -2977,6 +3066,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
 dependencies = [
 "autocfg",
+ "libm",
 ]

 [[package]]
@@ -3361,6 +3451,7 @@ dependencies = [
 "tenant_size_model",
 "thiserror",
 "tokio",
+ "tokio-epoll-uring",
 "tokio-io-timeout",
 "tokio-postgres",
 "tokio-stream",
@@ -3783,6 +3874,15 @@ version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"

+[[package]]
+name = "pq-sys"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd"
+dependencies = [
+ "vcpkg",
+]
+
 [[package]]
 name = "pq_proto"
 version = "0.1.0"
@@ -4081,6 +4181,16 @@ dependencies = [
 "getrandom 0.2.11",
 ]

+[[package]]
+name = "rand_distr"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
+dependencies = [
+ "num-traits",
+ "rand 0.8.5",
+]
+
 [[package]]
 name = "rand_hc"
 version = "0.2.0"
@@ -5113,9 +5223,9 @@ checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"

 [[package]]
 name = "smol_str"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c"
+checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49"
 dependencies = [
 "serde",
 ]
@@ -5382,18 +5492,18 @@ dependencies = [

 [[package]]
 name = "thiserror"
-version = "1.0.40"
+version = "1.0.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
+checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f"
 dependencies = [
 "thiserror-impl",
 ]

 [[package]]
 name = "thiserror-impl"
-version = "1.0.40"
+version = "1.0.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
+checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -5517,6 +5627,21 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

+[[package]]
+name = "tokio-epoll-uring"
+version = "0.1.0"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
+dependencies = [
+ "futures",
+ "once_cell",
+ "scopeguard",
+ "thiserror",
+ "tokio",
+ "tokio-util",
+ "tracing",
+ "uring-common",
+]
+
 [[package]]
 name = "tokio-io-timeout"
 version = "1.2.0"
@@ -6026,6 +6151,15 @@ dependencies = [
 "webpki-roots 0.23.1",
 ]

+[[package]]
+name = "uring-common"
+version = "0.1.0"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
+dependencies = [
+ "io-uring",
+ "libc",
+]
+
 [[package]]
 name = "url"
 version = "2.3.1"
@@ -6587,6 +6721,7 @@ dependencies = [
 "clap",
 "clap_builder",
 "crossbeam-utils",
+ "diesel",
 "either",
 "fail",
 "futures-channel",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -151,6 +151,7 @@ test-context = "0.1"
 thiserror = "1.0"
 tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
+tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.10.0"
 tokio-rustls = "0.24"
@@ -164,6 +165,7 @@ tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.20.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
+twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
--- a/2
+++ b/2
@@ -53,6 +53,7 @@ RUN set -e \
      --bin pagectl  \
      --bin safekeeper  \
      --bin storage_broker  \
+      --bin attachment_service  \
      --bin proxy  \
      --bin neon_local \
      --locked --release \
@@ -80,6 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service  /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin

--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -52,7 +52,7 @@ RUN cd postgres && \
    # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
    # In vanilla postgres this function is limited to Postgres role superuser.
    # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
-    # We could add the additional grant statements to the postgres repository but it would be hard to maintain, 
+    # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
    # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
    # so we do it here.
    old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
@@ -63,14 +63,14 @@ RUN cd postgres && \
            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
        fi; \
    done; \
-    # the second loop is for pg_stat_statement extension versions >= 1.7, 
+    # the second loop is for pg_stat_statement extension versions >= 1.7,
    # where pg_stat_statement_reset() got 3 additional arguments
    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
        filename=$(basename "$file"); \
        if ! echo "$old_list" | grep -q -F "$filename"; then \
            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
        fi; \
-    done      
+    done

 #########################################################################################
 #
@@ -144,30 +144,23 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-ARG PG_VERSION
 RUN apt update && \
    apt install -y ninja-build python3-dev libncurses5 binutils clang

-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        export PLV8_VERSION=3.1.5 \
-        export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
-        ;; \
-      "v16") \
-        export PLV8_VERSION=3.1.8 \
-        export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
-        ;; \
-      *) \
-        echo "Export the valid PG_VERSION variable" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
-    echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
+    echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
+    # generate and copy upgrade scripts
+    mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
+    cp upgrade/* /usr/local/pgsql/share/extension/ && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
    rm -rf /plv8-* && \
    find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
+    # don't break computes with installed old version of plv8
+    cd /usr/local/pgsql/lib/ && \
+    ln -s plv8-3.1.10.so plv8-3.1.5.so && \
+    ln -s plv8-3.1.10.so plv8-3.1.8.so && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
@@ -527,8 +520,7 @@ RUN apt-get update && \
        libboost-regex1.74-dev \
        libboost-serialization1.74-dev \
        libboost-system1.74-dev \
-        libeigen3-dev \
-        libfreetype6-dev
+        libeigen3-dev

 ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
@@ -553,6 +545,8 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
        -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
        -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
        -D RDK_INSTALL_INTREE=OFF \
+        -D RDK_INSTALL_COMIC_FONTS=OFF \
+        -D RDK_BUILD_FREETYPE_SUPPORT=OFF \
        -D CMAKE_BUILD_TYPE=Release \
        . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -907,7 +901,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
-# libboost*, libfreetype6, and zlib1g for rdkit
+# libboost* for rdkit
 # ca-certificates for communicating with s3 by compute_ctl
 RUN apt update &&  \
    apt install --no-install-recommends -y \
@@ -920,7 +914,6 @@ RUN apt update &&  \
        libboost-serialization1.74.0 \
        libboost-system1.74.0 \
        libossp-uuid16 \
-        libfreetype6 \
        libgeos-c1v5 \
        libgdal28 \
        libproj19 \
@@ -932,7 +925,6 @@ RUN apt update &&  \
        libcurl4-openssl-dev \
        locales \
        procps \
-        zlib1g \
        ca-certificates && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -10,6 +10,8 @@ async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+diesel = { version = "2.1.4", features = ["postgres"]}
+diesel_migrations = { version = "2.1.0", features = ["postgres"]}
 futures.workspace = true
 git-version.workspace = true
 nix.workspace = true
@@ -19,6 +21,7 @@ hex.workspace = true
 hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
+scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 serde_with.workspace = true
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -14,7 +14,6 @@ hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
-scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
@@ -26,6 +25,8 @@ tracing.workspace = true
 # a parsing function when loading pageservers from neon_local LocalEnv
 postgres_backend.workspace = true

+diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
+
 utils = { path = "../../libs/utils/" }
 metrics = { path = "../../libs/metrics/" }
 control_plane = { path = ".." }
--- a/control_plane/attachment_service/migrations/.keep
+++ b/control_plane/attachment_service/migrations/.keep
--- a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
@@ -0,0 +1,6 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
+DROP FUNCTION IF EXISTS diesel_set_updated_at();
--- a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
@@ -0,0 +1,36 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+
+
+
+-- Sets up a trigger for the given table to automatically set a column called
+-- `updated_at` whenever the row is modified (unless `updated_at` was included
+-- in the modified columns)
+--
+-- # Example
+--
+-- ```sql
+-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
+--
+-- SELECT diesel_manage_updated_at('users');
+-- ```
+CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
+BEGIN
+    EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
+                    FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
+BEGIN
+    IF (
+        NEW IS DISTINCT FROM OLD AND
+        NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
+    ) THEN
+        NEW.updated_at := current_timestamp;
+    END IF;
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
@@ -0,0 +1 @@
+DROP TABLE tenant_shards;
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
@@ -0,0 +1,12 @@
+CREATE TABLE tenant_shards (
+  tenant_id VARCHAR NOT NULL,
+  shard_number INTEGER NOT NULL,
+  shard_count INTEGER NOT NULL,
+  PRIMARY KEY(tenant_id, shard_number, shard_count),
+  shard_stripe_size INTEGER NOT NULL,
+  generation INTEGER NOT NULL,
+  generation_pageserver BIGINT NOT NULL,
+  placement_policy VARCHAR NOT NULL,
+  -- config is JSON encoded, opaque to the database.
+  config TEXT NOT NULL
+);
--- a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
@@ -0,0 +1 @@
+DROP TABLE nodes;
--- a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
@@ -0,0 +1,10 @@
+CREATE TABLE nodes (
+  node_id BIGINT PRIMARY KEY NOT NULL,
+
+  scheduling_policy VARCHAR NOT NULL,
+
+  listen_http_addr VARCHAR NOT NULL,
+  listen_http_port INTEGER NOT NULL,
+  listen_pg_addr VARCHAR NOT NULL,
+  listen_pg_port INTEGER NOT NULL
+);
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,5 +1,5 @@
 use crate::reconciler::ReconcileError;
-use crate::service::Service;
+use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
@@ -104,34 +104,34 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
    json_response(StatusCode::OK, state.service.inspect(inspect_req))
 }

-async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_create(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
    let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
-    let state = get_state(&req);
-    json_response(
-        StatusCode::OK,
-        state.service.tenant_create(create_req).await?,
-    )
+    json_response(StatusCode::OK, service.tenant_create(create_req).await?)
 }

-async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_timeline_create(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
-
-    let state = get_state(&req);
    json_response(
        StatusCode::OK,
-        state
-            .service
+        service
            .tenant_timeline_create(tenant_id, create_req)
            .await?,
    )
 }

-async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_locate(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let state = get_state(&req);
-
-    json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
+    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }

 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -154,14 +154,15 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, state.service.node_configure(config_req)?)
 }

-async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_shard_migrate(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
    let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
-    let state = get_state(&req);
    json_response(
        StatusCode::OK,
-        state
-            .service
+        service
            .tenant_shard_migrate(tenant_shard_id, migrate_req)
            .await?,
    )
@@ -178,6 +179,35 @@ impl From<ReconcileError> for ApiError {
    }
 }

+/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
+/// be allowed to run if Service has finished its initial reconciliation.
+async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
+where
+    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
+{
+    let state = get_state(&request);
+    let service = state.service.clone();
+
+    let startup_complete = service.startup_complete.clone();
+    if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
+        .await
+        .is_err()
+    {
+        // This shouldn't happen: it is the responsibilty of [`Service::startup_reconcile`] to use appropriate
+        // timeouts around its remote calls, to bound its runtime.
+        return Err(ApiError::Timeout(
+            "Timed out waiting for service readiness".into(),
+        ));
+    }
+
+    request_span(
+        request,
+        |request| async move { handler(service, request).await },
+    )
+    .await
+}
+
 pub fn make_router(
    service: Arc<Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
@@ -205,14 +235,20 @@ pub fn make_router(
        .put("/node/:node_id/config", |r| {
            request_span(r, handle_node_configure)
        })
-        .post("/tenant", |r| request_span(r, handle_tenant_create))
-        .post("/tenant/:tenant_id/timeline", |r| {
-            request_span(r, handle_tenant_timeline_create)
+        .post("/v1/tenant", |r| {
+            tenant_service_handler(r, handle_tenant_create)
+        })
+        .post("/v1/tenant/:tenant_id/timeline", |r| {
+            tenant_service_handler(r, handle_tenant_timeline_create)
        })
        .get("/tenant/:tenant_id/locate", |r| {
-            request_span(r, handle_tenant_locate)
+            tenant_service_handler(r, handle_tenant_locate)
        })
        .put("/tenant/:tenant_shard_id/migrate", |r| {
-            request_span(r, handle_tenant_shard_migrate)
+            tenant_service_handler(r, handle_tenant_shard_migrate)
        })
+        // Path aliases for tests_forward_compatibility
+        // TODO: remove these in future PR
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
 }
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -7,6 +7,7 @@ mod node;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
+mod schema;
 pub mod service;
 mod tenant_state;

--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -12,9 +12,9 @@ use camino::Utf8PathBuf;
 use clap::Parser;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
+use tokio::signal::unix::SignalKind;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};
-use utils::signals::{ShutdownSignals, Signal};

 use utils::{project_build_tag, project_git_version, tcp_listener};

@@ -40,6 +40,10 @@ struct Cli {
    /// Path to the .json file to store state (will be created if it doesn't exist)
    #[arg(short, long)]
    path: Utf8PathBuf,
+
+    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
+    #[arg(long)]
+    database_url: String,
 }

 #[tokio::main]
@@ -66,9 +70,14 @@ async fn main() -> anyhow::Result<()> {
        jwt_token: args.jwt_token,
    };

-    let persistence = Arc::new(Persistence::spawn(&args.path).await);
+    let json_path = if args.path.as_os_str().is_empty() {
+        None
+    } else {
+        Some(args.path)
+    };
+    let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));

-    let service = Service::spawn(config, persistence).await?;
+    let service = Service::spawn(config, persistence.clone()).await?;

    let http_listener = tcp_listener::bind(args.listen)?;

@@ -81,20 +90,31 @@ async fn main() -> anyhow::Result<()> {
    let router = make_router(service, auth)
        .build()
        .map_err(|err| anyhow!(err))?;
-    let service = utils::http::RouterService::new(router).unwrap();
-    let server = hyper::Server::from_tcp(http_listener)?.serve(service);
+    let router_service = utils::http::RouterService::new(router).unwrap();
+    let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);

    tracing::info!("Serving on {0}", args.listen);

    tokio::task::spawn(server);

-    ShutdownSignals::handle(|signal| match signal {
-        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
-            tracing::info!("Got {}. Terminating", signal.name());
-            // We're just a test helper: no graceful shutdown.
-            std::process::exit(0);
-        }
-    })?;
+    // Wait until we receive a signal
+    let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
+    let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
+    let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?;
+    tokio::select! {
+        _ = sigint.recv() => {},
+        _ = sigterm.recv() => {},
+        _ = sigquit.recv() => {},
+    }
+    tracing::info!("Terminating on signal");

-    Ok(())
+    if json_path.is_some() {
+        // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
+        // full postgres dumps around.
+        if let Err(e) = persistence.write_tenants_json().await {
+            tracing::error!("Failed to write JSON on shutdown: {e}")
+        }
+    }
+
+    std::process::exit(0);
 }
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,6 +1,8 @@
 use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
 use utils::id::NodeId;

+use crate::persistence::NodePersistence;
+
 #[derive(Clone)]
 pub(crate) struct Node {
    pub(crate) id: NodeId,
@@ -34,4 +36,15 @@ impl Node {
            NodeSchedulingPolicy::Pause => false,
        }
    }
+
+    pub(crate) fn to_persistent(&self) -> NodePersistence {
+        NodePersistence {
+            node_id: self.id.0 as i64,
+            scheduling_policy: self.scheduling.into(),
+            listen_http_addr: self.listen_http_addr.clone(),
+            listen_http_port: self.listen_http_port as i32,
+            listen_pg_addr: self.listen_pg_addr.clone(),
+            listen_pg_port: self.listen_pg_port as i32,
+        }
+    }
 }
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -1,182 +1,161 @@
-use std::{collections::HashMap, str::FromStr};
+use std::collections::HashMap;
+use std::str::FromStr;

-use camino::{Utf8Path, Utf8PathBuf};
-use control_plane::{
-    attachment_service::{NodeAvailability, NodeSchedulingPolicy},
-    local_env::LocalEnv,
-};
-use pageserver_api::{
-    models::TenantConfig,
-    shard::{ShardCount, ShardNumber, TenantShardId},
-};
+use camino::Utf8Path;
+use camino::Utf8PathBuf;
+use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+use diesel::pg::PgConnection;
+use diesel::prelude::*;
+use diesel::Connection;
+use pageserver_api::models::TenantConfig;
+use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
-use tracing::info;
-use utils::{
-    generation::Generation,
-    id::{NodeId, TenantId},
-};
+use utils::generation::Generation;
+use utils::id::{NodeId, TenantId};

-use crate::{node::Node, PlacementPolicy};
+use crate::node::Node;
+use crate::PlacementPolicy;

-/// Placeholder for storage.  This will be replaced with a database client.
+/// ## What do we store?
+///
+/// The attachment service does not store most of its state durably.
+///
+/// The essential things to store durably are:
+/// - generation numbers, as these must always advance monotonically to ensure data safety.
+/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
+/// - Node's scheduling policies, as the source of truth for these is something external.
+///
+/// Other things we store durably as an implementation detail:
+/// - Node's host/port: this could be avoided it we made nodes emit a self-registering heartbeat,
+///   but it is operationally simpler to make this service the authority for which nodes
+///   it talks to.
+///
+/// ## Performance/efficiency
+///
+/// The attachment service does not go via the database for most things: there are
+/// a couple of places where we must, and where efficiency matters:
+/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
+///   before it can attach a tenant, so this acts as a bound on how fast things like
+///   failover can happen.
+/// - Pageserver re-attach: we will increment many shards' generations when this happens,
+///   so it is important to avoid e.g. issuing O(N) queries.
+///
+/// Database calls relating to nodes have low performance requirements, as they are very rarely
+/// updated, and reads of nodes are always from memory, not the database.  We only require that
+/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
 pub struct Persistence {
-    inner: std::sync::Mutex<Inner>,
-}
-
-struct Inner {
-    state: PersistentState,
-    write_queue_tx: tokio::sync::mpsc::UnboundedSender<PendingWrite>,
+    database_url: String,
+
+    // In test environments, we support loading+saving a JSON file.  This is temporary, for the benefit of
+    // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
+    // compatible just yet.
+    json_path: Option<Utf8PathBuf>,
 }

+/// Legacy format, for use in JSON compat objects in test environment
 #[derive(Serialize, Deserialize)]
-struct PersistentState {
+struct JsonPersistence {
    tenants: HashMap<TenantShardId, TenantShardPersistence>,
 }

-struct PendingWrite {
-    bytes: Vec<u8>,
-    done_tx: tokio::sync::oneshot::Sender<()>,
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum DatabaseError {
+    #[error(transparent)]
+    Query(#[from] diesel::result::Error),
+    #[error(transparent)]
+    Connection(#[from] diesel::result::ConnectionError),
+    #[error("Logical error: {0}")]
+    Logical(String),
 }

-impl PersistentState {
-    async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
-        let bytes = tokio::fs::read(path).await?;
-        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
-
-        for (tenant_id, tenant) in &mut decoded.tenants {
-            // Backward compat: an old attachments.json from before PR #6251, replace
-            // empty strings with proper defaults.
-            if tenant.tenant_id.is_empty() {
-                tenant.tenant_id = format!("{}", tenant_id);
-                tenant.config = serde_json::to_string(&TenantConfig::default())?;
-                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
-            }
-        }
-
-        Ok(decoded)
-    }
-
-    async fn load_or_new(path: &Utf8Path) -> Self {
-        match Self::load(path).await {
-            Ok(s) => {
-                tracing::info!("Loaded state file at {}", path);
-                s
-            }
-            Err(e)
-                if e.downcast_ref::<std::io::Error>()
-                    .map(|e| e.kind() == std::io::ErrorKind::NotFound)
-                    .unwrap_or(false) =>
-            {
-                tracing::info!("Will create state file at {}", path);
-                Self {
-                    tenants: HashMap::new(),
-                }
-            }
-            Err(e) => {
-                panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
-            }
-        }
-    }
-}
+pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;

 impl Persistence {
-    pub async fn spawn(path: &Utf8Path) -> Self {
-        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
-        let state = PersistentState::load_or_new(path).await;
-        tokio::spawn(Self::writer_task(rx, path.to_owned()));
+    pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
        Self {
-            inner: std::sync::Mutex::new(Inner {
-                state,
-                write_queue_tx: tx,
-            }),
+            database_url,
+            json_path,
        }
    }

-    async fn writer_task(
-        mut rx: tokio::sync::mpsc::UnboundedReceiver<PendingWrite>,
-        path: Utf8PathBuf,
-    ) {
-        scopeguard::defer! {
-            info!("persistence writer task exiting");
-        };
-        loop {
-            match rx.recv().await {
-                Some(write) => {
-                    tokio::task::spawn_blocking({
-                        let path = path.clone();
-                        move || {
-                            let tmp_path =
-                                utils::crashsafe::path_with_suffix_extension(&path, "___new");
-                            utils::crashsafe::overwrite(&path, &tmp_path, &write.bytes)
-                        }
-                    })
-                    .await
-                    .expect("spawn_blocking")
-                    .expect("write file");
-                    let _ = write.done_tx.send(()); // receiver may lose interest any time
-                }
-                None => {
-                    return;
-                }
-            }
-        }
-    }
-
-    /// Perform a modification on our [`PersistentState`].
-    /// Return a future that completes once our modification has been persisted.
-    /// The output of the future is the return value of the `txn`` closure.
-    async fn mutating_transaction<F, R>(&self, txn: F) -> R
+    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
+    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
    where
-        F: FnOnce(&mut PersistentState) -> R,
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        R: Send + 'static,
    {
-        let (ret, done_rx) = {
-            let mut inner = self.inner.lock().unwrap();
-            let ret = txn(&mut inner.state);
-            let (done_tx, done_rx) = tokio::sync::oneshot::channel();
-            let write = PendingWrite {
-                bytes: serde_json::to_vec(&inner.state).expect("Serialization error"),
-                done_tx,
-            };
-            inner
-                .write_queue_tx
-                .send(write)
-                .expect("writer task always outlives self");
-            (ret, done_rx)
-        };
-        // the write task can go away once we start .await'ing
-        let _: () = done_rx.await.expect("writer task dead, check logs");
-        ret
+        let database_url = self.database_url.clone();
+        tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
+            // TODO: connection pooling, such as via diesel::r2d2
+            let mut conn = PgConnection::establish(&database_url)?;
+            func(&mut conn)
+        })
+        .await
+        .expect("Task panic")
    }

-    /// When registering a node, persist it so that on next start we will be able to
-    /// iterate over known nodes to synchronize their tenant shard states with our observed state.
-    pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
-        // TODO: node persitence will come with database backend
-        Ok(())
+    /// When a node is first registered, persist it before using it for anything
+    pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
+        let np = node.to_persistent();
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::insert_into(crate::schema::nodes::table)
+                .values(&np)
+                .execute(conn)?;
+            Ok(())
+        })
+        .await
    }

-    /// At startup, we populate the service's list of nodes, and use this list to call into
-    /// each node to do an initial reconciliation of the state of the world with our in-memory
-    /// observed state.
-    pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
-        let env = LocalEnv::load_config()?;
-        // TODO: node persitence will come with database backend
+    /// At startup, populate the list of nodes which our shards may be placed on
+    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
+        let nodes: Vec<Node> = self
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::nodes::table
+                    .load::<NodePersistence>(conn)?
+                    .into_iter()
+                    .map(|n| Node {
+                        id: NodeId(n.node_id as u64),
+                        // At startup we consider a node offline until proven otherwise.
+                        availability: NodeAvailability::Offline,
+                        scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
+                            .expect("Bad scheduling policy in DB"),
+                        listen_http_addr: n.listen_http_addr,
+                        listen_http_port: n.listen_http_port as u16,
+                        listen_pg_addr: n.listen_pg_addr,
+                        listen_pg_port: n.listen_pg_port as u16,
+                    })
+                    .collect::<Vec<Node>>())
+            })
+            .await?;

-        // XXX hack: enable test_backward_compatibility to work by populating our list of
+        if nodes.is_empty() {
+            return self.list_nodes_local_env().await;
+        }
+
+        tracing::info!("list_nodes: loaded {} nodes", nodes.len());
+
+        Ok(nodes)
+    }
+
+    /// Shim for automated compatibility tests: load nodes from LocalEnv instead of database
+    pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult<Vec<Node>> {
+        // Enable test_backward_compatibility to work by populating our list of
        // nodes from LocalEnv when it is not present in persistent storage.  Otherwise at
        // first startup in the compat test, we may have shards but no nodes.
-        let mut result = Vec::new();
+        use control_plane::local_env::LocalEnv;
+        let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?;
        tracing::info!(
-            "Loaded {} pageserver nodes from LocalEnv",
+            "Loading {} pageserver nodes from LocalEnv",
            env.pageservers.len()
        );
+        let mut nodes = Vec::new();
        for ps_conf in env.pageservers {
            let (pg_host, pg_port) =
                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
                .expect("Unable to parse listen_http_addr");
-            result.push(Node {
+            let node = Node {
                id: ps_conf.id,
                listen_pg_addr: pg_host.to_string(),
                listen_pg_port: pg_port.unwrap_or(5432),
@@ -184,16 +163,96 @@ impl Persistence {
                listen_http_port: http_port.unwrap_or(80),
                availability: NodeAvailability::Active,
                scheduling: NodeSchedulingPolicy::Active,
-            });
+            };
+
+            // Synchronize database with what we learn from LocalEnv
+            self.insert_node(&node).await?;
+
+            nodes.push(node);
        }

-        Ok(result)
+        Ok(nodes)
    }

-    /// At startup, we populate our map of tenant shards from persistent storage.
-    pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
-        let inner = self.inner.lock().unwrap();
-        Ok(inner.state.tenants.values().cloned().collect())
+    /// At startup, load the high level state for shards, such as their config + policy.  This will
+    /// be enriched at runtime with state discovered on pageservers.
+    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
+        let loaded = self
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
+            })
+            .await?;
+
+        if loaded.is_empty() {
+            if let Some(path) = &self.json_path {
+                if tokio::fs::try_exists(path)
+                    .await
+                    .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
+                {
+                    tracing::info!("Importing from legacy JSON format at {path}");
+                    return self.list_tenant_shards_json(path).await;
+                }
+            }
+        }
+        Ok(loaded)
+    }
+
+    /// Shim for automated compatibility tests: load tenants from a JSON file instead of database
+    pub(crate) async fn list_tenant_shards_json(
+        &self,
+        path: &Utf8Path,
+    ) -> DatabaseResult<Vec<TenantShardPersistence>> {
+        let bytes = tokio::fs::read(path)
+            .await
+            .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
+
+        let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
+            .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
+        for (tenant_id, tenant) in &mut decoded.tenants {
+            // Backward compat: an old attachments.json from before PR #6251, replace
+            // empty strings with proper defaults.
+            if tenant.tenant_id.is_empty() {
+                tenant.tenant_id = tenant_id.to_string();
+                tenant.config = serde_json::to_string(&TenantConfig::default())
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+            }
+        }
+
+        let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
+
+        // Synchronize database with what is in the JSON file
+        self.insert_tenant_shards(tenants.clone()).await?;
+
+        Ok(tenants)
+    }
+
+    /// For use in testing environments, where we dump out JSON on shutdown.
+    pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
+        let Some(path) = &self.json_path else {
+            anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
+        };
+        tracing::info!("Writing state to {path}...");
+        let tenants = self.list_tenant_shards().await?;
+        let mut tenants_map = HashMap::new();
+        for tsp in tenants {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+
+            tenants_map.insert(tenant_shard_id, tsp);
+        }
+        let json = serde_json::to_string(&JsonPersistence {
+            tenants: tenants_map,
+        })?;
+
+        tokio::fs::write(path, &json).await?;
+        tracing::info!("Wrote {} bytes to {path}...", json.len());
+
+        Ok(())
    }

    /// Tenants must be persisted before we schedule them for the first time.  This enables us
@@ -201,22 +260,79 @@ impl Persistence {
    pub(crate) async fn insert_tenant_shards(
        &self,
        shards: Vec<TenantShardPersistence>,
-    ) -> anyhow::Result<()> {
-        self.mutating_transaction(|locked| {
-            for shard in shards {
-                let tenant_shard_id = TenantShardId {
-                    tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
-                    shard_number: ShardNumber(shard.shard_number as u8),
-                    shard_count: ShardCount(shard.shard_count as u8),
-                };
-
-                locked.tenants.insert(tenant_shard_id, shard);
-            }
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            conn.transaction(|conn| -> QueryResult<()> {
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
+                Ok(())
+            })?;
            Ok(())
        })
        .await
    }

+    /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
+    /// the tenant from memory on this server.
+    #[allow(unused)]
+    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::delete(tenant_shards)
+                .filter(tenant_id.eq(del_tenant_id.to_string()))
+                .execute(conn)?;
+
+            Ok(())
+        })
+        .await
+    }
+
+    /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
+    /// batched increment of the generations of all tenants whose generation_pageserver is equal to
+    /// the node that called /re-attach.
+    #[tracing::instrument(skip_all, fields(node_id))]
+    pub(crate) async fn re_attach(
+        &self,
+        node_id: NodeId,
+    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
+        use crate::schema::tenant_shards::dsl::*;
+        let updated = self
+            .with_conn(move |conn| {
+                let rows_updated = diesel::update(tenant_shards)
+                    .filter(generation_pageserver.eq(node_id.0 as i64))
+                    .set(generation.eq(generation + 1))
+                    .execute(conn)?;
+
+                tracing::info!("Incremented {} tenants' generations", rows_updated);
+
+                // TODO: UPDATE+SELECT in one query
+
+                let updated = tenant_shards
+                    .filter(generation_pageserver.eq(node_id.0 as i64))
+                    .select(TenantShardPersistence::as_select())
+                    .load(conn)?;
+                Ok(updated)
+            })
+            .await?;
+
+        let mut result = HashMap::new();
+        for tsp in updated {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
+                    .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+            result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
+        }
+
+        Ok(result)
+    }
+
    /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
    /// advancing generation number.  We also store the NodeId for which the generation was issued, so that in
    /// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node.
@@ -225,47 +341,46 @@ impl Persistence {
        tenant_shard_id: TenantShardId,
        node_id: NodeId,
    ) -> anyhow::Result<Generation> {
-        self.mutating_transaction(|locked| {
-            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
-                anyhow::bail!("Tried to increment generation of unknown shard");
-            };
+        use crate::schema::tenant_shards::dsl::*;
+        let updated = self
+            .with_conn(move |conn| {
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                    .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                    .set((
+                        generation.eq(generation + 1),
+                        generation_pageserver.eq(node_id.0 as i64),
+                    ))
+                    // TODO: only returning() the generation column
+                    .returning(TenantShardPersistence::as_returning())
+                    .get_result(conn)?;

-            shard.generation += 1;
-            shard.generation_pageserver = Some(node_id);
+                Ok(updated)
+            })
+            .await?;

-            let gen = Generation::new(shard.generation);
-            Ok(gen)
-        })
-        .await
+        Ok(Generation::new(updated.generation as u32))
    }

    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
-        self.mutating_transaction(|locked| {
-            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
-                anyhow::bail!("Tried to increment generation of unknown shard");
-            };
-            shard.generation_pageserver = None;
-            shard.placement_policy = serde_json::to_string(&PlacementPolicy::Detached).unwrap();
-            Ok(())
-        })
-        .await
-    }
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| {
+            let updated = diesel::update(tenant_shards)
+                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                .set((
+                    generation_pageserver.eq(i64::MAX),
+                    placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
+                ))
+                .execute(conn)?;

-    pub(crate) async fn re_attach(
-        &self,
-        node_id: NodeId,
-    ) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
-        self.mutating_transaction(|locked| {
-            let mut result = HashMap::new();
-            for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
-                if shard.generation_pageserver == Some(node_id) {
-                    shard.generation += 1;
-                    result.insert(*tenant_shard_id, Generation::new(shard.generation));
-                }
-            }
-            Ok(result)
+            Ok(updated)
        })
-        .await
+        .await?;
+
+        Ok(())
    }

    // TODO: when we start shard splitting, we must durably mark the tenant so that
@@ -285,7 +400,8 @@ impl Persistence {
 }

 /// Parts of [`crate::tenant_state::TenantState`] that are stored durably
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
+#[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
    #[serde(default)]
    pub(crate) tenant_id: String,
@@ -296,16 +412,28 @@ pub(crate) struct TenantShardPersistence {
    #[serde(default)]
    pub(crate) shard_stripe_size: i32,

-    // Currently attached pageserver
-    #[serde(rename = "pageserver")]
-    pub(crate) generation_pageserver: Option<NodeId>,
-
    // Latest generation number: next time we attach, increment this
    // and use the incremented number when attaching
-    pub(crate) generation: u32,
+    pub(crate) generation: i32,
+
+    // Currently attached pageserver
+    #[serde(rename = "pageserver")]
+    pub(crate) generation_pageserver: i64,

    #[serde(default)]
    pub(crate) placement_policy: String,
    #[serde(default)]
    pub(crate) config: String,
 }
+
+/// Parts of [`crate::node::Node`] that are stored durably
+#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
+#[diesel(table_name = crate::schema::nodes)]
+pub(crate) struct NodePersistence {
+    pub(crate) node_id: i64,
+    pub(crate) scheduling_policy: String,
+    pub(crate) listen_http_addr: String,
+    pub(crate) listen_http_port: i32,
+    pub(crate) listen_pg_addr: String,
+    pub(crate) listen_pg_port: i32,
+}
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -0,0 +1,27 @@
+// @generated automatically by Diesel CLI.
+
+diesel::table! {
+    nodes (node_id) {
+        node_id -> Int8,
+        scheduling_policy -> Varchar,
+        listen_http_addr -> Varchar,
+        listen_http_port -> Int4,
+        listen_pg_addr -> Varchar,
+        listen_pg_port -> Int4,
+    }
+}
+
+diesel::table! {
+    tenant_shards (tenant_id, shard_number, shard_count) {
+        tenant_id -> Varchar,
+        shard_number -> Int4,
+        shard_count -> Int4,
+        shard_stripe_size -> Int4,
+        generation -> Int4,
+        generation_pageserver -> Int8,
+        placement_policy -> Varchar,
+        config -> Text,
+    }
+}
+
+diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -11,6 +11,7 @@ use control_plane::attachment_service::{
    TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
    TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
+use diesel::result::DatabaseErrorKind;
 use hyper::StatusCode;
 use pageserver_api::{
    control_api::{
@@ -26,6 +27,7 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api;
 use utils::{
+    completion::Barrier,
    generation::Generation,
    http::error::ApiError,
    id::{NodeId, TenantId},
@@ -35,7 +37,7 @@ use utils::{
 use crate::{
    compute_hook::ComputeHook,
    node::Node,
-    persistence::{Persistence, TenantShardPersistence},
+    persistence::{DatabaseError, Persistence, TenantShardPersistence},
    scheduler::Scheduler,
    tenant_state::{
        IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -46,6 +48,10 @@ use crate::{

 const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);

+/// How long [`Service::startup_reconcile`] is allowed to take before it should give
+/// up on unresponsive pageservers and proceed.
+pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
+
 // Top level state available to all HTTP handlers
 struct ServiceState {
    tenants: BTreeMap<TenantShardId, TenantState>,
@@ -79,10 +85,27 @@ pub struct Config {
    pub jwt_token: Option<String>,
 }

+impl From<DatabaseError> for ApiError {
+    fn from(err: DatabaseError) -> ApiError {
+        match err {
+            DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
+            // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
+            DatabaseError::Connection(_e) => ApiError::ShuttingDown,
+            DatabaseError::Logical(reason) => {
+                ApiError::InternalServerError(anyhow::anyhow!(reason))
+            }
+        }
+    }
+}
+
 pub struct Service {
    inner: Arc<std::sync::RwLock<ServiceState>>,
    config: Config,
    persistence: Arc<Persistence>,
+
+    /// This waits for initial reconciliation with pageservers to complete.  Until this barrier
+    /// passes, it isn't safe to do any actions that mutate tenants.
+    pub(crate) startup_complete: Barrier,
 }

 impl From<ReconcileWaitError> for ApiError {
@@ -96,77 +119,32 @@ impl From<ReconcileWaitError> for ApiError {
 }

 impl Service {
-    pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
-        let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
-
-        tracing::info!("Loading nodes from database...");
-        let mut nodes = persistence.list_nodes().await?;
-        tracing::info!("Loaded {} nodes from database.", nodes.len());
-
-        tracing::info!("Loading shards from database...");
-        let tenant_shard_persistence = persistence.list_tenant_shards().await?;
-        tracing::info!(
-            "Loaded {} shards from database.",
-            tenant_shard_persistence.len()
-        );
-
-        let mut tenants = BTreeMap::new();
-
-        for tsp in tenant_shard_persistence {
-            let tenant_shard_id = TenantShardId {
-                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
-                shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
-            };
-            let shard_identity = if tsp.shard_count == 0 {
-                ShardIdentity::unsharded()
-            } else {
-                ShardIdentity::new(
-                    ShardNumber(tsp.shard_number as u8),
-                    ShardCount(tsp.shard_count as u8),
-                    ShardStripeSize(tsp.shard_stripe_size as u32),
-                )?
-            };
-            let new_tenant = TenantState {
-                tenant_shard_id,
-                shard: shard_identity,
-                sequence: Sequence::initial(),
-                // Note that we load generation, but don't care about generation_pageserver.  We will either end up finding
-                // our existing attached location and it will match generation_pageserver, or we will attach somewhere new
-                // and update generation_pageserver in the process.
-                generation: Generation::new(tsp.generation),
-                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
-                intent: IntentState::new(),
-                observed: ObservedState::new(),
-                config: serde_json::from_str(&tsp.config).unwrap(),
-                reconciler: None,
-                waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                last_error: Arc::default(),
-            };
-
-            tenants.insert(tenant_shard_id, new_tenant);
-        }
+    pub fn get_config(&self) -> &Config {
+        &self.config
+    }

+    /// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
+    /// until this is done.
+    async fn startup_reconcile(&self) {
        // For all tenant shards, a vector of observed states on nodes (where None means
        // indeterminate, same as in [`ObservedStateLocation`])
        let mut observed = HashMap::new();

+        let nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };
+
        // TODO: issue these requests concurrently
-        for node in &mut nodes {
-            let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
+        for node in nodes.values() {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());

            tracing::info!("Scanning shards on node {}...", node.id);
            match client.list_location_config().await {
                Err(e) => {
                    tracing::warn!("Could not contact pageserver {} ({e})", node.id);
-                    // TODO: be more tolerant, apply a generous 5-10 second timeout
-                    // TODO: setting a node to Offline is a dramatic thing to do, and can
-                    // prevent neon_local from starting up (it starts this service before
-                    // any pageservers are  running).  It may make sense to give nodes
-                    // a Pending state to accomodate this situation, and allow (but deprioritize)
-                    // scheduling on Pending nodes.
-                    //node.availability = NodeAvailability::Offline;
+                    // TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
+                    // pageserver is being restarted at the same time as we are
                }
                Ok(listing) => {
                    tracing::info!(
@@ -174,7 +152,6 @@ impl Service {
                        listing.tenant_shards.len(),
                        node.id
                    );
-                    node.availability = NodeAvailability::Active;

                    for (tenant_shard_id, conf_opt) in listing.tenant_shards {
                        observed.insert(tenant_shard_id, (node.id, conf_opt));
@@ -186,41 +163,46 @@ impl Service {
        let mut cleanup = Vec::new();

        // Populate intent and observed states for all tenants, based on reported state on pageservers
-        for (tenant_shard_id, (node_id, observed_loc)) in observed {
-            let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
-                cleanup.push((tenant_shard_id, node_id));
-                continue;
-            };
+        let shard_count = {
+            let mut locked = self.inner.write().unwrap();
+            for (tenant_shard_id, (node_id, observed_loc)) in observed {
+                let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
+                    cleanup.push((tenant_shard_id, node_id));
+                    continue;
+                };

-            tenant_state
-                .observed
-                .locations
-                .insert(node_id, ObservedStateLocation { conf: observed_loc });
-        }
-
-        // State of nodes is now frozen, transform to a HashMap.
-        let mut nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
-
-        // Populate each tenant's intent state
-        let mut scheduler = Scheduler::new(&tenants, &nodes);
-        for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
-            tenant_state.intent_from_observed();
-            if let Err(e) = tenant_state.schedule(&mut scheduler) {
-                // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
-                // not enough pageservers are available.  The tenant may well still be available
-                // to clients.
-                tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
+                tenant_state
+                    .observed
+                    .locations
+                    .insert(node_id, ObservedStateLocation { conf: observed_loc });
            }
-        }
+
+            // Populate each tenant's intent state
+            let mut scheduler = Scheduler::new(&locked.tenants, &nodes);
+            for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() {
+                tenant_state.intent_from_observed();
+                if let Err(e) = tenant_state.schedule(&mut scheduler) {
+                    // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
+                    // not enough pageservers are available.  The tenant may well still be available
+                    // to clients.
+                    tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
+                }
+            }
+
+            locked.tenants.len()
+        };
+
+        // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
+        // generation_pageserver in the database.

        // Clean up any tenants that were found on pageservers but are not known to us.
        for (tenant_shard_id, node_id) in cleanup {
            // A node reported a tenant_shard_id which is unknown to us: detach it.
            let node = nodes
-                .get_mut(&node_id)
+                .get(&node_id)
                .expect("Always exists: only known nodes are scanned");

-            let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
            match client
                .location_config(
                    tenant_shard_id,
@@ -252,13 +234,80 @@ impl Service {
            }
        }

-        let shard_count = tenants.len();
+        // Finally, now that the service is up and running, launch reconcile operations for any tenants
+        // which require it: under normal circumstances this should only include tenants that were in some
+        // transient state before we restarted.
+        let reconcile_tasks = self.reconcile_all();
+        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+    }
+
+    pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
+        let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
+
+        tracing::info!("Loading nodes from database...");
+        let nodes = persistence.list_nodes().await?;
+        let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
+        tracing::info!("Loaded {} nodes from database.", nodes.len());
+
+        tracing::info!("Loading shards from database...");
+        let tenant_shard_persistence = persistence.list_tenant_shards().await?;
+        tracing::info!(
+            "Loaded {} shards from database.",
+            tenant_shard_persistence.len()
+        );
+
+        let mut tenants = BTreeMap::new();
+
+        for tsp in tenant_shard_persistence {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+            let shard_identity = if tsp.shard_count == 0 {
+                ShardIdentity::unsharded()
+            } else {
+                ShardIdentity::new(
+                    ShardNumber(tsp.shard_number as u8),
+                    ShardCount(tsp.shard_count as u8),
+                    ShardStripeSize(tsp.shard_stripe_size as u32),
+                )?
+            };
+
+            // We will populate intent properly later in [`Self::startup_reconcile`], initially populate
+            // it with what we can infer: the node for which a generation was most recently issued.
+            let mut intent = IntentState::new();
+            if tsp.generation_pageserver != i64::MAX {
+                intent.attached = Some(NodeId(tsp.generation_pageserver as u64))
+            }
+
+            let new_tenant = TenantState {
+                tenant_shard_id,
+                shard: shard_identity,
+                sequence: Sequence::initial(),
+                generation: Generation::new(tsp.generation as u32),
+                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
+                intent,
+                observed: ObservedState::new(),
+                config: serde_json::from_str(&tsp.config).unwrap(),
+                reconciler: None,
+                waiter: Arc::new(SeqWait::new(Sequence::initial())),
+                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
+                last_error: Arc::default(),
+            };
+
+            tenants.insert(tenant_shard_id, new_tenant);
+        }
+
+        let (startup_completion, startup_complete) = utils::completion::channel();
+
        let this = Arc::new(Self {
            inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
                result_tx, nodes, tenants,
            ))),
            config,
            persistence,
+            startup_complete,
        });

        let result_task_this = this.clone();
@@ -316,11 +365,13 @@ impl Service {
            }
        });

-        // Finally, now that the service is up and running, launch reconcile operations for any tenants
-        // which require it: under normal circumstances this should only include tenants that were in some
-        // transient state before we restarted.
-        let reconcile_tasks = this.reconcile_all();
-        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+        let startup_reconcile_this = this.clone();
+        tokio::task::spawn(async move {
+            // Block the [`Service::startup_complete`] barrier until we're done
+            let _completion = startup_completion;
+
+            startup_reconcile_this.startup_reconcile().await
+        });

        Ok(this)
    }
@@ -336,7 +387,6 @@ impl Service {
            let locked = self.inner.write().unwrap();
            !locked.tenants.contains_key(&attach_req.tenant_shard_id)
        };
-
        if insert {
            let tsp = TenantShardPersistence {
                tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
@@ -344,22 +394,39 @@ impl Service {
                shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
                shard_stripe_size: 0,
                generation: 0,
-                generation_pageserver: None,
+                generation_pageserver: i64::MAX,
                placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
            };

-            self.persistence.insert_tenant_shards(vec![tsp]).await?;
+            match self.persistence.insert_tenant_shards(vec![tsp]).await {
+                Err(e) => match e {
+                    DatabaseError::Query(diesel::result::Error::DatabaseError(
+                        DatabaseErrorKind::UniqueViolation,
+                        _,
+                    )) => {
+                        tracing::info!(
+                            "Raced with another request to insert tenant {}",
+                            attach_req.tenant_shard_id
+                        )
+                    }
+                    _ => return Err(e.into()),
+                },
+                Ok(()) => {
+                    tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);

-            let mut locked = self.inner.write().unwrap();
-            locked.tenants.insert(
-                attach_req.tenant_shard_id,
-                TenantState::new(
-                    attach_req.tenant_shard_id,
-                    ShardIdentity::unsharded(),
-                    PlacementPolicy::Single,
-                ),
-            );
+                    let mut locked = self.inner.write().unwrap();
+                    locked.tenants.insert(
+                        attach_req.tenant_shard_id,
+                        TenantState::new(
+                            attach_req.tenant_shard_id,
+                            ShardIdentity::unsharded(),
+                            PlacementPolicy::Single,
+                        ),
+                    );
+                    tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
+                }
+            }
        }

        let new_generation = if let Some(req_node_id) = attach_req.node_id {
@@ -506,6 +573,14 @@ impl Service {
                    id: req_tenant.id,
                    valid,
                });
+            } else {
+                // After tenant deletion, we may approve any validation.  This avoids
+                // spurious warnings on the pageserver if it has pending LSN updates
+                // at the point a deletion happens.
+                response.tenants.push(ValidateResponseTenant {
+                    id: req_tenant.id,
+                    valid: true,
+                });
            }
        }
        response
@@ -561,7 +636,7 @@ impl Service {
                shard_count: tenant_shard_id.shard_count.0 as i32,
                shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
                generation: 0,
-                generation_pageserver: None,
+                generation_pageserver: i64::MAX,
                placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                config: serde_json::to_string(&create_req.config).unwrap(),
            })
@@ -967,10 +1042,7 @@ impl Service {
            availability: NodeAvailability::Active,
        };
        // TODO: idempotency if the node already exists in the database
-        self.persistence
-            .insert_node(&new_node)
-            .await
-            .map_err(ApiError::InternalServerError)?;
+        self.persistence.insert_node(&new_node).await?;

        let mut locked = self.inner.write().unwrap();
        let mut new_nodes = (*locked.nodes).clone();
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,5 +1,11 @@
 use crate::{background_process, local_env::LocalEnv};
-use camino::Utf8PathBuf;
+use camino::{Utf8Path, Utf8PathBuf};
+use diesel::{
+    backend::Backend,
+    query_builder::{AstPass, QueryFragment, QueryId},
+    Connection, PgConnection, QueryResult, RunQueryDsl,
+};
+use diesel_migrations::{HarnessWithOutput, MigrationHarness};
 use hyper::Method;
 use pageserver_api::{
    models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
@@ -7,9 +13,9 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
-use postgres_connection::parse_host_port;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{path::PathBuf, process::Child, str::FromStr};
+use std::{env, str::FromStr};
+use tokio::process::Command;
 use tracing::instrument;
 use utils::{
    auth::{Claims, Scope},
@@ -19,14 +25,17 @@ use utils::{
 pub struct AttachmentService {
    env: LocalEnv,
    listen: String,
-    path: PathBuf,
+    path: Utf8PathBuf,
    jwt_token: Option<String>,
    public_key_path: Option<Utf8PathBuf>,
+    postgres_port: u16,
    client: reqwest::Client,
 }

 const COMMAND: &str = "attachment_service";

+const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -169,7 +178,9 @@ pub struct TenantShardMigrateResponse {}

 impl AttachmentService {
    pub fn from_env(env: &LocalEnv) -> Self {
-        let path = env.base_data_dir.join("attachments.json");
+        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
+            .unwrap()
+            .join("attachments.json");

        // Makes no sense to construct this if pageservers aren't going to use it: assume
        // pageservers have control plane API set
@@ -181,6 +192,13 @@ impl AttachmentService {
            listen_url.port().unwrap()
        );

+        // Convention: NeonEnv in python tests reserves the next port after the control_plane_api
+        // port, for use by our captive postgres.
+        let postgres_port = listen_url
+            .port()
+            .expect("Control plane API setting should always have a port")
+            + 1;
+
        // Assume all pageservers have symmetric auth configuration: this service
        // expects to use one JWT token to talk to all of them.
        let ps_conf = env
@@ -209,6 +227,7 @@ impl AttachmentService {
            listen,
            jwt_token,
            public_key_path,
+            postgres_port,
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
@@ -220,13 +239,214 @@ impl AttachmentService {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self) -> anyhow::Result<Child> {
-        let path_str = self.path.to_string_lossy();
+    /// PIDFile for the postgres instance used to store attachment service state
+    fn postgres_pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(
+            self.env
+                .base_data_dir
+                .join("attachment_service_postgres.pid"),
+        )
+        .expect("non-Unicode path")
+    }

-        let mut args = vec!["-l", &self.listen, "-p", &path_str]
-            .into_iter()
-            .map(|s| s.to_string())
-            .collect::<Vec<_>>();
+    /// In order to access database migrations, we need to find the Neon source tree
+    async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
+        // We assume that either prd or our binary is in the source tree. The former is usually
+        // true for automated test runners, the latter is usually true for developer workstations. Often
+        // both are true, which is fine.
+        let candidate_start_points = [
+            // Current working directory
+            Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
+            // Directory containing the binary we're running inside
+            Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
+        ];
+
+        // For each candidate start point, search through ancestors looking for a neon.git source tree root
+        for start_point in &candidate_start_points {
+            // Start from the build dir: assumes we are running out of a built neon source tree
+            for path in start_point.ancestors() {
+                // A crude approximation: the root of the source tree is whatever contains a "control_plane"
+                // subdirectory.
+                let control_plane = path.join("control_plane");
+                if tokio::fs::try_exists(&control_plane).await? {
+                    return Ok(path.to_owned());
+                }
+            }
+        }
+
+        // Fall-through
+        Err(anyhow::anyhow!(
+            "Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
+        ))
+    }
+
+    /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
+    ///
+    /// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
+    /// to other versions if that one isn't found.  Some automated tests create circumstances
+    /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
+    pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
+        let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
+
+        for v in prefer_versions {
+            let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
+            if tokio::fs::try_exists(&path).await? {
+                return Ok(path);
+            }
+        }
+
+        // Fall through
+        anyhow::bail!(
+            "Postgres binaries not found in {}",
+            self.env.pg_distrib_dir.display()
+        );
+    }
+
+    /// Readiness check for our postgres process
+    async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
+        let bin_path = pg_bin_dir.join("pg_isready");
+        let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
+        let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
+
+        Ok(exitcode.success())
+    }
+
+    /// Create our database if it doesn't exist, and run migrations.
+    ///
+    /// This function is equivalent to the `diesel setup` command in the diesel CLI.  We implement
+    /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
+    /// who just want to run `cargo neon_local` without knowing about diesel.
+    ///
+    /// Returns the database url
+    pub async fn setup_database(&self) -> anyhow::Result<String> {
+        let database_url = format!(
+            "postgresql://localhost:{}/attachment_service",
+            self.postgres_port
+        );
+        println!("Running attachment service database setup...");
+        fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
+            let base = ::url::Url::parse(database_url).unwrap();
+            let database = base.path_segments().unwrap().last().unwrap().to_owned();
+            let mut new_url = base.join(default_database).unwrap();
+            new_url.set_query(base.query());
+            (database, new_url.into())
+        }
+
+        #[derive(Debug, Clone)]
+        pub struct CreateDatabaseStatement {
+            db_name: String,
+        }
+
+        impl CreateDatabaseStatement {
+            pub fn new(db_name: &str) -> Self {
+                CreateDatabaseStatement {
+                    db_name: db_name.to_owned(),
+                }
+            }
+        }
+
+        impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
+            fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
+                out.push_sql("CREATE DATABASE ");
+                out.push_identifier(&self.db_name)?;
+                Ok(())
+            }
+        }
+
+        impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
+
+        impl QueryId for CreateDatabaseStatement {
+            type QueryId = ();
+
+            const HAS_STATIC_QUERY_ID: bool = false;
+        }
+        if PgConnection::establish(&database_url).is_err() {
+            let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
+            println!("Creating database: {database}");
+            let mut conn = PgConnection::establish(&postgres_url)?;
+            CreateDatabaseStatement::new(&database).execute(&mut conn)?;
+        }
+        let mut conn = PgConnection::establish(&database_url)?;
+
+        let migrations_dir = self
+            .find_source_root()
+            .await?
+            .join("control_plane/attachment_service/migrations");
+
+        let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
+        println!("Running migrations in {}", migrations.path().display());
+        HarnessWithOutput::write_to_stdout(&mut conn)
+            .run_pending_migrations(migrations)
+            .map(|_| ())
+            .map_err(|e| anyhow::anyhow!(e))?;
+
+        println!("Migrations complete");
+
+        Ok(database_url)
+    }
+
+    pub async fn start(&self) -> anyhow::Result<()> {
+        // Start a vanilla Postgres process used by the attachment service for persistence.
+        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
+            .unwrap()
+            .join("attachment_service_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+        let pg_log_path = pg_data_path.join("postgres.log");
+
+        if !tokio::fs::try_exists(&pg_data_path).await? {
+            // Initialize empty database
+            let initdb_path = pg_bin_dir.join("initdb");
+            let mut child = Command::new(&initdb_path)
+                .args(["-D", pg_data_path.as_ref()])
+                .spawn()
+                .expect("Failed to spawn initdb");
+            let status = child.wait().await?;
+            if !status.success() {
+                anyhow::bail!("initdb failed with status {status}");
+            }
+
+            tokio::fs::write(
+                &pg_data_path.join("postgresql.conf"),
+                format!("port = {}", self.postgres_port),
+            )
+            .await?;
+        };
+
+        println!("Starting attachment service database...");
+        let db_start_args = [
+            "-w",
+            "-D",
+            pg_data_path.as_ref(),
+            "-l",
+            pg_log_path.as_ref(),
+            "start",
+        ];
+
+        background_process::start_process(
+            "attachment_service_db",
+            &self.env.base_data_dir,
+            pg_bin_dir.join("pg_ctl").as_std_path(),
+            db_start_args,
+            [],
+            background_process::InitialPidFile::Create(self.postgres_pid_file()),
+            || self.pg_isready(&pg_bin_dir),
+        )
+        .await?;
+
+        // Run migrations on every startup, in case something changed.
+        let database_url = self.setup_database().await?;
+
+        let mut args = vec![
+            "-l",
+            &self.listen,
+            "-p",
+            self.path.as_ref(),
+            "--database-url",
+            &database_url,
+        ]
+        .into_iter()
+        .map(|s| s.to_string())
+        .collect::<Vec<_>>();
        if let Some(jwt_token) = &self.jwt_token {
            args.push(format!("--jwt-token={jwt_token}"));
        }
@@ -235,7 +455,7 @@ impl AttachmentService {
            args.push(format!("--public-key={public_key_path}"));
        }

-        let result = background_process::start_process(
+        background_process::start_process(
            COMMAND,
            &self.env.base_data_dir,
            &self.env.attachment_service_bin(),
@@ -252,29 +472,46 @@ impl AttachmentService {
                }
            },
        )
-        .await;
+        .await?;

-        for ps_conf in &self.env.pageservers {
-            let (pg_host, pg_port) =
-                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            self.node_register(NodeRegisterRequest {
-                node_id: ps_conf.id,
-                listen_pg_addr: pg_host.to_string(),
-                listen_pg_port: pg_port.unwrap_or(5432),
-                listen_http_addr: http_host.to_string(),
-                listen_http_port: http_port.unwrap_or(80),
-            })
+        Ok(())
+    }
+
+    pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
+
+        let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+
+        println!("Stopping attachment service database...");
+        let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
+        let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
+            .args(pg_stop_args)
+            .spawn()?
+            .wait()
            .await?;
+        if !stop_status.success() {
+            let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
+            let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
+                .args(pg_status_args)
+                .spawn()?
+                .wait()
+                .await?;
+
+            // pg_ctl status returns this exit code if postgres is not running: in this case it is
+            // fine that stop failed.  Otherwise it is an error that stop failed.
+            const PG_STATUS_NOT_RUNNING: i32 = 3;
+            if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
+                println!("Attachment service data base is already stopped");
+                return Ok(());
+            } else {
+                anyhow::bail!("Failed to stop attachment service database: {stop_status}")
+            }
        }

-        result
+        Ok(())
    }

-    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        background_process::stop_process(immediate, COMMAND, &self.pid_file())
-    }
    /// Simple HTTP request wrapper for calling into attachment service
    async fn dispatch<RQ, RS>(
        &self,
@@ -356,7 +593,7 @@ impl AttachmentService {
        &self,
        req: TenantCreateRequest,
    ) -> anyhow::Result<TenantCreateResponse> {
-        self.dispatch(Method::POST, "tenant".to_string(), Some(req))
+        self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
            .await
    }

@@ -413,7 +650,7 @@ impl AttachmentService {
    ) -> anyhow::Result<TimelineInfo> {
        self.dispatch(
            Method::POST,
-            format!("tenant/{tenant_id}/timeline"),
+            format!("v1/tenant/{tenant_id}/timeline"),
            Some(req),
        )
        .await
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -17,7 +17,7 @@ use std::io::Write;
 use std::os::unix::prelude::AsRawFd;
 use std::os::unix::process::CommandExt;
 use std::path::Path;
-use std::process::{Child, Command};
+use std::process::Command;
 use std::time::Duration;
 use std::{fs, io, thread};

@@ -60,7 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
    envs: EI,
    initial_pid_file: InitialPidFile,
    process_status_check: F,
-) -> anyhow::Result<Child>
+) -> anyhow::Result<()>
 where
    F: Fn() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<bool>>,
@@ -98,7 +98,7 @@ where
        InitialPidFile::Expect(path) => path,
    };

-    let mut spawned_process = filled_cmd.spawn().with_context(|| {
+    let spawned_process = filled_cmd.spawn().with_context(|| {
        format!("Could not spawn {process_name}, see console output and log files for details.")
    })?;
    let pid = spawned_process.id();
@@ -106,12 +106,26 @@ where
        i32::try_from(pid)
            .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
    );
+    // set up a scopeguard to kill & wait for the child in case we panic or bail below
+    let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
+        println!("SIGKILL & wait the started process");
+        (|| {
+            // TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e..g, walredo).
+            spawned_process.kill().context("SIGKILL child")?;
+            spawned_process.wait().context("wait() for child process")?;
+            anyhow::Ok(())
+        })()
+        .with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
+        .unwrap();
+    });

    for retries in 0..RETRIES {
        match process_started(pid, pid_file_to_check, &process_status_check).await {
            Ok(true) => {
-                println!("\n{process_name} started, pid: {pid}");
-                return Ok(spawned_process);
+                println!("\n{process_name} started and passed status check, pid: {pid}");
+                // leak the child process, it'll outlive this neon_local invocation
+                drop(scopeguard::ScopeGuard::into_inner(spawned_process));
+                return Ok(());
            }
            Ok(false) => {
                if retries == NOTICE_AFTER_RETRIES {
@@ -126,16 +140,15 @@ where
                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
            }
            Err(e) => {
-                println!("{process_name} failed to start: {e:#}");
-                if let Err(e) = spawned_process.kill() {
-                    println!("Could not stop {process_name} subprocess: {e:#}")
-                };
+                println!("error starting process {process_name:?}: {e:#}");
                return Err(e);
            }
        }
    }
    println!();
-    anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
+    anyhow::bail!(
+        "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
+    );
 }

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -135,7 +135,7 @@ fn main() -> Result<()> {
            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
            "start" => rt.block_on(handle_start_all(sub_args, &env)),
-            "stop" => handle_stop_all(sub_args, &env),
+            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
            "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
            "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
@@ -1056,8 +1056,9 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
+            let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
            if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(&pageserver_config_overrides(subcommand_args))
+                .start(&pageserver_config_overrides(subcommand_args), *register)
                .await
            {
                eprintln!("pageserver start failed: {e}");
@@ -1086,24 +1087,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }

            if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args))
-                .await
-            {
-                eprintln!("pageserver start failed: {e}");
-                exit(1);
-            }
-        }
-
-        Some(("migrate", subcommand_args)) => {
-            let pageserver = get_pageserver(env, subcommand_args)?;
-            //TODO what shutdown strategy should we use here?
-            if let Err(e) = pageserver.stop(false) {
-                eprintln!("pageserver stop failed: {}", e);
-                exit(1);
-            }
-
-            if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args))
+                .start(&pageserver_config_overrides(subcommand_args), false)
                .await
            {
                eprintln!("pageserver start failed: {e}");
@@ -1161,7 +1145,7 @@ async fn handle_attachment_service(
                .map(|s| s.as_str())
                == Some("immediate");

-            if let Err(e) = svc.stop(immediate) {
+            if let Err(e) = svc.stop(immediate).await {
                eprintln!("stop failed: {}", e);
                exit(1);
            }
@@ -1257,7 +1241,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
        let attachment_service = AttachmentService::from_env(env);
        if let Err(e) = attachment_service.start().await {
            eprintln!("attachment_service start failed: {:#}", e);
-            try_stop_all(env, true);
+            try_stop_all(env, true).await;
            exit(1);
        }
    }
@@ -1265,11 +1249,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
        if let Err(e) = pageserver
-            .start(&pageserver_config_overrides(sub_match))
+            .start(&pageserver_config_overrides(sub_match), true)
            .await
        {
            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
-            try_stop_all(env, true);
+            try_stop_all(env, true).await;
            exit(1);
        }
    }
@@ -1278,23 +1262,23 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
        let safekeeper = SafekeeperNode::from_env(env, node);
        if let Err(e) = safekeeper.start(vec![]).await {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
-            try_stop_all(env, false);
+            try_stop_all(env, false).await;
            exit(1);
        }
    }
    Ok(())
 }

-fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let immediate =
        sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");

-    try_stop_all(env, immediate);
+    try_stop_all(env, immediate).await;

    Ok(())
 }

-fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
+async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    // Stop all endpoints
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
@@ -1329,7 +1313,7 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {

    if env.control_plane_api.is_some() {
        let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.stop(immediate) {
+        if let Err(e) = attachment_service.stop(immediate).await {
            eprintln!("attachment service stop failed: {e:#}");
        }
    }
@@ -1549,7 +1533,11 @@ fn cli() -> Command {
                .subcommand(Command::new("status"))
                .subcommand(Command::new("start")
                    .about("Start local pageserver")
-                    .arg(pageserver_config_args.clone())
+                    .arg(pageserver_config_args.clone()).arg(Arg::new("register")
+                    .long("register")
+                    .default_value("true").required(false)
+                    .value_parser(value_parser!(bool))
+                    .value_name("register"))
                )
                .subcommand(Command::new("stop")
                    .about("Stop local pageserver")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -438,7 +438,7 @@ impl Endpoint {
    }

    fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
-        // TODO use background_process::stop_process instead
+        // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
@@ -583,9 +583,21 @@ impl Endpoint {
        }

        let child = cmd.spawn()?;
+        // set up a scopeguard to kill & wait for the child in case we panic or bail below
+        let child = scopeguard::guard(child, |mut child| {
+            println!("SIGKILL & wait the started process");
+            (|| {
+                // TODO: use another signal that can be caught by the child so it can clean up any children it spawned
+                child.kill().context("SIGKILL child")?;
+                child.wait().context("wait() for child process")?;
+                anyhow::Ok(())
+            })()
+            .with_context(|| format!("scopeguard kill&wait child {child:?}"))
+            .unwrap();
+        });

        // Write down the pid so we can wait for it when we want to stop
-        // TODO use background_process::start_process instead
+        // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
        let pid = child.id();
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        std::fs::write(pidfile_path, pid.to_string())?;
@@ -634,6 +646,9 @@ impl Endpoint {
            std::thread::sleep(ATTEMPT_INTERVAL);
        }

+        // disarm the scopeguard, let the child outlive this function (and neon_local invoction)
+        drop(scopeguard::ScopeGuard::into_inner(child));
+
        Ok(())
    }

--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -223,7 +223,11 @@ impl LocalEnv {
    }

    pub fn attachment_service_bin(&self) -> PathBuf {
-        self.neon_distrib_dir.join("attachment_service")
+        // Irrespective of configuration, attachment service binary is always
+        // run from the same location as neon_local.  This means that for compatibility
+        // tests that run old pageserver/safekeeper, they still run latest attachment service.
+        let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
+        neon_local_bin_dir.join("attachment_service")
    }

    pub fn safekeeper_bin(&self) -> PathBuf {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -11,7 +11,7 @@ use std::io;
 use std::io::Write;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
-use std::process::{Child, Command};
+use std::process::Command;
 use std::time::Duration;

 use anyhow::{bail, Context};
@@ -30,6 +30,7 @@ use utils::{
    lsn::Lsn,
 };

+use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

@@ -161,8 +162,8 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
-        self.start_node(config_overrides, false).await
+    pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
+        self.start_node(config_overrides, false, register).await
    }

    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -207,7 +208,8 @@ impl PageServerNode {
        &self,
        config_overrides: &[&str],
        update_config: bool,
-    ) -> anyhow::Result<Child> {
+        register: bool,
+    ) -> anyhow::Result<()> {
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
@@ -244,7 +246,26 @@ impl PageServerNode {
                }
            },
        )
-        .await
+        .await?;
+
+        if register {
+            let attachment_service = AttachmentService::from_env(&self.env);
+            let (pg_host, pg_port) =
+                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
+            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
+                .expect("Unable to parse listen_http_addr");
+            attachment_service
+                .node_register(NodeRegisterRequest {
+                    node_id: self.conf.id,
+                    listen_pg_addr: pg_host.to_string(),
+                    listen_pg_port: pg_port.unwrap_or(5432),
+                    listen_http_addr: http_host.to_string(),
+                    listen_http_port: http_port.unwrap_or(80),
+                })
+                .await?;
+        }
+
+        Ok(())
    }

    fn pageserver_basic_args<'a>(
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -7,7 +7,6 @@
 //! ```
 use std::io::Write;
 use std::path::PathBuf;
-use std::process::Child;
 use std::{io, result};

 use anyhow::Context;
@@ -104,7 +103,7 @@ impl SafekeeperNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
+    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
--- a/diesel.toml
+++ b/diesel.toml
@@ -0,0 +1,9 @@
+# For documentation on how to configure this file,
+# see https://diesel.rs/guides/configuring-diesel-cli
+
+[print_schema]
+file = "control_plane/attachment_service/src/schema.rs"
+custom_type_derives = ["diesel::query_builder::QueryId"]
+
+[migrations_directory]
+dir = "control_plane/attachment_service/migrations"
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -9,5 +9,10 @@ prometheus.workspace = true
 libc.workspace = true
 once_cell.workspace = true
 chrono.workspace = true
+twox-hash.workspace = true

 workspace_hack.workspace = true
+
+[dev-dependencies]
+rand = "0.8"
+rand_distr = "0.4.3"
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -0,0 +1,523 @@
+//! HyperLogLog is an algorithm for the count-distinct problem,
+//! approximating the number of distinct elements in a multiset.
+//! Calculating the exact cardinality of the distinct elements
+//! of a multiset requires an amount of memory proportional to
+//! the cardinality, which is impractical for very large data sets.
+//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm,
+//! use significantly less memory than this, but can only approximate the cardinality.
+
+use std::{
+    collections::HashMap,
+    hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
+    sync::{atomic::AtomicU8, Arc, RwLock},
+};
+
+use prometheus::{
+    core::{self, Describer},
+    proto, Opts,
+};
+use twox_hash::xxh3;
+
+/// Create an [`HyperLogLogVec`] and registers to default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_hll_vec {
+    ($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{
+        let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap();
+        $crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec)
+    }};
+
+    ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
+        $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
+    }};
+}
+
+/// Create an [`HyperLogLog`] and registers to default registry.
+#[macro_export(local_inner_macros)]
+macro_rules! register_hll {
+    ($N:literal, $OPTS:expr $(,)?) => {{
+        let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap();
+        $crate::register(Box::new(hll.clone())).map(|_| hll)
+    }};
+
+    ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
+        $crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
+    }};
+}
+
+/// HLL is a probabilistic cardinality measure.
+///
+/// How to use this time-series for a metric name `my_metrics_total_hll`:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// If you want an estimate over time, you can use the following query:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (
+///                 max_over_time(my_metrics_total_hll{}[$__rate_interval])
+///             ) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// In the case of low cardinality, you might want to use the linear counting approximation:
+///
+/// ```promql
+/// # LinearCounting(m, V) = m log (m / V)
+/// shards_count * ln(shards_count /
+///     # calculate V = how many shards contain a 0
+///     count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
+/// )
+/// ```
+///
+/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
+#[derive(Clone)]
+pub struct HyperLogLogVec<const N: usize> {
+    core: Arc<HyperLogLogVecCore<N>>,
+}
+
+struct HyperLogLogVecCore<const N: usize> {
+    pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
+    pub desc: core::Desc,
+    pub opts: Opts,
+}
+
+impl<const N: usize> core::Collector for HyperLogLogVec<N> {
+    fn desc(&self) -> Vec<&core::Desc> {
+        vec![&self.core.desc]
+    }
+
+    fn collect(&self) -> Vec<proto::MetricFamily> {
+        let mut m = proto::MetricFamily::default();
+        m.set_name(self.core.desc.fq_name.clone());
+        m.set_help(self.core.desc.help.clone());
+        m.set_field_type(proto::MetricType::GAUGE);
+
+        let mut metrics = Vec::new();
+        for child in self.core.children.read().unwrap().values() {
+            child.core.collect_into(&mut metrics);
+        }
+        m.set_metric(metrics);
+
+        vec![m]
+    }
+}
+
+impl<const N: usize> HyperLogLogVec<N> {
+    /// Create a new [`HyperLogLogVec`] based on the provided
+    /// [`Opts`] and partitioned by the given label names. At least one label name must be
+    /// provided.
+    pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
+        assert!(N.is_power_of_two());
+        let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
+        let opts = opts.variable_labels(variable_names);
+
+        let desc = opts.describe()?;
+        let v = HyperLogLogVecCore {
+            children: RwLock::new(HashMap::default()),
+            desc,
+            opts,
+        };
+
+        Ok(Self { core: Arc::new(v) })
+    }
+
+    /// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
+    /// of label values (same order as the VariableLabels in Desc). If that combination of
+    /// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
+    ///
+    /// An error is returned if the number of label values is not the same as the
+    /// number of VariableLabels in Desc.
+    pub fn get_metric_with_label_values(
+        &self,
+        vals: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        self.core.get_metric_with_label_values(vals)
+    }
+
+    /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
+    /// occurs.
+    pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
+        self.get_metric_with_label_values(vals).unwrap()
+    }
+}
+
+impl<const N: usize> HyperLogLogVecCore<N> {
+    pub fn get_metric_with_label_values(
+        &self,
+        vals: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        let h = self.hash_label_values(vals)?;
+
+        if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
+            return Ok(metric);
+        }
+
+        self.get_or_create_metric(h, vals)
+    }
+
+    pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
+        if vals.len() != self.desc.variable_labels.len() {
+            return Err(prometheus::Error::InconsistentCardinality {
+                expect: self.desc.variable_labels.len(),
+                got: vals.len(),
+            });
+        }
+
+        let mut h = xxh3::Hash64::default();
+        for val in vals {
+            h.write(val.as_bytes());
+        }
+
+        Ok(h.finish())
+    }
+
+    fn get_or_create_metric(
+        &self,
+        hash: u64,
+        label_values: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        let mut children = self.children.write().unwrap();
+        // Check exist first.
+        if let Some(metric) = children.get(&hash).cloned() {
+            return Ok(metric);
+        }
+
+        let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
+        children.insert(hash, metric.clone());
+        Ok(metric)
+    }
+}
+
+/// HLL is a probabilistic cardinality measure.
+///
+/// How to use this time-series for a metric name `my_metrics_total_hll`:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// If you want an estimate over time, you can use the following query:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (
+///                 max_over_time(my_metrics_total_hll{}[$__rate_interval])
+///             ) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// In the case of low cardinality, you might want to use the linear counting approximation:
+///
+/// ```promql
+/// # LinearCounting(m, V) = m log (m / V)
+/// shards_count * ln(shards_count /
+///     # calculate V = how many shards contain a 0
+///     count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
+/// )
+/// ```
+///
+/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
+#[derive(Clone)]
+pub struct HyperLogLog<const N: usize> {
+    core: Arc<HyperLogLogCore<N>>,
+}
+
+impl<const N: usize> HyperLogLog<N> {
+    /// Create a [`HyperLogLog`] with the `name` and `help` arguments.
+    pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
+        assert!(N.is_power_of_two());
+        let opts = Opts::new(name, help);
+        Self::with_opts(opts)
+    }
+
+    /// Create a [`HyperLogLog`] with the `opts` options.
+    pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
+        Self::with_opts_and_label_values(&opts, &[])
+    }
+
+    fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
+        let desc = opts.describe()?;
+        let labels = make_label_pairs(&desc, label_values)?;
+
+        let v = HyperLogLogCore {
+            shards: [0; N].map(AtomicU8::new),
+            desc,
+            labels,
+        };
+        Ok(Self { core: Arc::new(v) })
+    }
+
+    pub fn measure(&self, item: &impl Hash) {
+        // changing the hasher will break compatibility with previous measurements.
+        self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
+    }
+
+    fn record(&self, hash: u64) {
+        let p = N.ilog2() as u8;
+        let j = hash & (N as u64 - 1);
+        let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
+        self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
+    }
+}
+
+struct HyperLogLogCore<const N: usize> {
+    shards: [AtomicU8; N],
+    desc: core::Desc,
+    labels: Vec<proto::LabelPair>,
+}
+
+impl<const N: usize> core::Collector for HyperLogLog<N> {
+    fn desc(&self) -> Vec<&core::Desc> {
+        vec![&self.core.desc]
+    }
+
+    fn collect(&self) -> Vec<proto::MetricFamily> {
+        let mut m = proto::MetricFamily::default();
+        m.set_name(self.core.desc.fq_name.clone());
+        m.set_help(self.core.desc.help.clone());
+        m.set_field_type(proto::MetricType::GAUGE);
+
+        let mut metrics = Vec::new();
+        self.core.collect_into(&mut metrics);
+        m.set_metric(metrics);
+
+        vec![m]
+    }
+}
+
+impl<const N: usize> HyperLogLogCore<N> {
+    fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
+        self.shards.iter().enumerate().for_each(|(i, x)| {
+            let mut shard_label = proto::LabelPair::default();
+            shard_label.set_name("hll_shard".to_owned());
+            shard_label.set_value(format!("{i}"));
+
+            // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
+
+            // This seems like it would be a race condition,
+            // but HLL is not impacted by a write in one shard happening in between.
+            // This is because in PromQL we will be implementing a harmonic mean of all buckets.
+            // we will also merge samples in a time series using `max by (hll_shard)`.
+
+            // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
+            // this would mean that a dev port-forwarding the metrics url won't break the sampling.
+            let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
+
+            let mut m = proto::Metric::default();
+            let mut c = proto::Gauge::default();
+            c.set_value(v as f64);
+            m.set_gauge(c);
+
+            let mut labels = Vec::with_capacity(self.labels.len() + 1);
+            labels.extend_from_slice(&self.labels);
+            labels.push(shard_label);
+
+            m.set_label(labels);
+            metrics.push(m);
+        })
+    }
+}
+
+fn make_label_pairs(
+    desc: &core::Desc,
+    label_values: &[&str],
+) -> prometheus::Result<Vec<proto::LabelPair>> {
+    if desc.variable_labels.len() != label_values.len() {
+        return Err(prometheus::Error::InconsistentCardinality {
+            expect: desc.variable_labels.len(),
+            got: label_values.len(),
+        });
+    }
+
+    let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
+    if total_len == 0 {
+        return Ok(vec![]);
+    }
+
+    if desc.variable_labels.is_empty() {
+        return Ok(desc.const_label_pairs.clone());
+    }
+
+    let mut label_pairs = Vec::with_capacity(total_len);
+    for (i, n) in desc.variable_labels.iter().enumerate() {
+        let mut label_pair = proto::LabelPair::default();
+        label_pair.set_name(n.clone());
+        label_pair.set_value(label_values[i].to_owned());
+        label_pairs.push(label_pair);
+    }
+
+    for label_pair in &desc.const_label_pairs {
+        label_pairs.push(label_pair.clone());
+    }
+    label_pairs.sort();
+    Ok(label_pairs)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashSet;
+
+    use prometheus::{proto, Opts};
+    use rand::{rngs::StdRng, Rng, SeedableRng};
+    use rand_distr::{Distribution, Zipf};
+
+    use crate::HyperLogLogVec;
+
+    fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
+        let mut metrics = vec![];
+        hll.core
+            .children
+            .read()
+            .unwrap()
+            .values()
+            .for_each(|c| c.core.collect_into(&mut metrics));
+        metrics
+    }
+    fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
+        let mut buckets = [0.0; 32];
+        for metric in metrics.chunks_exact(32) {
+            if filter(&metric[0]) {
+                for (i, m) in metric.iter().enumerate() {
+                    buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
+                }
+            }
+        }
+
+        buckets
+            .into_iter()
+            .map(|f| 2.0f64.powf(-f))
+            .sum::<f64>()
+            .recip()
+            * 0.697
+            * 32.0
+            * 32.0
+    }
+
+    fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
+        let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
+
+        let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
+        let mut set_a = HashSet::new();
+        let mut set_b = HashSet::new();
+
+        for x in iter.by_ref().take(n) {
+            set_a.insert(x.to_bits());
+            hll.with_label_values(&["a"]).measure(&x.to_bits());
+        }
+        for x in iter.by_ref().take(n) {
+            set_b.insert(x.to_bits());
+            hll.with_label_values(&["b"]).measure(&x.to_bits());
+        }
+        let merge = &set_a | &set_b;
+
+        let metrics = collect(&hll);
+        let len = get_cardinality(&metrics, |_| true);
+        let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
+        let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
+
+        ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
+    }
+
+    #[test]
+    fn test_cardinality_small() {
+        let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap());
+
+        assert_eq!(actual, [46, 30, 32]);
+        assert!(51.3 < estimate[0] && estimate[0] < 51.4);
+        assert!(44.0 < estimate[1] && estimate[1] < 44.1);
+        assert!(39.0 < estimate[2] && estimate[2] < 39.1);
+    }
+
+    #[test]
+    fn test_cardinality_medium() {
+        let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap());
+
+        assert_eq!(actual, [2529, 1618, 1629]);
+        assert!(2309.1 < estimate[0] && estimate[0] < 2309.2);
+        assert!(1566.6 < estimate[1] && estimate[1] < 1566.7);
+        assert!(1629.5 < estimate[2] && estimate[2] < 1629.6);
+    }
+
+    #[test]
+    fn test_cardinality_large() {
+        let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap());
+
+        assert_eq!(actual, [129077, 79579, 79630]);
+        assert!(126067.2 < estimate[0] && estimate[0] < 126067.3);
+        assert!(83076.8 < estimate[1] && estimate[1] < 83076.9);
+        assert!(64251.2 < estimate[2] && estimate[2] < 64251.3);
+    }
+
+    #[test]
+    fn test_cardinality_small2() {
+        let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap());
+
+        assert_eq!(actual, [92, 58, 60]);
+        assert!(116.1 < estimate[0] && estimate[0] < 116.2);
+        assert!(81.7 < estimate[1] && estimate[1] < 81.8);
+        assert!(69.3 < estimate[2] && estimate[2] < 69.4);
+    }
+
+    #[test]
+    fn test_cardinality_medium2() {
+        let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap());
+
+        assert_eq!(actual, [8201, 5131, 5051]);
+        assert!(6846.4 < estimate[0] && estimate[0] < 6846.5);
+        assert!(5239.1 < estimate[1] && estimate[1] < 5239.2);
+        assert!(4292.8 < estimate[2] && estimate[2] < 4292.9);
+    }
+
+    #[test]
+    fn test_cardinality_large2() {
+        let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap());
+
+        assert_eq!(actual, [777847, 482069, 482246]);
+        assert!(699437.4 < estimate[0] && estimate[0] < 699437.5);
+        assert!(374948.9 < estimate[1] && estimate[1] < 374949.0);
+        assert!(434609.7 < estimate[2] && estimate[2] < 434609.8);
+    }
+}
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -28,7 +28,9 @@ use prometheus::{Registry, Result};
 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
+mod hll;
 pub mod metric_vec_duration;
+pub use hll::{HyperLogLog, HyperLogLogVec};

 pub type UIntGauge = GenericGauge<AtomicU64>;
 pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -8,6 +8,7 @@ use std::pin::Pin;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use std::time::SystemTime;

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
@@ -23,6 +24,7 @@ use futures::stream::Stream;
 use futures_util::StreamExt;
 use http_types::{StatusCode, Url};
 use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::debug;

 use crate::s3_bucket::RequestKind;
@@ -183,7 +185,6 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
    }
 }

-#[async_trait::async_trait]
 impl RemoteStorage for AzureBlobStorage {
    async fn list(
        &self,
@@ -371,6 +372,20 @@ impl RemoteStorage for AzureBlobStorage {
            copy_status = status;
        }
    }
+
+    async fn time_travel_recover(
+        &self,
+        _prefix: Option<&RemotePath>,
+        _timestamp: SystemTime,
+        _done_if_after: SystemTime,
+        _cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        // TODO use Azure point in time recovery feature for this
+        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
+        Err(anyhow::anyhow!(
+            "time travel recovery for azure blob storage is not implemented"
+        ))
+    }
 }

 pin_project_lite::pin_project! {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -25,6 +25,7 @@ use bytes::Bytes;
 use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
+use tokio_util::sync::CancellationToken;
 use toml_edit::Item;
 use tracing::info;

@@ -142,7 +143,7 @@ pub struct Listing {
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
-#[async_trait::async_trait]
+#[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
    /// Lists all top level subdirectories for a given prefix
    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
@@ -210,6 +211,15 @@ pub trait RemoteStorage: Send + Sync + 'static {

    /// Copy a remote object inside a bucket from one path to another.
    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
+
+    /// Resets the content of everything with the given prefix to the given state
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()>;
 }

 pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -262,14 +272,15 @@ impl std::error::Error for DownloadError {}
 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
 #[derive(Clone)]
-pub enum GenericRemoteStorage {
+// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
+pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
    AzureBlob(Arc<AzureBlobStorage>),
-    Unreliable(Arc<UnreliableWrapper>),
+    Unreliable(Other),
 }

-impl GenericRemoteStorage {
+impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
    pub async fn list(
        &self,
        prefix: Option<&RemotePath>,
@@ -386,6 +397,33 @@ impl GenericRemoteStorage {
            Self::Unreliable(s) => s.copy(from, to).await,
        }
    }
+
+    pub async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        match self {
+            Self::LocalFs(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::AwsS3(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::AzureBlob(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::Unreliable(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+        }
+    }
 }

 impl GenericRemoteStorage {
@@ -673,6 +711,7 @@ impl ConcurrencyLimiter {
            RequestKind::List => &self.read,
            RequestKind::Delete => &self.write,
            RequestKind::Copy => &self.write,
+            RequestKind::TimeTravel => &self.write,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -4,7 +4,7 @@
 //! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.

-use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
+use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};

 use anyhow::{bail, ensure, Context};
 use bytes::Bytes;
@@ -14,7 +14,7 @@ use tokio::{
    fs,
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
-use tokio_util::io::ReaderStream;
+use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

@@ -157,7 +157,6 @@ impl LocalFs {
    }
 }

-#[async_trait::async_trait]
 impl RemoteStorage for LocalFs {
    async fn list(
        &self,
@@ -423,6 +422,17 @@ impl RemoteStorage for LocalFs {
        })?;
        Ok(())
    }
+
+    #[allow(clippy::diverging_sub_expression)]
+    async fn time_travel_recover(
+        &self,
+        _prefix: Option<&RemotePath>,
+        _timestamp: SystemTime,
+        _done_if_after: SystemTime,
+        _cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        unimplemented!()
+    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -6,12 +6,14 @@

 use std::{
    borrow::Cow,
+    collections::HashMap,
    pin::Pin,
    sync::Arc,
    task::{Context, Poll},
+    time::SystemTime,
 };

-use anyhow::Context as _;
+use anyhow::{anyhow, Context as _};
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider,
@@ -27,17 +29,19 @@ use aws_sdk_s3::{
    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
-    types::{Delete, ObjectIdentifier},
+    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
    Client,
 };
 use aws_smithy_async::rt::sleep::TokioSleep;

-use aws_smithy_types::body::SdkBody;
 use aws_smithy_types::byte_stream::ByteStream;
+use aws_smithy_types::{body::SdkBody, DateTime};
 use bytes::Bytes;
 use futures::stream::Stream;
 use hyper::Body;
 use scopeguard::ScopeGuard;
+use tokio_util::sync::CancellationToken;
+use utils::backoff;

 use super::StorageMetadata;
 use crate::{
@@ -270,6 +274,59 @@ impl S3Bucket {
            }
        }
    }
+
+    async fn delete_oids(
+        &self,
+        kind: RequestKind,
+        delete_objects: &[ObjectIdentifier],
+    ) -> anyhow::Result<()> {
+        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
+            let started_at = start_measuring_requests(kind);
+
+            let resp = self
+                .client
+                .delete_objects()
+                .bucket(self.bucket_name.clone())
+                .delete(
+                    Delete::builder()
+                        .set_objects(Some(chunk.to_vec()))
+                        .build()?,
+                )
+                .send()
+                .await;
+
+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &resp, started_at);
+
+            let resp = resp?;
+            metrics::BUCKET_METRICS
+                .deleted_objects_total
+                .inc_by(chunk.len() as u64);
+            if let Some(errors) = resp.errors {
+                // Log a bounded number of the errors within the response:
+                // these requests can carry 1000 keys so logging each one
+                // would be too verbose, especially as errors may lead us
+                // to retry repeatedly.
+                const LOG_UP_TO_N_ERRORS: usize = 10;
+                for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
+                    tracing::warn!(
+                        "DeleteObjects key {} failed: {}: {}",
+                        e.key.as_ref().map(Cow::from).unwrap_or("".into()),
+                        e.code.as_ref().map(Cow::from).unwrap_or("".into()),
+                        e.message.as_ref().map(Cow::from).unwrap_or("".into())
+                    );
+                }
+
+                return Err(anyhow::format_err!(
+                    "Failed to delete {} objects",
+                    errors.len()
+                ));
+            }
+        }
+        Ok(())
+    }
 }

 pin_project_lite::pin_project! {
@@ -373,7 +430,6 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
    }
 }

-#[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
    async fn list(
        &self,
@@ -569,64 +625,195 @@ impl RemoteStorage for S3Bucket {
            delete_objects.push(obj_id);
        }

-        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
-            let started_at = start_measuring_requests(kind);
-
-            let resp = self
-                .client
-                .delete_objects()
-                .bucket(self.bucket_name.clone())
-                .delete(
-                    Delete::builder()
-                        .set_objects(Some(chunk.to_vec()))
-                        .build()?,
-                )
-                .send()
-                .await;
-
-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &resp, started_at);
-
-            match resp {
-                Ok(resp) => {
-                    metrics::BUCKET_METRICS
-                        .deleted_objects_total
-                        .inc_by(chunk.len() as u64);
-                    if let Some(errors) = resp.errors {
-                        // Log a bounded number of the errors within the response:
-                        // these requests can carry 1000 keys so logging each one
-                        // would be too verbose, especially as errors may lead us
-                        // to retry repeatedly.
-                        const LOG_UP_TO_N_ERRORS: usize = 10;
-                        for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
-                            tracing::warn!(
-                                "DeleteObjects key {} failed: {}: {}",
-                                e.key.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.code.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.message.as_ref().map(Cow::from).unwrap_or("".into())
-                            );
-                        }
-
-                        return Err(anyhow::format_err!(
-                            "Failed to delete {} objects",
-                            errors.len()
-                        ));
-                    }
-                }
-                Err(e) => {
-                    return Err(e.into());
-                }
-            }
-        }
-        Ok(())
+        self.delete_oids(kind, &delete_objects).await
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let paths = std::array::from_ref(path);
        self.delete_objects(paths).await
    }
+
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        let kind = RequestKind::TimeTravel;
+        let _guard = self.permit(kind).await;
+
+        let timestamp = DateTime::from(timestamp);
+        let done_if_after = DateTime::from(done_if_after);
+
+        tracing::info!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
+
+        // get the passed prefix or if it is not set use prefix_in_bucket value
+        let prefix = prefix
+            .map(|p| self.relative_path_to_s3_object(p))
+            .or_else(|| self.prefix_in_bucket.clone());
+
+        let warn_threshold = 3;
+        let max_retries = 10;
+        let is_permanent = |_e: &_| false;
+
+        let mut key_marker = None;
+        let mut version_id_marker = None;
+        let mut versions_and_deletes = Vec::new();
+
+        loop {
+            let response = backoff::retry(
+                || async {
+                    Ok(self
+                        .client
+                        .list_object_versions()
+                        .bucket(self.bucket_name.clone())
+                        .set_prefix(prefix.clone())
+                        .set_key_marker(key_marker.clone())
+                        .set_version_id_marker(version_id_marker.clone())
+                        .send()
+                        .await?)
+                },
+                is_permanent,
+                warn_threshold,
+                max_retries,
+                "listing object versions for time_travel_recover",
+                backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+            )
+            .await?;
+
+            tracing::trace!(
+                "  Got List response version_id_marker={:?}, key_marker={:?}",
+                response.version_id_marker,
+                response.key_marker
+            );
+            let versions = response.versions.unwrap_or_default();
+            let delete_markers = response.delete_markers.unwrap_or_default();
+            let new_versions = versions.into_iter().map(VerOrDelete::Version);
+            let new_deletes = delete_markers.into_iter().map(VerOrDelete::DeleteMarker);
+            let new_versions_and_deletes = new_versions.chain(new_deletes);
+            versions_and_deletes.extend(new_versions_and_deletes);
+            fn none_if_empty(v: Option<String>) -> Option<String> {
+                v.filter(|v| !v.is_empty())
+            }
+            version_id_marker = none_if_empty(response.next_version_id_marker);
+            key_marker = none_if_empty(response.next_key_marker);
+            if version_id_marker.is_none() {
+                // The final response is not supposed to be truncated
+                if response.is_truncated.unwrap_or_default() {
+                    anyhow::bail!(
+                        "Received truncated ListObjectVersions response for prefix={prefix:?}"
+                    );
+                }
+                break;
+            }
+        }
+
+        // Work on the list of references instead of the objects directly,
+        // otherwise we get lifetime errors in the sort_by_key call below.
+        let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
+
+        versions_and_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
+
+        let mut vds_for_key = HashMap::<_, Vec<_>>::new();
+
+        for vd in &versions_and_deletes {
+            let last_modified = vd.last_modified();
+            let version_id = vd.version_id();
+            let key = vd.key();
+            let (Some(last_modified), Some(version_id), Some(key)) =
+                (last_modified, version_id, key)
+            else {
+                anyhow::bail!(
+                    "One (or more) of last_modified, key, and id is None. \
+                    Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
+                    last_modified, key, version_id,
+                );
+            };
+            if version_id == "null" {
+                anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
+                    indicating either disabled versioning, or legacy objects with null version id values");
+            }
+            tracing::trace!(
+                "Parsing version key={key} version_id={version_id} is_delete={}",
+                matches!(vd, VerOrDelete::DeleteMarker(_))
+            );
+
+            vds_for_key
+                .entry(key)
+                .or_default()
+                .push((vd, last_modified, version_id));
+        }
+        for (key, versions) in vds_for_key {
+            let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
+            if last_last_modified > &&done_if_after {
+                tracing::trace!("Key {key} has version later than done_if_after, skipping");
+                continue;
+            }
+            // the version we want to restore to.
+            let version_to_restore_to =
+                match versions.binary_search_by_key(&timestamp, |tpl| *tpl.1) {
+                    Ok(v) => v,
+                    Err(e) => e,
+                };
+            if version_to_restore_to == versions.len() {
+                tracing::trace!("Key {key} has no changes since timestamp, skipping");
+                continue;
+            }
+            let mut do_delete = false;
+            if version_to_restore_to == 0 {
+                // All versions more recent, so the key didn't exist at the specified time point.
+                tracing::trace!(
+                    "All {} versions more recent for {key}, deleting",
+                    versions.len()
+                );
+                do_delete = true;
+            } else {
+                match &versions[version_to_restore_to - 1] {
+                    (VerOrDelete::Version(_), _last_modified, version_id) => {
+                        tracing::trace!("Copying old version {version_id} for {key}...");
+                        // Restore the state to the last version by copying
+                        let source_id =
+                            format!("{}/{key}?versionId={version_id}", self.bucket_name);
+
+                        backoff::retry(
+                            || async {
+                                Ok(self
+                                    .client
+                                    .copy_object()
+                                    .bucket(self.bucket_name.clone())
+                                    .key(key)
+                                    .copy_source(&source_id)
+                                    .send()
+                                    .await?)
+                            },
+                            is_permanent,
+                            warn_threshold,
+                            max_retries,
+                            "listing object versions for time_travel_recover",
+                            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+                        )
+                        .await?;
+                    }
+                    (VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
+                        do_delete = true;
+                    }
+                }
+            };
+            if do_delete {
+                if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
+                    // Key has since been deleted (but there was some history), no need to do anything
+                    tracing::trace!("Key {key} already deleted, skipping.");
+                } else {
+                    tracing::trace!("Deleting {key}...");
+
+                    let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?;
+                    self.delete_oids(kind, &[oid]).await?;
+                }
+            }
+        }
+        Ok(())
+    }
 }

 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
@@ -651,6 +838,32 @@ fn start_measuring_requests(
    })
 }

+enum VerOrDelete {
+    Version(ObjectVersion),
+    DeleteMarker(DeleteMarkerEntry),
+}
+
+impl VerOrDelete {
+    fn last_modified(&self) -> Option<&DateTime> {
+        match self {
+            VerOrDelete::Version(v) => v.last_modified(),
+            VerOrDelete::DeleteMarker(v) => v.last_modified(),
+        }
+    }
+    fn version_id(&self) -> Option<&str> {
+        match self {
+            VerOrDelete::Version(v) => v.version_id(),
+            VerOrDelete::DeleteMarker(v) => v.version_id(),
+        }
+    }
+    fn key(&self) -> Option<&str> {
+        match self {
+            VerOrDelete::Version(v) => v.key(),
+            VerOrDelete::DeleteMarker(v) => v.key(),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use camino::Utf8Path;
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -12,6 +12,7 @@ pub(crate) enum RequestKind {
    Delete = 2,
    List = 3,
    Copy = 4,
+    TimeTravel = 5,
 }

 use RequestKind::*;
@@ -24,6 +25,7 @@ impl RequestKind {
            Delete => "delete_object",
            List => "list_objects",
            Copy => "copy_object",
+            TimeTravel => "time_travel_recover",
        }
    }
    const fn as_index(&self) -> usize {
@@ -31,7 +33,7 @@ impl RequestKind {
    }
 }

-pub(super) struct RequestTyped<C>([C; 5]);
+pub(super) struct RequestTyped<C>([C; 6]);

 impl<C> RequestTyped<C> {
    pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -40,8 +42,8 @@ impl<C> RequestTyped<C> {

    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy].into_iter();
-        let arr = std::array::from_fn::<C, 5, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
+        let arr = std::array::from_fn::<C, 6, _>(|index| {
            let next = it.next().unwrap();
            assert_eq!(index, next.as_index());
            f(next)
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -3,16 +3,19 @@
 //! testing purposes.
 use bytes::Bytes;
 use futures::stream::Stream;
-use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
+use std::time::SystemTime;
+use std::{collections::hash_map::Entry, sync::Arc};
+use tokio_util::sync::CancellationToken;

 use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
+    Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
+    StorageMetadata,
 };

 pub struct UnreliableWrapper {
-    inner: crate::GenericRemoteStorage,
+    inner: GenericRemoteStorage<Arc<VoidStorage>>,

    // This many attempts of each operation will fail, then we let it succeed.
    attempts_to_fail: u64,
@@ -29,11 +32,21 @@ enum RemoteOp {
    Download(RemotePath),
    Delete(RemotePath),
    DeleteObjects(Vec<RemotePath>),
+    TimeTravelRecover(Option<RemotePath>),
 }

 impl UnreliableWrapper {
    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
        assert!(attempts_to_fail > 0);
+        let inner = match inner {
+            GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
+            GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s),
+            GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s),
+            // We could also make this a no-op, as in, extract the inner of the passed generic remote storage
+            GenericRemoteStorage::Unreliable(_s) => {
+                panic!("Can't wrap unreliable wrapper unreliably")
+            }
+        };
        UnreliableWrapper {
            inner,
            attempts_to_fail,
@@ -84,7 +97,9 @@ impl UnreliableWrapper {
    }
 }

-#[async_trait::async_trait]
+// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage.
+type VoidStorage = crate::LocalFs;
+
 impl RemoteStorage for UnreliableWrapper {
    async fn list_prefixes(
        &self,
@@ -169,4 +184,17 @@ impl RemoteStorage for UnreliableWrapper {
        self.attempt(RemoteOp::Upload(to.clone()))?;
        self.inner.copy_object(from, to).await
    }
+
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?;
+        self.inner
+            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
+            .await
+    }
 }
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,15 +1,21 @@
-use std::collections::HashSet;
 use std::env;
+use std::fmt::{Debug, Display};
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::sync::Arc;
-use std::time::UNIX_EPOCH;
+use std::time::{Duration, UNIX_EPOCH};
+use std::{collections::HashSet, time::SystemTime};

+use crate::common::{download_to_vec, upload_stream};
 use anyhow::Context;
+use camino::Utf8Path;
+use futures_util::Future;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
+use test_context::test_context;
 use test_context::AsyncTestContext;
+use tokio_util::sync::CancellationToken;
 use tracing::info;

 mod common;
@@ -18,11 +24,160 @@ mod common;
 mod tests_s3;

 use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
+use utils::backoff;

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

 const BASE_PREFIX: &str = "test";

+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledStorage::Enabled(ctx) => ctx,
+        MaybeEnabledStorage::Disabled => return Ok(()),
+    };
+    // Our test depends on discrepancies in the clock between S3 and the environment the tests
+    // run in. Therefore, wait a little bit before and after. The alternative would be
+    // to take the time from S3 response headers.
+    const WAIT_TIME: Duration = Duration::from_millis(3_000);
+
+    async fn retry<T, O, F, E>(op: O) -> Result<T, E>
+    where
+        E: Display + Debug + 'static,
+        O: FnMut() -> F,
+        F: Future<Output = Result<T, E>>,
+    {
+        let warn_threshold = 3;
+        let max_retries = 10;
+        backoff::retry(
+            op,
+            |_e| false,
+            warn_threshold,
+            max_retries,
+            "test retry",
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+        )
+        .await
+    }
+
+    async fn time_point() -> SystemTime {
+        tokio::time::sleep(WAIT_TIME).await;
+        let ret = SystemTime::now();
+        tokio::time::sleep(WAIT_TIME).await;
+        ret
+    }
+
+    async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
+        Ok(retry(|| client.list_files(None))
+            .await
+            .context("list root files failure")?
+            .into_iter()
+            .collect::<HashSet<_>>())
+    }
+
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    retry(|| {
+        let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+        ctx.client.upload(data, len, &path1, None)
+    })
+    .await?;
+
+    let t0_files = list_files(&ctx.client).await?;
+    let t0 = time_point().await;
+    println!("at t0: {t0_files:?}");
+
+    let old_data = "remote blob data2";
+
+    retry(|| {
+        let (data, len) = upload_stream(old_data.as_bytes().into());
+        ctx.client.upload(data, len, &path2, None)
+    })
+    .await?;
+
+    let t1_files = list_files(&ctx.client).await?;
+    let t1 = time_point().await;
+    println!("at t1: {t1_files:?}");
+
+    // A little check to ensure that our clock is not too far off from the S3 clock
+    {
+        let dl = retry(|| ctx.client.download(&path2)).await?;
+        let last_modified = dl.last_modified.unwrap();
+        let half_wt = WAIT_TIME.mul_f32(0.5);
+        let t0_hwt = t0 + half_wt;
+        let t1_hwt = t1 - half_wt;
+        if !(t0_hwt..=t1_hwt).contains(&last_modified) {
+            panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \
+                This likely means a large lock discrepancy between S3 and the local clock.");
+        }
+    }
+
+    retry(|| {
+        let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+        ctx.client.upload(data, len, &path3, None)
+    })
+    .await?;
+
+    let new_data = "new remote blob data2";
+
+    retry(|| {
+        let (data, len) = upload_stream(new_data.as_bytes().into());
+        ctx.client.upload(data, len, &path2, None)
+    })
+    .await?;
+
+    retry(|| ctx.client.delete(&path1)).await?;
+    let t2_files = list_files(&ctx.client).await?;
+    let t2 = time_point().await;
+    println!("at t2: {t2_files:?}");
+
+    // No changes after recovery to t2 (no-op)
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t2, t_final, CancellationToken::new())
+        .await?;
+    let t2_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t2: {t2_files_recovered:?}");
+    assert_eq!(t2_files, t2_files_recovered);
+    let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    assert_eq!(path2_recovered_t2, new_data.as_bytes());
+
+    // after recovery to t1: path1 is back, path2 has the old content
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t1, t_final, CancellationToken::new())
+        .await?;
+    let t1_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t1: {t1_files_recovered:?}");
+    assert_eq!(t1_files, t1_files_recovered);
+    let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    assert_eq!(path2_recovered_t1, old_data.as_bytes());
+
+    // after recovery to t0: everything is gone except for path1
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t0, t_final, CancellationToken::new())
+        .await?;
+    let t0_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t0: {t0_files_recovered:?}");
+    assert_eq!(t0_files, t0_files_recovered);
+
+    // cleanup
+
+    let paths = &[path1, path2, path3];
+    retry(|| ctx.client.delete_objects(paths)).await?;
+
+    Ok(())
+}
+
 struct EnabledS3 {
    client: Arc<GenericRemoteStorage>,
    base_prefix: &'static str,
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::{
    borrow::Cow,
    fs::{self, File},
-    io::{self, Write},
+    io,
 };

 use camino::{Utf8Path, Utf8PathBuf};
@@ -112,48 +112,6 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
    tokio::fs::File::open(path.as_ref()).await?.sync_all().await
 }

-/// Writes a file to the specified `final_path` in a crash safe fasion
-///
-/// The file is first written to the specified tmp_path, and in a second
-/// step, the tmp path is renamed to the final path. As renames are
-/// atomic, a crash during the write operation will never leave behind a
-/// partially written file.
-///
-/// NB: an async variant of this code exists in Pageserver's VirtualFile.
-pub fn overwrite(
-    final_path: &Utf8Path,
-    tmp_path: &Utf8Path,
-    content: &[u8],
-) -> std::io::Result<()> {
-    let Some(final_path_parent) = final_path.parent() else {
-        return Err(std::io::Error::from_raw_os_error(
-            nix::errno::Errno::EINVAL as i32,
-        ));
-    };
-    std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
-    let mut file = std::fs::OpenOptions::new()
-        .write(true)
-        // Use `create_new` so that, if we race with ourselves or something else,
-        // we bail out instead of causing damage.
-        .create_new(true)
-        .open(tmp_path)?;
-    file.write_all(content)?;
-    file.sync_all()?;
-    drop(file); // before the rename, that's important!
-                // renames are atomic
-    std::fs::rename(tmp_path, final_path)?;
-    // Only open final path parent dirfd now, so that this operation only
-    // ever holds one VirtualFile fd at a time.  That's important because
-    // the current `find_victim_slot` impl might pick the same slot for both
-    // VirtualFile., and it eventually does a blocking write lock instead of
-    // try_lock.
-    let final_parent_dirfd = std::fs::OpenOptions::new()
-        .read(true)
-        .open(final_path_parent)?;
-    final_parent_dirfd.sync_all()?;
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {

--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -131,7 +131,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
-        _ => error!("Error processing HTTP request: {api_error:#}"),
+        ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
+        ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
+        _ => info!("Error processing HTTP request: {api_error:#}"),
    }

    api_error.into_response()
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -61,6 +61,7 @@ sync_wrapper.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-epoll-uring.workspace = true
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -18,7 +18,7 @@ use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
 use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
 use pageserver::tenant::storage_layer::range_overlaps;
-use pageserver::virtual_file::VirtualFile;
+use pageserver::virtual_file::{self, VirtualFile};

 use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(10);
+    pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
    pageserver::page_cache::init(100);

    let mut total_delta_layers = 0usize;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10);
+    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
    page_cache::init(100);
    let file = FileBlockReader::new(VirtualFile::open(path).await?);
    let summary_blk = file.read_blk(0, ctx).await?;
@@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            new_tenant_id,
            new_timeline_id,
        } => {
-            pageserver::virtual_file::init(10);
+            pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
            pageserver::page_cache::init(100);

            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {

 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
-    virtual_file::init(10);
+    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx).await
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -130,7 +130,7 @@ fn main() -> anyhow::Result<()> {
    let scenario = failpoint_support::init();

    // Basic initialization of things that don't change after startup
-    virtual_file::init(conf.max_file_descriptors);
+    virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
    page_cache::init(conf.page_cache_size);

    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -36,6 +36,7 @@ use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
+use crate::virtual_file;
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
@@ -43,6 +44,8 @@ use crate::{

 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;

+use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE;
+
 pub mod defaults {
    use crate::tenant::config::defaults::*;
    use const_format::formatcp;
@@ -79,6 +82,8 @@ pub mod defaults {

    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

+    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
+
    ///
    /// Default built-in configuration file.
    ///
@@ -114,6 +119,8 @@ pub mod defaults {

 #ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}

+#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -247,6 +254,8 @@ pub struct PageServerConf {

    /// Maximum number of WAL records to be ingested and committed at the same time
    pub ingest_batch_size: u64,
+
+    pub virtual_file_io_engine: virtual_file::IoEngineKind,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -331,6 +340,8 @@ struct PageServerConfigBuilder {
    secondary_download_concurrency: BuilderValue<usize>,

    ingest_batch_size: BuilderValue<u64>,
+
+    virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
 }

 impl Default for PageServerConfigBuilder {
@@ -406,6 +417,8 @@ impl Default for PageServerConfigBuilder {
            secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),

            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
+
+            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
        }
    }
 }
@@ -562,6 +575,10 @@ impl PageServerConfigBuilder {
        self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
    }

+    pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) {
+        self.virtual_file_io_engine = BuilderValue::Set(value);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_warmup = self
            .concurrent_tenant_warmup
@@ -669,6 +686,9 @@ impl PageServerConfigBuilder {
            ingest_batch_size: self
                .ingest_batch_size
                .ok_or(anyhow!("missing ingest_batch_size"))?,
+            virtual_file_io_engine: self
+                .virtual_file_io_engine
+                .ok_or(anyhow!("missing virtual_file_io_engine"))?,
        })
    }
 }
@@ -920,6 +940,9 @@ impl PageServerConf {
                    builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
                },
                "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
+                "virtual_file_io_engine" => {
+                    builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -993,6 +1016,7 @@ impl PageServerConf {
            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
+            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
        }
    }
 }
@@ -1225,6 +1249,7 @@ background_task_maximum_delay = '334 s'
                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
+                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1288,6 +1313,7 @@ background_task_maximum_delay = '334 s'
                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                ingest_batch_size: 100,
+                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -97,23 +97,86 @@ pub enum EvictionOrder {

    /// Order the layers to be evicted by how recently they have been accessed relatively within
    /// the set of resident layers of a tenant.
-    ///
-    /// This strategy will evict layers more fairly but is untested.
    RelativeAccessed {
-        #[serde(default)]
+        /// Determines if the tenant with most layers should lose first.
+        ///
+        /// Having this enabled is currently the only reasonable option, because the order in which
+        /// we read tenants is deterministic. If we find the need to use this as `false`, we need
+        /// to ensure nondeterminism by adding in a random number to break the
+        /// `relative_last_activity==0.0` ties.
+        #[serde(default = "default_highest_layer_count_loses_first")]
        highest_layer_count_loses_first: bool,
    },
 }

+fn default_highest_layer_count_loses_first() -> bool {
+    true
+}
+
 impl EvictionOrder {
-    /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
-    /// counts should be the first ones to have their layers evicted.
-    fn highest_layer_count_loses_first(&self) -> bool {
+    fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
+        use EvictionOrder::*;
+
        match self {
-            EvictionOrder::AbsoluteAccessed => false,
-            EvictionOrder::RelativeAccessed {
+            AbsoluteAccessed => {
+                candidates.sort_unstable_by_key(|(partition, candidate)| {
+                    (*partition, candidate.last_activity_ts)
+                });
+            }
+            RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
+                (*partition, candidate.relative_last_activity)
+            }),
+        }
+    }
+
+    /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants
+    /// layers in **most** recently used order.
+    fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
+        use EvictionOrder::*;
+
+        match self {
+            AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
+            RelativeAccessed {
                highest_layer_count_loses_first,
-            } => *highest_layer_count_loses_first,
+            } => {
+                // keeping the -1 or not decides if every tenant should lose their least recently accessed
+                // layer OR if this should happen in the order of having highest layer count:
+                let fudge = if *highest_layer_count_loses_first {
+                    // relative_last_activity vs. tenant layer count:
+                    // - 0.1..=1.0 (10 layers)
+                    // - 0.01..=1.0 (100 layers)
+                    // - 0.001..=1.0 (1000 layers)
+                    //
+                    // leading to evicting less of the smallest tenants.
+                    0
+                } else {
+                    // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
+                    // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
+                    // be that less than 10k layer evictions is enough, so we would not need to evict from
+                    // all tenants.
+                    //
+                    // as the tenant ordering is now deterministic this could hit the same tenants
+                    // disproportionetly on multiple invocations. alternative could be to remember how many
+                    // layers did we evict last time from this tenant, and inject that as an additional
+                    // fudge here.
+                    1
+                };
+
+                let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
+                let divider = total as f32;
+
+                // most recently used is always (total - 0) / divider == 1.0
+                // least recently used depends on the fudge:
+                // -       (total - 1) - (total - 1) / total => 0 / total
+                // -             total - (total - 1) / total => 1 / total
+                let distance = (total - index) as f32;
+
+                finite_f32::FiniteF32::try_from_normalized(distance / divider)
+                    .unwrap_or_else(|val| {
+                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
+                        finite_f32::FiniteF32::ZERO
+                    })
+            }
        }
    }
 }
@@ -389,52 +452,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    let selection = select_victims(&candidates, usage_pre);

-    let mut candidates = candidates;
-
-    let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
-        // we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
-        // for comparison here. this is a temporary measure to develop alternatives.
-        use std::fmt::Write;
-
-        let mut summary_buf = String::with_capacity(256);
-
-        {
-            let absolute_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{absolute_summary}").expect("string grows");
-
-            info!("absolute accessed selection summary: {summary_buf}");
-        }
-
-        candidates.sort_unstable_by_key(|(partition, candidate)| {
-            (*partition, candidate.relative_last_activity)
-        });
-
-        let selection = select_victims(&candidates, usage_pre);
-
-        {
-            summary_buf.clear();
-
-            let relative_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{relative_summary}").expect("string grows");
-
-            info!("relative accessed selection summary: {summary_buf}");
-        }
-
-        selection
-    } else {
-        selection
-    };
-
    let (evicted_amount, usage_planned) = selection.into_amount_and_planned();

    // phase2: evict layers
@@ -835,54 +852,12 @@ async fn collect_eviction_candidates(
            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
        let mut cumsum: i128 = 0;

-        // keeping the -1 or not decides if every tenant should lose their least recently accessed
-        // layer OR if this should happen in the order of having highest layer count:
-        let fudge = if eviction_order.highest_layer_count_loses_first() {
-            // relative_age vs. tenant layer count:
-            // - 0.1..=1.0 (10 layers)
-            // - 0.01..=1.0 (100 layers)
-            // - 0.001..=1.0 (1000 layers)
-            //
-            // leading to evicting less of the smallest tenants.
-            0
-        } else {
-            // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
-            // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
-            // be that less than 10k layer evictions is enough, so we would not need to evict from
-            // all tenants.
-            //
-            // as the tenant ordering is now deterministic this could hit the same tenants
-            // disproportionetly on multiple invocations. alternative could be to remember how many
-            // layers did we evict last time from this tenant, and inject that as an additional
-            // fudge here.
-            1
-        };
-
-        let total = tenant_candidates
-            .len()
-            .checked_sub(fudge)
-            .filter(|&x| x > 0)
-            // support 0 or 1 resident layer tenants as well
-            .unwrap_or(1);
-        let divider = total as f32;
+        let total = tenant_candidates.len();

        for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
            // as we iterate this reverse sorted list, the most recently accessed layer will always
            // be 1.0; this is for us to evict it last.
-            candidate.relative_last_activity = if matches!(
-                eviction_order,
-                EvictionOrder::RelativeAccessed { .. }
-            ) {
-                // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
-                // similarly for u16. unsure how it would help.
-                finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
-                    .unwrap_or_else(|val| {
-                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
-                        finite_f32::FiniteF32::ZERO
-                    })
-            } else {
-                finite_f32::FiniteF32::ZERO
-            };
+            candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);

            let partition = if cumsum > min_resident_size as i128 {
                MinResidentSizePartition::Above
@@ -927,10 +902,7 @@ async fn collect_eviction_candidates(
    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");

-    // always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
-    // will sort later by candidate.relative_last_activity to get compare evictions.
-    candidates
-        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
+    eviction_order.sort(&mut candidates);

    Ok(EvictionCandidates::Finished(candidates))
 }
@@ -1070,6 +1042,12 @@ pub(crate) mod finite_f32 {
        }
    }

+    impl From<FiniteF32> for f32 {
+        fn from(value: FiniteF32) -> f32 {
+            value.0
+        }
+    }
+
    impl FiniteF32 {
        pub const ZERO: FiniteF32 = FiniteF32(0.0);

@@ -1082,136 +1060,9 @@ pub(crate) mod finite_f32 {
                Err(value)
            }
        }
-    }
-}

-mod summary {
-    use super::finite_f32::FiniteF32;
-    use super::{EvictionCandidate, LayerCount};
-    use pageserver_api::shard::TenantShardId;
-    use std::collections::{BTreeMap, HashMap};
-    use std::time::SystemTime;
-
-    #[derive(Debug, Default)]
-    pub(super) struct EvictionSummary {
-        evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
-        total: LayerCount,
-
-        last_absolute: Option<SystemTime>,
-        last_relative: Option<FiniteF32>,
-    }
-
-    impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
-        fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
-            let mut summary = EvictionSummary::default();
-            for item in iter {
-                let counts = summary
-                    .evicted_per_tenant
-                    .entry(*item.layer.get_tenant_shard_id())
-                    .or_default();
-
-                let sz = item.layer.get_file_size();
-
-                counts.file_sizes += sz;
-                counts.count += 1;
-
-                summary.total.file_sizes += sz;
-                summary.total.count += 1;
-
-                summary.last_absolute = Some(item.last_activity_ts);
-                summary.last_relative = Some(item.relative_last_activity);
-            }
-
-            summary
-        }
-    }
-
-    struct SiBytesAmount(u64);
-
-    impl std::fmt::Display for SiBytesAmount {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            if self.0 < 1024 {
-                return write!(f, "{}B", self.0);
-            }
-
-            let mut tmp = self.0;
-            let mut ch = 0;
-            let suffixes = b"KMGTPE";
-
-            while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
-                tmp /= 1024;
-                ch += 1;
-            }
-
-            let ch = suffixes[ch] as char;
-
-            write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
-        }
-    }
-
-    impl std::fmt::Display for EvictionSummary {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            // wasteful, but it's for testing
-
-            let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
-
-            for (tenant_shard_id, count) in &self.evicted_per_tenant {
-                sorted
-                    .entry(count.count)
-                    .or_default()
-                    .push((*tenant_shard_id, count.file_sizes));
-            }
-
-            let total_file_sizes = SiBytesAmount(self.total.file_sizes);
-
-            writeln!(
-                f,
-                "selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
-                self.total.count, self.last_absolute, self.last_relative,
-            )?;
-
-            for (count, per_tenant) in sorted.iter().rev().take(10) {
-                write!(f, "- {count} layers: ")?;
-
-                if per_tenant.len() < 3 {
-                    for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
-                        if i > 0 {
-                            write!(f, ", ")?;
-                        }
-                        let bytes = SiBytesAmount(*bytes);
-                        write!(f, "{tenant_shard_id} ({bytes})")?;
-                    }
-                } else {
-                    let num_tenants = per_tenant.len();
-                    let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
-                    let total_bytes = SiBytesAmount(total_bytes);
-                    let layers = num_tenants * count;
-
-                    write!(
-                        f,
-                        "{num_tenants} tenants {total_bytes} in total {layers} layers",
-                    )?;
-                }
-
-                writeln!(f)?;
-            }
-
-            if sorted.len() > 10 {
-                let (rem_count, rem_bytes) = sorted
-                    .iter()
-                    .rev()
-                    .map(|(count, per_tenant)| {
-                        (
-                            count,
-                            per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
-                        )
-                    })
-                    .fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
-                let rem_bytes = SiBytesAmount(rem_bytes);
-                writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
-            }
-
-            Ok(())
+        pub fn into_inner(self) -> f32 {
+            self.into()
        }
    }
 }
@@ -1336,3 +1187,40 @@ mod filesystem_level_usage {
        assert!(!usage.has_pressure());
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn relative_equal_bounds() {
+        let order = EvictionOrder::RelativeAccessed {
+            highest_layer_count_loses_first: false,
+        };
+
+        let len = 10;
+        let v = (0..len)
+            .map(|i| order.relative_last_activity(len, i).into_inner())
+            .collect::<Vec<_>>();
+
+        assert_eq!(v.first(), Some(&1.0));
+        assert_eq!(v.last(), Some(&0.0));
+        assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
+    }
+
+    #[test]
+    fn relative_spare_bounds() {
+        let order = EvictionOrder::RelativeAccessed {
+            highest_layer_count_loses_first: true,
+        };
+
+        let len = 10;
+        let v = (0..len)
+            .map(|i| order.relative_last_activity(len, i).into_inner())
+            .collect::<Vec<_>>();
+
+        assert_eq!(v.first(), Some(&1.0));
+        assert_eq!(v.last(), Some(&0.1));
+        assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
+    }
+}
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -187,6 +187,7 @@ impl From<TenantSlotUpsertError> for ApiError {
        match e {
            InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
            MapState(e) => e.into(),
+            ShuttingDown(_) => ApiError::ShuttingDown,
        }
    }
 }
@@ -495,6 +496,10 @@ async fn timeline_create_handler(
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
+            Err(_) if tenant.cancel.is_cancelled() => {
+                // In case we get some ugly error type during shutdown, cast it into a clean 503.
+                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
+            }
            Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
                json_response(StatusCode::CONFLICT, ())
            }
@@ -1257,19 +1262,9 @@ async fn tenant_create_handler(
    };
    // We created the tenant. Existing API semantics are that the tenant
    // is Active when this function returns.
-    if let res @ Err(_) = new_tenant
+    new_tenant
        .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
-        .await
-    {
-        // This shouldn't happen because we just created the tenant directory
-        // in upsert_location, and there aren't any remote timelines
-        // to load, so, nothing can really fail during load.
-        // Don't do cleanup because we don't know how we got here.
-        // The tenant will likely be in `Broken` state and subsequent
-        // calls will fail.
-        res.context("created tenant failed to become active")
-            .map_err(ApiError::InternalServerError)?;
-    }
+        .await?;

    json_response(
        StatusCode::CREATED,
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,3 +1,4 @@
+#![recursion_limit = "300"]
 #![deny(clippy::undocumented_unsafe_blocks)]

 mod auth;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -150,6 +150,43 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) struct GetVectoredLatency {
+    map: EnumMap<TaskKind, Option<Histogram>>,
+}
+
+impl GetVectoredLatency {
+    // Only these task types perform vectored gets. Filter all other tasks out to reduce total
+    // cardinality of the metric.
+    const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler];
+
+    pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
+        self.map[task_kind].as_ref()
+    }
+}
+
+pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
+    let inner = register_histogram_vec!(
+        "pageserver_get_vectored_seconds",
+        "Time spent in get_vectored",
+        &["task_kind"],
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric");
+
+    GetVectoredLatency {
+        map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
+            let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
+
+            if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
+                let task_kind = task_kind.into();
+                Some(inner.with_label_values(&[task_kind]))
+            } else {
+                None
+            }
+        })),
+    }
+});
+
 pub(crate) struct PageCacheMetricsForTaskKind {
    pub read_accesses_materialized_page: IntCounter,
    pub read_accesses_immutable: IntCounter,
@@ -932,6 +969,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+#[cfg(not(test))]
 pub(crate) mod virtual_file_descriptor_cache {
    use super::*;

@@ -951,6 +989,20 @@ pub(crate) mod virtual_file_descriptor_cache {
    // ```
 }

+#[cfg(not(test))]
+pub(crate) mod virtual_file_io_engine {
+    use super::*;
+
+    pub(crate) static KIND: Lazy<UIntGaugeVec> = Lazy::new(|| {
+        register_uint_gauge_vec!(
+            "pageserver_virtual_file_io_engine_kind",
+            "The configured io engine for VirtualFile",
+            &["kind"],
+        )
+        .unwrap()
+    });
+}
+
 #[derive(Debug)]
 struct GlobalAndPerTimelineHistogram {
    global: Histogram,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -322,8 +322,8 @@ enum PageStreamError {
    Shutdown,

    /// Something went wrong reading a page: this likely indicates a pageserver bug
-    #[error("Read error: {0}")]
-    Read(PageReconstructError),
+    #[error("Read error")]
+    Read(#[source] PageReconstructError),

    /// Ran out of time waiting for an LSN
    #[error("LSN timeout: {0}")]
@@ -332,11 +332,11 @@ enum PageStreamError {
    /// The entity required to serve the request (tenant or timeline) is not found,
    /// or is not found in a suitable state to serve a request.
    #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
+    NotFound(Cow<'static, str>),

    /// Request asked for something that doesn't make sense, like an invalid LSN
    #[error("Bad request: {0}")]
-    BadRequest(std::borrow::Cow<'static, str>),
+    BadRequest(Cow<'static, str>),
 }

 impl From<PageReconstructError> for PageStreamError {
@@ -667,7 +667,10 @@ impl PageServerHandler {
                        // print the all details to the log with {:#}, but for the client the
                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
                        // here includes cancellation which is not an error.
-                        span.in_scope(|| error!("error reading relation or page version: {:#}", e));
+                        let full = utils::error::report_compact_sources(&e);
+                        span.in_scope(|| {
+                            error!("error reading relation or page version: {full:#}")
+                        });
                        PagestreamBeMessage::Error(PagestreamErrorResponse {
                            message: e.to_string(),
                        })
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -627,9 +627,15 @@ impl Tenant {
            deletion_queue_client,
        ));

+        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
+        // we shut down while attaching.
+        let Ok(attach_gate_guard) = tenant.gate.enter() else {
+            // We just created the Tenant: nothing else can have shut it down yet
+            unreachable!();
+        };
+
        // Do all the hard work in the background
        let tenant_clone = Arc::clone(&tenant);
-
        let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
        task_mgr::spawn(
            &tokio::runtime::Handle::current(),
@@ -639,6 +645,8 @@ impl Tenant {
            "attach tenant",
            false,
            async move {
+                let _gate_guard = attach_gate_guard;
+
                // Is this tenant being spawned as part of process startup?
                let starting_up = init_order.is_some();
                scopeguard::defer! {
@@ -813,7 +821,7 @@ impl Tenant {
                    SpawnMode::Create => None,
                    SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
                };
-                match tenant_clone.attach(preload, &ctx).await {
+                match tenant_clone.attach(preload, mode, &ctx).await {
                    Ok(()) => {
                        info!("attach finished, activating");
                        if let Some(t)=  attach_timer {t.observe_duration();}
@@ -900,15 +908,20 @@ impl Tenant {
    async fn attach(
        self: &Arc<Tenant>,
        preload: Option<TenantPreload>,
+        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

        failpoint_support::sleep_millis_async!("before-attaching-tenant");

-        let preload = match preload {
-            Some(p) => p,
-            None => {
+        let preload = match (preload, mode) {
+            (Some(p), _) => p,
+            (None, SpawnMode::Create) => TenantPreload {
+                deleting: false,
+                timelines: HashMap::new(),
+            },
+            (None, SpawnMode::Normal) => {
                // Deprecated dev mode: load from local disk state instead of remote storage
                // https://github.com/neondatabase/neon/issues/5624
                return self.load_local(ctx).await;
@@ -1683,9 +1696,13 @@ impl Tenant {
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
        if !self.is_active() {
-            return Err(CreateTimelineError::Other(anyhow::anyhow!(
-                "Cannot create timelines on inactive tenant"
-            )));
+            if matches!(self.current_state(), TenantState::Stopping { .. }) {
+                return Err(CreateTimelineError::ShuttingDown);
+            } else {
+                return Err(CreateTimelineError::Other(anyhow::anyhow!(
+                    "Cannot create timelines on inactive tenant"
+                )));
+            }
        }

        let _gate = self
@@ -4035,7 +4052,7 @@ pub(crate) mod harness {
                        .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                    tenant
-                        .attach(Some(preload), ctx)
+                        .attach(Some(preload), SpawnMode::Normal, ctx)
                        .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                }
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -5,10 +5,10 @@
 use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
-use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
+use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
 use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
-use std::ops::{Deref, DerefMut};
+use std::ops::Deref;

 /// This is implemented by anything that can read 8 kB (PAGE_SZ)
 /// blocks, using the page cache
@@ -39,6 +39,8 @@ pub enum BlockLease<'a> {
    EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
    #[cfg(test)]
    Arc(std::sync::Arc<[u8; PAGE_SZ]>),
+    #[cfg(test)]
+    Vec(Vec<u8>),
 }

 impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -63,6 +65,10 @@ impl<'a> Deref for BlockLease<'a> {
            BlockLease::EphemeralFileMutableTail(v) => v,
            #[cfg(test)]
            BlockLease::Arc(v) => v.deref(),
+            #[cfg(test)]
+            BlockLease::Vec(v) => {
+                TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ")
+            }
        }
    }
 }
@@ -169,10 +175,14 @@ impl FileBlockReader {
    }

    /// Read a page from the underlying file into given buffer.
-    async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
+    async fn fill_buffer(
+        &self,
+        buf: PageWriteGuard<'static>,
+        blkno: u32,
+    ) -> Result<PageWriteGuard<'static>, std::io::Error> {
        assert!(buf.len() == PAGE_SZ);
        self.file
-            .read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
+            .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64)
            .await
    }
    /// Read a block.
@@ -196,9 +206,9 @@ impl FileBlockReader {
                )
            })? {
            ReadBufResult::Found(guard) => Ok(guard.into()),
-            ReadBufResult::NotFound(mut write_guard) => {
+            ReadBufResult::NotFound(write_guard) => {
                // Read the page from disk into the buffer
-                self.fill_buffer(write_guard.deref_mut(), blknum).await?;
+                let write_guard = self.fill_buffer(write_guard, blknum).await?;
                Ok(write_guard.mark_valid().into())
            }
        }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -409,7 +409,10 @@ impl DeleteTenantFlow {
            .await
            .expect("cant be stopping or broken");

-        tenant.attach(preload, ctx).await.context("attach")?;
+        tenant
+            .attach(preload, super::SpawnMode::Normal, ctx)
+            .await
+            .context("attach")?;

        Self::background(
            guard,
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -5,11 +5,11 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{self, VirtualFile};
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use std::cmp::min;
-use std::fs::OpenOptions;
+
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
 use std::sync::atomic::AtomicU64;
@@ -47,7 +47,10 @@ impl EphemeralFile {

        let file = VirtualFile::open_with_options(
            &filename,
-            OpenOptions::new().read(true).write(true).create(true),
+            virtual_file::OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create(true),
        )
        .await?;

@@ -89,11 +92,10 @@ impl EphemeralFile {
                page_cache::ReadBufResult::Found(guard) => {
                    return Ok(BlockLease::PageReadGuard(guard))
                }
-                page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                    let buf: &mut [u8] = write_guard.deref_mut();
-                    debug_assert_eq!(buf.len(), PAGE_SZ);
-                    self.file
-                        .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
+                page_cache::ReadBufResult::NotFound(write_guard) => {
+                    let write_guard = self
+                        .file
+                        .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
                        .await?;
                    let read_guard = write_guard.mark_valid();
                    return Ok(BlockLease::PageReadGuard(read_guard));
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,7 +51,10 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use std::collections::VecDeque;
+use pageserver_api::keyspace::KeySpaceAccum;
+use std::cmp::Ordering;
+use std::collections::{BTreeMap, VecDeque};
+use std::iter::Peekable;
 use std::ops::Range;
 use std::sync::Arc;
 use utils::lsn::Lsn;
@@ -144,11 +147,221 @@ impl Drop for BatchedUpdates<'_> {
 }

 /// Return value of LayerMap::search
+#[derive(Eq, PartialEq, Debug)]
 pub struct SearchResult {
    pub layer: Arc<PersistentLayerDesc>,
    pub lsn_floor: Lsn,
 }

+pub struct OrderedSearchResult(SearchResult);
+
+impl Ord for OrderedSearchResult {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.0.lsn_floor.cmp(&other.0.lsn_floor)
+    }
+}
+
+impl PartialOrd for OrderedSearchResult {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl PartialEq for OrderedSearchResult {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.lsn_floor == other.0.lsn_floor
+    }
+}
+
+impl Eq for OrderedSearchResult {}
+
+pub struct RangeSearchResult {
+    pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
+    pub not_found: KeySpaceAccum,
+}
+
+impl RangeSearchResult {
+    fn new() -> Self {
+        Self {
+            found: BTreeMap::new(),
+            not_found: KeySpaceAccum::new(),
+        }
+    }
+}
+
+/// Collector for results of range search queries on the LayerMap.
+/// It should be provided with two iterators for the delta and image coverage
+/// that contain all the changes for layers which intersect the range.
+struct RangeSearchCollector<Iter>
+where
+    Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
+{
+    delta_coverage: Peekable<Iter>,
+    image_coverage: Peekable<Iter>,
+    key_range: Range<Key>,
+    end_lsn: Lsn,
+
+    current_delta: Option<Arc<PersistentLayerDesc>>,
+    current_image: Option<Arc<PersistentLayerDesc>>,
+
+    result: RangeSearchResult,
+}
+
+#[derive(Debug)]
+enum NextLayerType {
+    Delta(i128),
+    Image(i128),
+    Both(i128),
+}
+
+impl NextLayerType {
+    fn next_change_at_key(&self) -> Key {
+        match self {
+            NextLayerType::Delta(at) => Key::from_i128(*at),
+            NextLayerType::Image(at) => Key::from_i128(*at),
+            NextLayerType::Both(at) => Key::from_i128(*at),
+        }
+    }
+}
+
+impl<Iter> RangeSearchCollector<Iter>
+where
+    Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
+{
+    fn new(
+        key_range: Range<Key>,
+        end_lsn: Lsn,
+        delta_coverage: Iter,
+        image_coverage: Iter,
+    ) -> Self {
+        Self {
+            delta_coverage: delta_coverage.peekable(),
+            image_coverage: image_coverage.peekable(),
+            key_range,
+            end_lsn,
+            current_delta: None,
+            current_image: None,
+            result: RangeSearchResult::new(),
+        }
+    }
+
+    /// Run the collector. Collection is implemented via a two pointer algorithm.
+    /// One pointer tracks the start of the current range and the other tracks
+    /// the beginning of the next range which will overlap with the next change
+    /// in coverage across both image and delta.
+    fn collect(mut self) -> RangeSearchResult {
+        let next_layer_type = self.choose_next_layer_type();
+        let mut current_range_start = match next_layer_type {
+            None => {
+                // No changes for the range
+                self.pad_range(self.key_range.clone());
+                return self.result;
+            }
+            Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => {
+                // Changes only after the end of the range
+                self.pad_range(self.key_range.clone());
+                return self.result;
+            }
+            Some(layer_type) => {
+                // Changes for the range exist. Record anything before the first
+                // coverage change as not found.
+                let coverage_start = layer_type.next_change_at_key();
+                let range_before = self.key_range.start..coverage_start;
+                self.pad_range(range_before);
+
+                self.advance(&layer_type);
+                coverage_start
+            }
+        };
+
+        while current_range_start < self.key_range.end {
+            let next_layer_type = self.choose_next_layer_type();
+            match next_layer_type {
+                Some(t) => {
+                    let current_range_end = t.next_change_at_key();
+                    self.add_range(current_range_start..current_range_end);
+                    current_range_start = current_range_end;
+
+                    self.advance(&t);
+                }
+                None => {
+                    self.add_range(current_range_start..self.key_range.end);
+                    current_range_start = self.key_range.end;
+                }
+            }
+        }
+
+        self.result
+    }
+
+    /// Mark a range as not found (i.e. no layers intersect it)
+    fn pad_range(&mut self, key_range: Range<Key>) {
+        if !key_range.is_empty() {
+            self.result.not_found.add_range(key_range);
+        }
+    }
+
+    /// Select the appropiate layer for the given range and update
+    /// the collector.
+    fn add_range(&mut self, covered_range: Range<Key>) {
+        let selected = LayerMap::select_layer(
+            self.current_delta.clone(),
+            self.current_image.clone(),
+            self.end_lsn,
+        );
+
+        match selected {
+            Some(search_result) => self
+                .result
+                .found
+                .entry(OrderedSearchResult(search_result))
+                .or_default()
+                .add_range(covered_range),
+            None => self.pad_range(covered_range),
+        }
+    }
+
+    /// Move to the next coverage change.
+    fn advance(&mut self, layer_type: &NextLayerType) {
+        match layer_type {
+            NextLayerType::Delta(_) => {
+                let (_, layer) = self.delta_coverage.next().unwrap();
+                self.current_delta = layer;
+            }
+            NextLayerType::Image(_) => {
+                let (_, layer) = self.image_coverage.next().unwrap();
+                self.current_image = layer;
+            }
+            NextLayerType::Both(_) => {
+                let (_, image_layer) = self.image_coverage.next().unwrap();
+                let (_, delta_layer) = self.delta_coverage.next().unwrap();
+
+                self.current_image = image_layer;
+                self.current_delta = delta_layer;
+            }
+        }
+    }
+
+    /// Pick the next coverage change: the one at the lesser key or both if they're alligned.
+    fn choose_next_layer_type(&mut self) -> Option<NextLayerType> {
+        let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key);
+        let next_image_at = self.image_coverage.peek().map(|(key, _)| key);
+
+        match (next_delta_at, next_image_at) {
+            (None, None) => None,
+            (Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)),
+            (None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)),
+            (Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => {
+                Some(NextLayerType::Image(*next_image_at))
+            }
+            (Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => {
+                Some(NextLayerType::Delta(*next_delta_at))
+            }
+            (Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)),
+        }
+    }
+}
+
 impl LayerMap {
    ///
    /// Find the latest layer (by lsn.end) that covers the given
@@ -186,7 +399,18 @@ impl LayerMap {
        let latest_delta = version.delta_coverage.query(key.to_i128());
        let latest_image = version.image_coverage.query(key.to_i128());

-        match (latest_delta, latest_image) {
+        Self::select_layer(latest_delta, latest_image, end_lsn)
+    }
+
+    fn select_layer(
+        delta_layer: Option<Arc<PersistentLayerDesc>>,
+        image_layer: Option<Arc<PersistentLayerDesc>>,
+        end_lsn: Lsn,
+    ) -> Option<SearchResult> {
+        assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta()));
+        assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta()));
+
+        match (delta_layer, image_layer) {
            (None, None) => None,
            (None, Some(image)) => {
                let lsn_floor = image.get_lsn_range().start;
@@ -223,6 +447,17 @@ impl LayerMap {
        }
    }

+    pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> Option<RangeSearchResult> {
+        let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
+
+        let raw_range = key_range.start.to_i128()..key_range.end.to_i128();
+        let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
+        let image_changes = version.image_coverage.range_overlaps(&raw_range);
+
+        let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
+        Some(collector.collect())
+    }
+
    /// Start a batch of updates, applied on drop
    pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
        BatchedUpdates { layer_map: self }
@@ -631,3 +866,126 @@ impl LayerMap {
        Ok(())
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[derive(Clone)]
+    struct LayerDesc {
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+        is_delta: bool,
+    }
+
+    fn create_layer_map(layers: Vec<LayerDesc>) -> LayerMap {
+        let mut layer_map = LayerMap::default();
+
+        for layer in layers {
+            layer_map.insert_historic_noflush(PersistentLayerDesc::new_test(
+                layer.key_range,
+                layer.lsn_range,
+                layer.is_delta,
+            ));
+        }
+
+        layer_map.flush_updates();
+        layer_map
+    }
+
+    fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
+        assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
+        let lhs: Vec<_> = lhs
+            .found
+            .into_iter()
+            .map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
+            .collect();
+        let rhs: Vec<_> = rhs
+            .found
+            .into_iter()
+            .map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
+            .collect();
+
+        assert_eq!(lhs, rhs);
+    }
+
+    fn brute_force_range_search(
+        layer_map: &LayerMap,
+        key_range: Range<Key>,
+        end_lsn: Lsn,
+    ) -> RangeSearchResult {
+        let mut range_search_result = RangeSearchResult::new();
+
+        let mut key = key_range.start;
+        while key != key_range.end {
+            let res = layer_map.search(key, end_lsn);
+            match res {
+                Some(res) => {
+                    range_search_result
+                        .found
+                        .entry(OrderedSearchResult(res))
+                        .or_default()
+                        .add_key(key);
+                }
+                None => {
+                    range_search_result.not_found.add_key(key);
+                }
+            }
+
+            key = key.next();
+        }
+
+        range_search_result
+    }
+
+    #[test]
+    fn ranged_search_on_empty_layer_map() {
+        let layer_map = LayerMap::default();
+        let range = Key::from_i128(100)..Key::from_i128(200);
+
+        let res = layer_map.range_search(range, Lsn(100));
+        assert!(res.is_none());
+    }
+
+    #[test]
+    fn ranged_search() {
+        let layers = vec![
+            LayerDesc {
+                key_range: Key::from_i128(15)..Key::from_i128(50),
+                lsn_range: Lsn(0)..Lsn(5),
+                is_delta: false,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(10)..Key::from_i128(20),
+                lsn_range: Lsn(5)..Lsn(20),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(15)..Key::from_i128(25),
+                lsn_range: Lsn(20)..Lsn(30),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(35)..Key::from_i128(40),
+                lsn_range: Lsn(25)..Lsn(35),
+                is_delta: true,
+            },
+            LayerDesc {
+                key_range: Key::from_i128(35)..Key::from_i128(40),
+                lsn_range: Lsn(35)..Lsn(40),
+                is_delta: false,
+            },
+        ];
+
+        let layer_map = create_layer_map(layers.clone());
+        for start in 0..60 {
+            for end in (start + 1)..60 {
+                let range = Key::from_i128(start)..Key::from_i128(end);
+                let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap();
+                let expected = brute_force_range_search(&layer_map, range, Lsn(100));
+
+                assert_range_search_result_eq(result, expected);
+            }
+        }
+    }
+}
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -129,6 +129,42 @@ impl<Value: Clone> LayerCoverage<Value> {
            .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
    }

+    /// Returns an iterator which includes all coverage changes for layers that intersect
+    /// with the provided range.
+    pub fn range_overlaps(
+        &self,
+        key_range: &Range<i128>,
+    ) -> impl Iterator<Item = (i128, Option<Value>)> + '_
+    where
+        Value: Eq,
+    {
+        let first_change = self.query(key_range.start);
+        match first_change {
+            Some(change) => {
+                // If the start of the range is covered, we have to deal with two cases:
+                // 1. Start of the range is aligned with the start of a layer.
+                // In this case the return of `self.range` will contain the layer which aligns with the start of the key range.
+                // We advance said iterator to avoid duplicating the first change.
+                // 2. Start of the range is not aligned with the start of a layer.
+                let range = key_range.start..key_range.end;
+                let mut range_coverage = self.range(range).peekable();
+                if range_coverage
+                    .peek()
+                    .is_some_and(|c| c.1.as_ref() == Some(&change))
+                {
+                    range_coverage.next();
+                }
+                itertools::Either::Left(
+                    std::iter::once((key_range.start, Some(change))).chain(range_coverage),
+                )
+            }
+            None => {
+                let range = key_range.start..key_range.end;
+                let coverage = self.range(range);
+                itertools::Either::Right(coverage)
+            }
+        }
+    }
    /// O(1) clone
    pub fn clone(&self) -> Self {
        Self {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -7,6 +7,7 @@ use pageserver_api::models::ShardParameters;
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
+use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashMap};
 use std::ops::Deref;
 use std::sync::Arc;
@@ -32,7 +33,8 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
-    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
+    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
+    TenantConfOpt,
 };
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
@@ -466,6 +468,26 @@ pub async fn init_tenant_mgr(
            // We have a generation map: treat it as the authority for whether
            // this tenant is really attached.
            if let Some(gen) = generations.get(&tenant_shard_id) {
+                if let LocationMode::Attached(attached) = &location_conf.mode {
+                    if attached.generation > *gen {
+                        tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                            "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
+                            attached.generation
+                        );
+
+                        // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
+                        // local disk content: demote to secondary rather than detaching.
+                        tenants.insert(
+                            tenant_shard_id,
+                            TenantSlot::Secondary(SecondaryTenant::new(
+                                tenant_shard_id,
+                                location_conf.shard,
+                                location_conf.tenant_conf,
+                                &SecondaryLocationConfig { warm: false },
+                            )),
+                        );
+                    }
+                }
                *gen
            } else {
                match &location_conf.mode {
@@ -721,7 +743,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
        tokio::select! {
            Some(joined) = join_set.join_next() => {
                match joined {
-                    Ok(()) => {}
+                    Ok(()) => {},
                    Err(join_error) if join_error.is_cancelled() => {
                        unreachable!("we are not cancelling any of the tasks");
                    }
@@ -882,7 +904,7 @@ impl TenantManager {
        tenant_shard_id: TenantShardId,
        new_location_config: LocationConf,
        flush: Option<Duration>,
-        spawn_mode: SpawnMode,
+        mut spawn_mode: SpawnMode,
        ctx: &RequestContext,
    ) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
        debug_assert_current_span_has_tenant_id();
@@ -902,19 +924,29 @@ impl TenantManager {
                tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
            match (&new_location_config.mode, peek_slot) {
                (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
-                    if attach_conf.generation == tenant.generation {
-                        // A transition from Attached to Attached in the same generation, we may
-                        // take our fast path and just provide the updated configuration
-                        // to the tenant.
-                        tenant.set_new_location_config(
-                            AttachedTenantConf::try_from(new_location_config.clone())
-                                .map_err(UpsertLocationError::BadRequest)?,
-                        );
+                    match attach_conf.generation.cmp(&tenant.generation) {
+                        Ordering::Equal => {
+                            // A transition from Attached to Attached in the same generation, we may
+                            // take our fast path and just provide the updated configuration
+                            // to the tenant.
+                            tenant.set_new_location_config(
+                                AttachedTenantConf::try_from(new_location_config.clone())
+                                    .map_err(UpsertLocationError::BadRequest)?,
+                            );

-                        Some(FastPathModified::Attached(tenant.clone()))
-                    } else {
-                        // Different generations, fall through to general case
-                        None
+                            Some(FastPathModified::Attached(tenant.clone()))
+                        }
+                        Ordering::Less => {
+                            return Err(UpsertLocationError::BadRequest(anyhow::anyhow!(
+                                "Generation {:?} is less than existing {:?}",
+                                attach_conf.generation,
+                                tenant.generation
+                            )));
+                        }
+                        Ordering::Greater => {
+                            // Generation advanced, fall through to general case of replacing `Tenant` object
+                            None
+                        }
                    }
                }
                (
@@ -1019,6 +1051,12 @@ impl TenantManager {
                    }
                }
                slot_guard.drop_old_value().expect("We just shut it down");
+
+                // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
+                // the caller thinks they're creating but the tenant already existed.  We must switch to
+                // Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
+                // rather than assuming it to be empty.
+                spawn_mode = SpawnMode::Normal;
            }
            Some(TenantSlot::Secondary(state)) => {
                info!("Shutting down secondary tenant");
@@ -1102,14 +1140,46 @@ impl TenantManager {
            None
        };

-        slot_guard.upsert(new_slot).map_err(|e| match e {
-            TenantSlotUpsertError::InternalError(e) => {
-                UpsertLocationError::Other(anyhow::anyhow!(e))
+        match slot_guard.upsert(new_slot) {
+            Err(TenantSlotUpsertError::InternalError(e)) => {
+                Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
            }
-            TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
-        })?;
+            Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
+            Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
+                // If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then
+                // we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants
+                // are shutdown.
+                //
+                // We must shut it down inline here.
+                match new_slot {
+                    TenantSlot::InProgress(_) => {
+                        // Unreachable because we never insert an InProgress
+                        unreachable!()
+                    }
+                    TenantSlot::Attached(tenant) => {
+                        let (_guard, progress) = utils::completion::channel();
+                        info!("Shutting down just-spawned tenant, because tenant manager is shut down");
+                        match tenant.shutdown(progress, false).await {
+                            Ok(()) => {
+                                info!("Finished shutting down just-spawned tenant");
+                            }
+                            Err(barrier) => {
+                                info!("Shutdown already in progress, waiting for it to complete");
+                                barrier.wait().await;
+                            }
+                        }
+                    }
+                    TenantSlot::Secondary(secondary_tenant) => {
+                        secondary_tenant.shutdown().await;
+                    }
+                }

-        Ok(attached_tenant)
+                Err(UpsertLocationError::Unavailable(
+                    TenantMapError::ShuttingDown,
+                ))
+            }
+            Ok(()) => Ok(attached_tenant),
+        }
    }

    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
@@ -1728,14 +1798,31 @@ pub(crate) enum TenantSlotError {

 /// Superset of TenantMapError: issues that can occur when using a SlotGuard
 /// to insert a new value.
-#[derive(Debug, thiserror::Error)]
-pub enum TenantSlotUpsertError {
+#[derive(thiserror::Error)]
+pub(crate) enum TenantSlotUpsertError {
    /// An error where the slot is in an unexpected state, indicating a code bug
    #[error("Internal error updating Tenant")]
    InternalError(Cow<'static, str>),

    #[error(transparent)]
-    MapState(#[from] TenantMapError),
+    MapState(TenantMapError),
+
+    // If we encounter TenantManager shutdown during upsert, we must carry the Completion
+    // from the SlotGuard, so that the caller can hold it while they clean up: otherwise
+    // TenantManager shutdown might race ahead before we're done cleaning up any Tenant that
+    // was protected by the SlotGuard.
+    #[error("Shutting down")]
+    ShuttingDown((TenantSlot, utils::completion::Completion)),
+}
+
+impl std::fmt::Debug for TenantSlotUpsertError {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Self::InternalError(reason) => write!(f, "Internal Error {reason}"),
+            Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"),
+            Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"),
+        }
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1784,7 +1871,7 @@ pub struct SlotGuard {

    /// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will
    /// release any waiters as soon as this SlotGuard is dropped.
-    _completion: utils::completion::Completion,
+    completion: utils::completion::Completion,
 }

 impl SlotGuard {
@@ -1797,7 +1884,7 @@ impl SlotGuard {
            tenant_shard_id,
            old_value,
            upserted: false,
-            _completion: completion,
+            completion,
        }
    }

@@ -1830,9 +1917,16 @@ impl SlotGuard {
            }

            let m = match &mut *locked {
-                TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()),
+                TenantsMap::Initializing => {
+                    return Err(TenantSlotUpsertError::MapState(
+                        TenantMapError::StillInitializing,
+                    ))
+                }
                TenantsMap::ShuttingDown(_) => {
-                    return Err(TenantMapError::ShuttingDown.into());
+                    return Err(TenantSlotUpsertError::ShuttingDown((
+                        new_value,
+                        self.completion.clone(),
+                    )));
                }
                TenantsMap::Open(m) => m,
            };
@@ -1880,7 +1974,9 @@ impl SlotGuard {
                Err(TenantSlotUpsertError::InternalError(_)) => {
                    // We already logged the error, nothing else we can do.
                }
-                Err(TenantSlotUpsertError::MapState(_)) => {
+                Err(
+                    TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_),
+                ) => {
                    // If the map is shutting down, we need not replace anything
                }
                Ok(()) => {}
@@ -1978,18 +2074,22 @@ fn tenant_map_peek_slot<'a>(
    tenant_shard_id: &TenantShardId,
    mode: TenantSlotPeekMode,
 ) -> Result<Option<&'a TenantSlot>, TenantMapError> {
-    let m = match tenants.deref() {
-        TenantsMap::Initializing => return Err(TenantMapError::StillInitializing),
+    match tenants.deref() {
+        TenantsMap::Initializing => Err(TenantMapError::StillInitializing),
        TenantsMap::ShuttingDown(m) => match mode {
-            TenantSlotPeekMode::Read => m,
-            TenantSlotPeekMode::Write => {
-                return Err(TenantMapError::ShuttingDown);
-            }
+            TenantSlotPeekMode::Read => Ok(Some(
+                // When reading in ShuttingDown state, we must translate None results
+                // into a ShuttingDown error, because absence of a tenant shard ID in the map
+                // isn't a reliable indicator of the tenant being gone: it might have been
+                // InProgress when shutdown started, and cleaned up from that state such
+                // that it's now no longer in the map.  Callers will have to wait until
+                // we next start up to get a proper answer.  This avoids incorrect 404 API responses.
+                m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?,
+            )),
+            TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown),
        },
-        TenantsMap::Open(m) => m,
-    };
-
-    Ok(m.get(tenant_shard_id))
+        TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)),
+    }
 }

 enum TenantSlotAcquireMode {
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -36,7 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::Timeline;
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
@@ -649,7 +649,7 @@ impl DeltaLayer {
    {
        let file = VirtualFile::open_with_options(
            path,
-            &*std::fs::OpenOptions::new().read(true).write(true),
+            virtual_file::OpenOptions::new().read(true).write(true),
        )
        .await
        .with_context(|| format!("Failed to open file '{}'", path))?;
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -34,7 +34,7 @@ use crate::tenant::storage_layer::{
    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
 use crate::tenant::Timeline;
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
@@ -327,7 +327,7 @@ impl ImageLayer {
    {
        let file = VirtualFile::open_with_options(
            path,
-            &*std::fs::OpenOptions::new().read(true).write(true),
+            virtual_file::OpenOptions::new().read(true).write(true),
        )
        .await
        .with_context(|| format!("Failed to open file '{}'", path))?;
@@ -492,11 +492,15 @@ impl ImageLayerWriterInner {
            },
        );
        info!("new image layer {path}");
-        let mut file = VirtualFile::open_with_options(
-            &path,
-            std::fs::OpenOptions::new().write(true).create_new(true),
-        )
-        .await?;
+        let mut file = {
+            VirtualFile::open_with_options(
+                &path,
+                virtual_file::OpenOptions::new()
+                    .write(true)
+                    .create_new(true),
+            )
+            .await?
+        };
        // make room for the header block
        file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -55,13 +55,13 @@ impl PersistentLayerDesc {
    }

    #[cfg(test)]
-    pub fn new_test(key_range: Range<Key>) -> Self {
+    pub fn new_test(key_range: Range<Key>, lsn_range: Range<Lsn>, is_delta: bool) -> Self {
        Self {
            tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
            timeline_id: TimelineId::generate(),
            key_range,
-            lsn_range: Lsn(0)..Lsn(1),
-            is_delta: false,
+            lsn_range,
+            is_delta,
            file_size: 0,
        }
    }
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -181,8 +182,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                    );
                    error_run_count += 1;
                    let wait_duration = Duration::from_secs_f64(wait_duration);
-                    error!(
-                        "Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
+                    log_compaction_error(
+                        &e,
+                        error_run_count,
+                        &wait_duration,
+                        cancel.is_cancelled(),
                    );
                    wait_duration
                } else {
@@ -210,6 +214,58 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
 }

+fn log_compaction_error(
+    e: &CompactionError,
+    error_run_count: u32,
+    sleep_duration: &std::time::Duration,
+    task_cancelled: bool,
+) {
+    use crate::tenant::upload_queue::NotInitialized;
+    use crate::tenant::PageReconstructError;
+    use CompactionError::*;
+
+    enum LooksLike {
+        Info,
+        Error,
+    }
+
+    let decision = match e {
+        ShuttingDown => None,
+        _ if task_cancelled => Some(LooksLike::Info),
+        Other(e) => {
+            let root_cause = e.root_cause();
+
+            let is_stopping = {
+                let upload_queue = root_cause
+                    .downcast_ref::<NotInitialized>()
+                    .is_some_and(|e| e.is_stopping());
+
+                let timeline = root_cause
+                    .downcast_ref::<PageReconstructError>()
+                    .is_some_and(|e| e.is_stopping());
+
+                upload_queue || timeline
+            };
+
+            if is_stopping {
+                Some(LooksLike::Info)
+            } else {
+                Some(LooksLike::Error)
+            }
+        }
+    };
+
+    match decision {
+        Some(LooksLike::Info) => info!(
+            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
+        ),
+        Some(LooksLike::Error) => error!(
+            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
+        ),
+        None => {}
+    }
+}
+
 ///
 /// GC task's main loop
 ///
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -392,8 +392,7 @@ pub(crate) enum PageReconstructError {
    #[error("Ancestor LSN wait error: {0}")]
    AncestorLsnTimeout(#[from] WaitLsnError),

-    /// The operation was cancelled
-    #[error("Cancelled")]
+    #[error("timeline shutting down")]
    Cancelled,

    /// The ancestor of this is being stopped
@@ -405,6 +404,19 @@ pub(crate) enum PageReconstructError {
    WalRedo(anyhow::Error),
 }

+impl PageReconstructError {
+    /// Returns true if this error indicates a tenant/timeline shutdown alike situation
+    pub(crate) fn is_stopping(&self) -> bool {
+        use PageReconstructError::*;
+        match self {
+            Other(_) => false,
+            AncestorLsnTimeout(_) => false,
+            Cancelled | AncestorStopping(_) => true,
+            WalRedo(_) => false,
+        }
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 enum CreateImageLayersError {
    #[error("timeline shutting down")]
@@ -666,6 +678,10 @@ impl Timeline {
            return Err(GetVectoredError::Oversized(key_count));
        }

+        let _timer = crate::metrics::GET_VECTORED_LATENCY
+            .for_task_kind(ctx.task_kind())
+            .map(|t| t.start_timer());
+
        let mut values = BTreeMap::new();
        for range in key_ranges {
            let mut key = range.start;
@@ -4388,7 +4404,7 @@ impl Timeline {
                    .walredo_mgr
                    .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
                    .await
-                    .context("Failed to reconstruct a page image:")
+                    .context("reconstruct a page image")
                {
                    Ok(img) => img,
                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -126,6 +126,27 @@ pub(super) struct UploadQueueStopped {
    pub(super) deleted_at: SetDeletedFlagProgress,
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum NotInitialized {
+    #[error("queue is in state Uninitialized")]
+    Uninitialized,
+    #[error("queue is in state Stopping")]
+    Stopped,
+    #[error("queue is shutting down")]
+    ShuttingDown,
+}
+
+impl NotInitialized {
+    pub(crate) fn is_stopping(&self) -> bool {
+        use NotInitialized::*;
+        match self {
+            Uninitialized => false,
+            Stopped => true,
+            ShuttingDown => true,
+        }
+    }
+}
+
 impl UploadQueue {
    pub(crate) fn initialize_empty_remote(
        &mut self,
@@ -214,17 +235,17 @@ impl UploadQueue {
    }

    pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
+        use UploadQueue::*;
        match self {
-            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Initialized(x) => {
-                if !x.shutting_down {
-                    Ok(x)
+            Uninitialized => Err(NotInitialized::Uninitialized.into()),
+            Initialized(x) => {
+                if x.shutting_down {
+                    Err(NotInitialized::ShuttingDown.into())
                } else {
-                    anyhow::bail!("queue is shutting down")
+                    Ok(x)
                }
            }
+            Stopped(_) => Err(NotInitialized::Stopped.into()),
        }
    }

--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -11,18 +11,28 @@
 //! src/backend/storage/file/fd.c
 //!
 use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
+
+use crate::page_cache::PageWriteGuard;
 use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
-use std::fs::{self, File, OpenOptions};
+use std::fs::{self, File};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
+use tokio_epoll_uring::IoBufMut;
+
+use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
 use utils::fs_ext;

+mod io_engine;
+mod open_options;
+pub use io_engine::IoEngineKind;
+pub(crate) use open_options::*;
+
 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
 /// the underlying file is closed if the system is low on file descriptors,
@@ -106,7 +116,38 @@ struct SlotInner {
    tag: u64,

    /// the underlying file
-    file: Option<File>,
+    file: Option<OwnedFd>,
+}
+
+/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`].
+struct PageWriteGuardBuf {
+    page: PageWriteGuard<'static>,
+    init_up_to: usize,
+}
+// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot,
+// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved.
+unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf {
+    fn stable_ptr(&self) -> *const u8 {
+        self.page.as_ptr()
+    }
+    fn bytes_init(&self) -> usize {
+        self.init_up_to
+    }
+    fn bytes_total(&self) -> usize {
+        self.page.len()
+    }
+}
+// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access,
+// hence it's safe to hand out the `stable_mut_ptr()`.
+unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf {
+    fn stable_mut_ptr(&mut self) -> *mut u8 {
+        self.page.as_mut_ptr()
+    }
+
+    unsafe fn set_init(&mut self, pos: usize) {
+        assert!(pos <= self.page.len());
+        self.init_up_to = pos;
+    }
 }

 impl OpenFiles {
@@ -274,6 +315,10 @@ macro_rules! with_file {
        let $ident = $this.lock_file().await?;
        observe_duration!($op, $($body)*)
    }};
+    ($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{
+        let mut $ident = $this.lock_file().await?;
+        observe_duration!($op, $($body)*)
+    }};
 }

 impl VirtualFile {
@@ -326,7 +371,9 @@ impl VirtualFile {
        // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
        // where our caller doesn't get to use the returned VirtualFile before its
        // slot gets re-used by someone else.
-        let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
+        let file = observe_duration!(StorageIoOperation::Open, {
+            open_options.open(path.as_std_path()).await?
+        });

        // Strip all options other than read and write.
        //
@@ -356,7 +403,12 @@ impl VirtualFile {
        Ok(vfile)
    }

-    /// Async & [`VirtualFile`]-enabled version of [`::utils::crashsafe::overwrite`].
+    /// Writes a file to the specified `final_path` in a crash safe fasion
+    ///
+    /// The file is first written to the specified tmp_path, and in a second
+    /// step, the tmp path is renamed to the final path. As renames are
+    /// atomic, a crash during the write operation will never leave behind a
+    /// partially written file.
    pub async fn crashsafe_overwrite(
        final_path: &Utf8Path,
        tmp_path: &Utf8Path,
@@ -395,15 +447,13 @@ impl VirtualFile {

    /// Call File::sync_all() on the underlying File.
    pub async fn sync_all(&self) -> Result<(), Error> {
-        with_file!(self, StorageIoOperation::Fsync, |file| file
-            .as_ref()
-            .sync_all())
+        with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard
+            .with_std_file(|std_file| std_file.sync_all()))
    }

    pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        with_file!(self, StorageIoOperation::Metadata, |file| file
-            .as_ref()
-            .metadata())
+        with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard
+            .with_std_file(|std_file| std_file.metadata()))
    }

    /// Helper function internal to `VirtualFile` that looks up the underlying File,
@@ -412,7 +462,7 @@ impl VirtualFile {
    ///
    /// We are doing it via a macro as Rust doesn't support async closures that
    /// take on parameters with lifetimes.
-    async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
+    async fn lock_file(&self) -> Result<FileGuard, Error> {
        let open_files = get_open_files();

        let mut handle_guard = {
@@ -458,10 +508,9 @@ impl VirtualFile {
        // NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this
        // case from StorageIoOperation::Open. This helps with identifying thrashing
        // of the virtual file descriptor cache.
-        let file = observe_duration!(
-            StorageIoOperation::OpenAfterReplace,
-            self.open_options.open(&self.path)
-        )?;
+        let file = observe_duration!(StorageIoOperation::OpenAfterReplace, {
+            self.open_options.open(self.path.as_std_path()).await?
+        });

        // Store the File in the slot and update the handle in the VirtualFile
        // to point to it.
@@ -486,9 +535,8 @@ impl VirtualFile {
                self.pos = offset;
            }
            SeekFrom::End(offset) => {
-                self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
-                    .as_ref()
-                    .seek(SeekFrom::End(offset)))?
+                self.pos = with_file!(self, StorageIoOperation::Seek, |mut file_guard| file_guard
+                    .with_std_file_mut(|std_file| std_file.seek(SeekFrom::End(offset))))?
            }
            SeekFrom::Current(offset) => {
                let pos = self.pos as i128 + offset as i128;
@@ -507,25 +555,28 @@ impl VirtualFile {
        Ok(self.pos)
    }

-    // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
-    pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> {
-        while !buf.is_empty() {
-            match self.read_at(buf, offset).await {
-                Ok(0) => {
-                    return Err(Error::new(
-                        std::io::ErrorKind::UnexpectedEof,
-                        "failed to fill whole buffer",
-                    ))
-                }
-                Ok(n) => {
-                    buf = &mut buf[n..];
-                    offset += n as u64;
-                }
-                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
-            }
-        }
-        Ok(())
+    pub async fn read_exact_at<B>(&self, buf: B, offset: u64) -> Result<B, Error>
+    where
+        B: IoBufMut + Send,
+    {
+        let (buf, res) =
+            read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await;
+        res.map(|()| buf)
+    }
+
+    /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`].
+    pub async fn read_exact_at_page(
+        &self,
+        page: PageWriteGuard<'static>,
+        offset: u64,
+    ) -> Result<PageWriteGuard<'static>, Error> {
+        let buf = PageWriteGuardBuf {
+            page,
+            init_up_to: 0,
+        };
+        let res = self.read_exact_at(buf, offset).await;
+        res.map(|PageWriteGuardBuf { page, .. }| page)
+            .map_err(|e| Error::new(ErrorKind::Other, e))
    }

    // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
@@ -575,22 +626,35 @@ impl VirtualFile {
        Ok(n)
    }

-    pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Read, |file| file
-            .as_ref()
-            .read_at(buf, offset));
-        if let Ok(size) = result {
-            STORAGE_IO_SIZE
-                .with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id])
-                .add(size as i64);
-        }
-        result
+    pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
+    where
+        B: tokio_epoll_uring::BoundedBufMut + Send,
+    {
+        let file_guard = match self.lock_file().await {
+            Ok(file_guard) => file_guard,
+            Err(e) => return (buf, Err(e)),
+        };
+
+        observe_duration!(StorageIoOperation::Read, {
+            let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
+            if let Ok(size) = res {
+                STORAGE_IO_SIZE
+                    .with_label_values(&[
+                        "read",
+                        &self.tenant_id,
+                        &self.shard_id,
+                        &self.timeline_id,
+                    ])
+                    .add(size as i64);
+            }
+            (buf, res)
+        })
    }

    async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Write, |file| file
-            .as_ref()
-            .write_at(buf, offset));
+        let result = with_file!(self, StorageIoOperation::Write, |file_guard| {
+            file_guard.with_std_file(|std_file| std_file.write_at(buf, offset))
+        });
        if let Ok(size) = result {
            STORAGE_IO_SIZE
                .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
@@ -600,18 +664,241 @@ impl VirtualFile {
    }
 }

-struct FileGuard<'a> {
-    slot_guard: RwLockReadGuard<'a, SlotInner>,
+// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
+pub async fn read_exact_at_impl<B, F, Fut>(
+    buf: B,
+    mut offset: u64,
+    mut read_at: F,
+) -> (B, std::io::Result<()>)
+where
+    B: IoBufMut + Send,
+    F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
+    Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
+{
+    use tokio_epoll_uring::BoundedBuf;
+    let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
+    while buf.bytes_total() != 0 {
+        let res;
+        (buf, res) = read_at(buf, offset).await;
+        match res {
+            Ok(0) => break,
+            Ok(n) => {
+                buf = buf.slice(n..);
+                offset += n as u64;
+            }
+            Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
+            Err(e) => return (buf.into_inner(), Err(e)),
+        }
+    }
+    // NB: don't use `buf.is_empty()` here; it is from the
+    // `impl Deref for Slice { Target = [u8] }`; the the &[u8]
+    // returned by it only covers the initialized portion of `buf`.
+    // Whereas we're interested in ensuring that we filled the entire
+    // buffer that the user passed in.
+    if buf.bytes_total() != 0 {
+        (
+            buf.into_inner(),
+            Err(std::io::Error::new(
+                std::io::ErrorKind::UnexpectedEof,
+                "failed to fill whole buffer",
+            )),
+        )
+    } else {
+        assert_eq!(buf.len(), buf.bytes_total());
+        (buf.into_inner(), Ok(()))
+    }
 }

-impl<'a> AsRef<File> for FileGuard<'a> {
-    fn as_ref(&self) -> &File {
+#[cfg(test)]
+mod test_read_exact_at_impl {
+
+    use std::{collections::VecDeque, sync::Arc};
+
+    use tokio_epoll_uring::{BoundedBuf, BoundedBufMut};
+
+    use super::read_exact_at_impl;
+
+    struct Expectation {
+        offset: u64,
+        bytes_total: usize,
+        result: std::io::Result<Vec<u8>>,
+    }
+    struct MockReadAt {
+        expectations: VecDeque<Expectation>,
+    }
+
+    impl MockReadAt {
+        async fn read_at(
+            &mut self,
+            mut buf: tokio_epoll_uring::Slice<Vec<u8>>,
+            offset: u64,
+        ) -> (tokio_epoll_uring::Slice<Vec<u8>>, std::io::Result<usize>) {
+            let exp = self
+                .expectations
+                .pop_front()
+                .expect("read_at called but we have no expectations left");
+            assert_eq!(exp.offset, offset);
+            assert_eq!(exp.bytes_total, buf.bytes_total());
+            match exp.result {
+                Ok(bytes) => {
+                    assert!(bytes.len() <= buf.bytes_total());
+                    buf.put_slice(&bytes);
+                    (buf, Ok(bytes.len()))
+                }
+                Err(e) => (buf, Err(e)),
+            }
+        }
+    }
+
+    impl Drop for MockReadAt {
+        fn drop(&mut self) {
+            assert_eq!(self.expectations.len(), 0);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_basic() {
+        let buf = Vec::with_capacity(5);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![Expectation {
+                offset: 0,
+                bytes_total: 5,
+                result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
+            }]),
+        }));
+        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+        assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
+    }
+
+    #[tokio::test]
+    async fn test_empty_buf_issues_no_syscall() {
+        let buf = Vec::new();
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::new(),
+        }));
+        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+    }
+
+    #[tokio::test]
+    async fn test_two_read_at_calls_needed_until_buf_filled() {
+        let buf = Vec::with_capacity(4);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![
+                Expectation {
+                    offset: 0,
+                    bytes_total: 4,
+                    result: Ok(vec![b'a', b'b']),
+                },
+                Expectation {
+                    offset: 2,
+                    bytes_total: 2,
+                    result: Ok(vec![b'c', b'd']),
+                },
+            ]),
+        }));
+        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+        assert_eq!(buf, vec![b'a', b'b', b'c', b'd']);
+    }
+
+    #[tokio::test]
+    async fn test_eof_before_buffer_full() {
+        let buf = Vec::with_capacity(3);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![
+                Expectation {
+                    offset: 0,
+                    bytes_total: 3,
+                    result: Ok(vec![b'a']),
+                },
+                Expectation {
+                    offset: 1,
+                    bytes_total: 2,
+                    result: Ok(vec![b'b']),
+                },
+                Expectation {
+                    offset: 2,
+                    bytes_total: 1,
+                    result: Ok(vec![]),
+                },
+            ]),
+        }));
+        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        let Err(err) = res else {
+            panic!("should return an error");
+        };
+        assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof);
+        assert_eq!(format!("{err}"), "failed to fill whole buffer");
+        // buffer contents on error are unspecified
+    }
+}
+
+struct FileGuard {
+    slot_guard: RwLockReadGuard<'static, SlotInner>,
+}
+
+impl AsRef<OwnedFd> for FileGuard {
+    fn as_ref(&self) -> &OwnedFd {
        // This unwrap is safe because we only create `FileGuard`s
        // if we know that the file is Some.
        self.slot_guard.file.as_ref().unwrap()
    }
 }

+impl FileGuard {
+    /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
+    fn with_std_file<F, R>(&self, with: F) -> R
+    where
+        F: FnOnce(&File) -> R,
+    {
+        // SAFETY:
+        // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
+        // - `&` usage below: `self` is `&`, hence Rust typesystem guarantees there are is no `&mut`
+        let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
+        let res = with(&file);
+        let _ = file.into_raw_fd();
+        res
+    }
+    /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
+    fn with_std_file_mut<F, R>(&mut self, with: F) -> R
+    where
+        F: FnOnce(&mut File) -> R,
+    {
+        // SAFETY:
+        // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
+        // - &mut usage below: `self` is `&mut`, hence this call is the only task/thread that has control over the underlying fd
+        let mut file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
+        let res = with(&mut file);
+        let _ = file.into_raw_fd();
+        res
+    }
+}
+
+impl tokio_epoll_uring::IoFd for FileGuard {
+    unsafe fn as_fd(&self) -> RawFd {
+        let owned_fd: &OwnedFd = self.as_ref();
+        owned_fd.as_raw_fd()
+    }
+}
+
 #[cfg(test)]
 impl VirtualFile {
    pub(crate) async fn read_blk(
@@ -619,16 +906,19 @@ impl VirtualFile {
        blknum: u32,
    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
        use crate::page_cache::PAGE_SZ;
-        let mut buf = [0; PAGE_SZ];
-        self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64))
+        let buf = vec![0; PAGE_SZ];
+        let buf = self
+            .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64))
            .await?;
-        Ok(std::sync::Arc::new(buf).into())
+        Ok(crate::tenant::block_io::BlockLease::Vec(buf))
    }

    async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
+        let mut tmp = vec![0; 128];
        loop {
-            let mut tmp = [0; 128];
-            match self.read_at(&mut tmp, self.pos).await {
+            let res;
+            (tmp, res) = self.read_at(tmp, self.pos).await;
+            match res {
                Ok(0) => return Ok(()),
                Ok(n) => {
                    self.pos += n as u64;
@@ -704,10 +994,12 @@ impl OpenFiles {
 /// Initialize the virtual file module. This must be called once at page
 /// server startup.
 ///
-pub fn init(num_slots: usize) {
+#[cfg(not(test))]
+pub fn init(num_slots: usize, engine: IoEngineKind) {
    if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
        panic!("virtual_file::init called twice");
    }
+    io_engine::init(engine);
    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }

@@ -752,10 +1044,10 @@ mod tests {
    }

    impl MaybeVirtualFile {
-        async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
+        async fn read_exact_at(&self, mut buf: Vec<u8>, offset: u64) -> Result<Vec<u8>, Error> {
            match self {
                MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
-                MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset),
+                MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
            }
        }
        async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
@@ -797,14 +1089,14 @@ mod tests {

        // Helper function to slurp a portion of a file into a string
        async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
-            let mut buf = vec![0; len];
-            self.read_exact_at(&mut buf, pos).await?;
+            let buf = vec![0; len];
+            let buf = self.read_exact_at(buf, pos).await?;
            Ok(String::from_utf8(buf).unwrap())
        }
    }

    #[tokio::test]
-    async fn test_virtual_files() -> Result<(), Error> {
+    async fn test_virtual_files() -> anyhow::Result<()> {
        // The real work is done in the test_files() helper function. This
        // allows us to run the same set of tests against a native File, and
        // VirtualFile. We trust the native Files and wouldn't need to test them,
@@ -820,14 +1112,17 @@ mod tests {
    }

    #[tokio::test]
-    async fn test_physical_files() -> Result<(), Error> {
+    async fn test_physical_files() -> anyhow::Result<()> {
        test_files("physical_files", |path, open_options| async move {
-            Ok(MaybeVirtualFile::File(open_options.open(path)?))
+            Ok(MaybeVirtualFile::File({
+                let owned_fd = open_options.open(path.as_std_path()).await?;
+                File::from(owned_fd)
+            }))
        })
        .await
    }

-    async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> Result<(), Error>
+    async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> anyhow::Result<()>
    where
        OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
        FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
@@ -971,11 +1266,11 @@ mod tests {
        for _threadno in 0..THREADS {
            let files = files.clone();
            let hdl = rt.spawn(async move {
-                let mut buf = [0u8; SIZE];
+                let mut buf = vec![0u8; SIZE];
                let mut rng = rand::rngs::OsRng;
                for _ in 1..1000 {
                    let f = &files[rng.gen_range(0..files.len())];
-                    f.read_exact_at(&mut buf, 0).await.unwrap();
+                    buf = f.read_exact_at(buf, 0).await.unwrap();
                    assert!(buf == SAMPLE);
                }
            });
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -0,0 +1,114 @@
+//! [`super::VirtualFile`] supports different IO engines.
+//!
+//! The [`IoEngineKind`] enum identifies them.
+//!
+//! The choice of IO engine is global.
+//! Initialize using [`init`].
+//!
+//! Then use [`get`] and  [`super::OpenOptions`].
+
+#[derive(
+    Copy,
+    Clone,
+    PartialEq,
+    Eq,
+    Hash,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+    Debug,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum IoEngineKind {
+    StdFs,
+    #[cfg(target_os = "linux")]
+    TokioEpollUring,
+}
+
+static IO_ENGINE: once_cell::sync::OnceCell<IoEngineKind> = once_cell::sync::OnceCell::new();
+
+#[cfg(not(test))]
+pub(super) fn init(engine: IoEngineKind) {
+    if IO_ENGINE.set(engine).is_err() {
+        panic!("called twice");
+    }
+    crate::metrics::virtual_file_io_engine::KIND
+        .with_label_values(&[&format!("{engine}")])
+        .set(1);
+}
+
+pub(super) fn get() -> &'static IoEngineKind {
+    #[cfg(test)]
+    {
+        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
+        IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) {
+            Ok(v) => match v.parse::<IoEngineKind>() {
+                Ok(engine_kind) => engine_kind,
+                Err(e) => {
+                    panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
+                }
+            },
+            Err(std::env::VarError::NotPresent) => {
+                crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
+                    .parse()
+                    .unwrap()
+            }
+            Err(std::env::VarError::NotUnicode(_)) => {
+                panic!("env var {env_var_name} is not unicode");
+            }
+        })
+    }
+    #[cfg(not(test))]
+    IO_ENGINE.get().unwrap()
+}
+
+use std::os::unix::prelude::FileExt;
+
+use super::FileGuard;
+
+impl IoEngineKind {
+    pub(super) async fn read_at<B>(
+        &self,
+        file_guard: FileGuard,
+        offset: u64,
+        mut buf: B,
+    ) -> ((FileGuard, B), std::io::Result<usize>)
+    where
+        B: tokio_epoll_uring::BoundedBufMut + Send,
+    {
+        match self {
+            IoEngineKind::StdFs => {
+                // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
+                let dst = unsafe {
+                    std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
+                };
+                let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset));
+                if let Ok(nbytes) = &res {
+                    assert!(*nbytes <= buf.bytes_total());
+                    // SAFETY: see above assertion
+                    unsafe {
+                        buf.set_init(*nbytes);
+                    }
+                }
+                #[allow(dropping_references)]
+                drop(dst);
+                ((file_guard, buf), res)
+            }
+            #[cfg(target_os = "linux")]
+            IoEngineKind::TokioEpollUring => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                let (resources, res) = system.read(file_guard, offset, buf).await;
+                (
+                    resources,
+                    res.map_err(|e| match e {
+                        tokio_epoll_uring::Error::Op(e) => e,
+                        tokio_epoll_uring::Error::System(system) => {
+                            std::io::Error::new(std::io::ErrorKind::Other, system)
+                        }
+                    }),
+                )
+            }
+        }
+    }
+}
--- a/pageserver/src/virtual_file/open_options.rs
+++ b/pageserver/src/virtual_file/open_options.rs
@@ -0,0 +1,138 @@
+//! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`];
+
+use super::IoEngineKind;
+use std::{os::fd::OwnedFd, path::Path};
+
+#[derive(Debug, Clone)]
+pub enum OpenOptions {
+    StdFs(std::fs::OpenOptions),
+    #[cfg(target_os = "linux")]
+    TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions),
+}
+
+impl Default for OpenOptions {
+    fn default() -> Self {
+        match super::io_engine::get() {
+            IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
+            #[cfg(target_os = "linux")]
+            IoEngineKind::TokioEpollUring => {
+                Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new())
+            }
+        }
+    }
+}
+
+impl OpenOptions {
+    pub fn new() -> OpenOptions {
+        Self::default()
+    }
+
+    pub fn read(&mut self, read: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.read(read);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.read(read);
+            }
+        }
+        self
+    }
+
+    pub fn write(&mut self, write: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.write(write);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.write(write);
+            }
+        }
+        self
+    }
+
+    pub fn create(&mut self, create: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.create(create);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.create(create);
+            }
+        }
+        self
+    }
+
+    pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.create_new(create_new);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.create_new(create_new);
+            }
+        }
+        self
+    }
+
+    pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.truncate(truncate);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.truncate(truncate);
+            }
+        }
+        self
+    }
+
+    pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result<OwnedFd> {
+        match self {
+            OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()),
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                system.open(path, x).await.map_err(|e| match e {
+                    tokio_epoll_uring::Error::Op(e) => e,
+                    tokio_epoll_uring::Error::System(system) => {
+                        std::io::Error::new(std::io::ErrorKind::Other, system)
+                    }
+                })
+            }
+        }
+    }
+}
+
+impl std::os::unix::prelude::OpenOptionsExt for OpenOptions {
+    fn mode(&mut self, mode: u32) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.mode(mode);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.mode(mode);
+            }
+        }
+        self
+    }
+
+    fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.custom_flags(flags);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.custom_flags(flags);
+            }
+        }
+        self
+    }
+}
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -26,8 +26,11 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};

+use std::str::FromStr;
+
 use anyhow::{bail, Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
+use hex::FromHex;
 use tracing::*;
 use utils::failpoint_support;

@@ -44,9 +47,10 @@ use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
 use postgres_ffi::v14::xlog_utils::*;
-use postgres_ffi::v14::CheckPoint;
+use postgres_ffi::v14::{bindings::FullTransactionId, CheckPoint};
 use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
+use utils::id::TenantId;
 use utils::lsn::Lsn;

 pub struct WalIngest {
@@ -109,6 +113,43 @@ impl WalIngest {
            self.checkpoint_modified = true;
        }

+        // BEGIN ONE-OFF HACK, version 3
+        //
+        // We had a bug where we incorrectly passed 0 to update_next_xid(). That was
+        // harmless as long as nextXid was < 2^31, because 0 looked like a very old
+        // XID. But once nextXid reaches 2^31, 0 starts to look like a very new XID, and
+        // we incorrectly bumped up nextXid to the next epoch, to value '1:1024'
+        //
+        // That bug was fixed in commits e4898a6e605e791a00ce21bf49d4cc0d9a10534a and
+        // c1148dc9acf938d912888ecb0a4e76ed40e21ef8, but we have one known timeline in
+        // production where that already happened. This is a one-off fix to fix that
+        // damage.
+        //
+        // So on that particular timeline, fix the incorrectly set nextXid to the XID from
+        // the next record we see, plus 10000 to give some safety margin.
+        //
+        // As a safety measure, disable this hack after they have reached LSN 380/00000000.
+        // As of this writing, they are around LSN 36D/00000000. They should not reach
+        // real XID wraparound until this LSN. We should remove this hack from production
+        // before that happens anyway, but better safe than sorry.
+        //
+        if self.checkpoint.nextXid.value == 4294968320 && // 1::1024, the incorrect value
+            // only apply this on the one broken tenant
+            modification.tline.tenant_shard_id.tenant_id == TenantId::from_hex("df254570a4f603805528b46b0d45a76c").unwrap() &&
+            lsn < Lsn::from_str("380/00000000").unwrap() &&
+            decoded.xl_xid != pg_constants::INVALID_TRANSACTION_ID
+        {
+            self.checkpoint.nextXid = FullTransactionId {
+                value: (decoded.xl_xid + 10000) as u64,
+            };
+            self.checkpoint_modified = true;
+            warn!(
+                "nextXid fixed by one-off hack at LSN {}, nextXid is now {}",
+                lsn, self.checkpoint.nextXid.value
+            );
+        }
+        // END ONE-OFF HACK
+
        match decoded.xl_rmid {
            pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
                // Heap AM records need some special handling, because they modify VM pages
@@ -1033,7 +1074,23 @@ impl WalIngest {
            // Copy content
            debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks);
            for blknum in 0..nblocks {
-                debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
+                // Sharding:
+                //  - src and dst are always on the same shard, because they differ only by dbNode, and
+                //    dbNode is not included in the hash inputs for sharding.
+                //  - This WAL command is replayed on all shards, but each shard only copies the blocks
+                //    that belong to it.
+                let src_key = rel_block_to_key(src_rel, blknum);
+                if !self.shard.is_key_local(&src_key) {
+                    debug!(
+                        "Skipping non-local key {} during XLOG_DBASE_CREATE",
+                        src_key
+                    );
+                    continue;
+                }
+                debug!(
+                    "copying block {} from {} ({}) to {}",
+                    blknum, src_rel, src_key, dst_rel
+                );

                let content = modification
                    .tline
@@ -1347,16 +1404,22 @@ impl WalIngest {
            self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
            self.checkpoint_modified = true;
        }
-        let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
-            if mbr.xid.wrapping_sub(acc) as i32 > 0 {
-                mbr.xid
+        let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| {
+            if let Some(max_xid) = acc {
+                if mbr.xid.wrapping_sub(max_xid) as i32 > 0 {
+                    Some(mbr.xid)
+                } else {
+                    acc
+                }
            } else {
-                acc
+                Some(mbr.xid)
            }
        });

-        if self.checkpoint.update_next_xid(max_mbr_xid) {
-            self.checkpoint_modified = true;
+        if let Some(max_xid) = max_mbr_xid {
+            if self.checkpoint.update_next_xid(max_xid) {
+                self.checkpoint_modified = true;
+            }
        }
        Ok(())
    }
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -15,6 +15,7 @@
 #include "postgres.h"

 #include "access/xlog.h"
+#include "common/hashfn.h"
 #include "fmgr.h"
 #include "libpq-fe.h"
 #include "libpq/libpq.h"
@@ -38,17 +39,6 @@
 #define MIN_RECONNECT_INTERVAL_USEC 1000
 #define MAX_RECONNECT_INTERVAL_USEC 1000000

-bool		connected = false;
-PGconn	   *pageserver_conn = NULL;
-
-/*
- * WaitEventSet containing:
- * - WL_SOCKET_READABLE on pageserver_conn,
- * - WL_LATCH_SET on MyLatch, and
- * - WL_EXIT_ON_PM_DEATH.
- */
-WaitEventSet *pageserver_conn_wes = NULL;
-
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
@@ -59,17 +49,25 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;

-static int n_reconnect_attempts = 0;
-static int max_reconnect_attempts = 60;
+static int	n_reconnect_attempts = 0;
+static int	max_reconnect_attempts = 60;
+static int	stripe_size;

-#define MAX_PAGESERVER_CONNSTRING_SIZE 256
+typedef struct
+{
+	char		connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
+	size_t		num_shards;
+} ShardMap;

 /*
+ * PagestoreShmemState is kept in shared memory. It contains the connection
+ * strings for each shard.
+ *
 * The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option,
 * allowing it to be changed using pg_reload_conf(). The control plane can
 * update the connection string if the pageserver crashes, is relocated, or
- * new shards are added. A copy of the current value of the GUC is kept in
- * shared memory, updated by the postmaster, because regular backends don't
+ * new shards are added. A parsed copy of the current value of the GUC is kept
+ * in shared memory, updated by the postmaster, because regular backends don't
 * reload the config during query execution, but we might need to re-establish
 * the pageserver connection with the new connection string even in the middle
 * of a query.
@@ -84,7 +82,7 @@ typedef struct
 {
 	pg_atomic_uint64 begin_update_counter;
 	pg_atomic_uint64 end_update_counter;
-	char		pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
+	ShardMap	shard_map;
 } PagestoreShmemState;

 #if PG_VERSION_NUM >= 150000
@@ -94,10 +92,25 @@ static void walproposer_shmem_request(void);
 static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;
-static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];

-static bool pageserver_flush(void);
-static void pageserver_disconnect(void);
+/* This backend's per-shard connections */
+typedef struct
+{
+	PGconn	   *conn;
+
+	/*---
+	 * WaitEventSet containing:
+	 * - WL_SOCKET_READABLE on 'conn'
+	 * - WL_LATCH_SET on MyLatch, and
+	 * - WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet *wes;
+} PageServer;
+
+static PageServer page_servers[MAX_SHARDS];
+
+static bool pageserver_flush(shardno_t shard_no);
+static void pageserver_disconnect(shardno_t shard_no);

 static bool
 PagestoreShmemIsValid(void)
@@ -105,89 +118,213 @@ PagestoreShmemIsValid(void)
 	return pagestore_shared && UsedShmemSegAddr;
 }

+/*
+ * Parse a comma-separated list of connection strings into a ShardMap.
+ *
+ * If 'result' is NULL, just checks that the input is valid. If the input is
+ * not valid, returns false. The contents of *result are undefined in
+ * that case, and must not be relied on.
+ */
+static bool
+ParseShardMap(const char *connstr, ShardMap *result)
+{
+	const char *p;
+	int			nshards = 0;
+
+	if (result)
+		memset(result, 0, sizeof(ShardMap));
+
+	p = connstr;
+	nshards = 0;
+	for (;;)
+	{
+		const char *sep;
+		size_t		connstr_len;
+
+		sep = strchr(p, ',');
+		connstr_len = sep != NULL ? sep - p : strlen(p);
+
+		if (connstr_len == 0 && sep == NULL)
+			break;				/* ignore trailing comma */
+
+		if (nshards >= MAX_SHARDS)
+		{
+			neon_log(LOG, "Too many shards");
+			return false;
+		}
+		if (connstr_len >= MAX_PAGESERVER_CONNSTRING_SIZE)
+		{
+			neon_log(LOG, "Connection string too long");
+			return false;
+		}
+		if (result)
+		{
+			memcpy(result->connstring[nshards], p, connstr_len);
+			result->connstring[nshards][connstr_len] = '\0';
+		}
+		nshards++;
+
+		if (sep == NULL)
+			break;
+		p = sep + 1;
+	}
+	if (result)
+		result->num_shards = nshards;
+
+	return true;
+}
+
 static bool
 CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 {
-	return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
+	char	   *p = *newval;
+
+	return ParseShardMap(p, NULL);
 }

 static void
 AssignPageserverConnstring(const char *newval, void *extra)
 {
+	ShardMap	shard_map;
+
 	/*
 	 * Only postmaster updates the copy in shared memory.
 	 */
 	if (!PagestoreShmemIsValid() || IsUnderPostmaster)
 		return;

-	pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
-	pg_write_barrier();
-	strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
-	pg_write_barrier();
-	pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
-}
-
-static bool
-CheckConnstringUpdated(void)
-{
-	if (!PagestoreShmemIsValid())
-		return false;
-	return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
+	if (!ParseShardMap(newval, &shard_map))
+	{
+		/*
+		 * shouldn't happen, because we already checked the value in
+		 * CheckPageserverConnstring
+		 */
+		elog(ERROR, "could not parse shard map");
+	}
+
+	if (memcmp(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap)) != 0)
+	{
+		pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
+		pg_write_barrier();
+		memcpy(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap));
+		pg_write_barrier();
+		pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
+	}
+	else
+	{
+		/* no change */
+	}
 }

+/*
+ * Get the current number of shards, and/or the connection string for a
+ * particular shard from the shard map in shared memory.
+ *
+ * If num_shards_p is not NULL, it is set to the current number of shards.
+ *
+ * If connstr_p is not NULL, the connection string for 'shard_no' is copied to
+ * it. It must point to a buffer at least MAX_PAGESERVER_CONNSTRING_SIZE bytes
+ * long.
+ *
+ * As a side-effect, if the shard map in shared memory had changed since the
+ * last call, terminates all existing connections to all pageservers.
+ */
 static void
-ReloadConnstring(void)
+load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
 {
 	uint64		begin_update_counter;
 	uint64		end_update_counter;
-
-	if (!PagestoreShmemIsValid())
-		return;
+	ShardMap   *shard_map = &pagestore_shared->shard_map;
+	shardno_t	num_shards;

 	/*
-	 * Copy the current settnig from shared to local memory. Postmaster can
-	 * update the value concurrently, in which case we would copy a garbled
-	 * mix of the old and new values. We will detect it because the counter's
-	 * won't match, and retry. But it's important that we don't do anything
-	 * within the retry-loop that would depend on the string having valid
-	 * contents.
+	 * Postmaster can update the shared memory values concurrently, in which
+	 * case we would copy a garbled mix of the old and new values. We will
+	 * detect it because the counter's won't match, and retry. But it's
+	 * important that we don't do anything within the retry-loop that would
+	 * depend on the string having valid contents.
 	 */
 	do
 	{
 		begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
 		end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
-		pg_read_barrier();

-		strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
-		pg_read_barrier();
+		num_shards = shard_map->num_shards;
+		if (connstr_p && shard_no < MAX_SHARDS)
+			strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE);
+		pg_memory_barrier();
 	}
 	while (begin_update_counter != end_update_counter
 		   || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
 		   || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));

-	pagestore_local_counter = end_update_counter;
+	if (connstr_p && shard_no >= num_shards)
+		neon_log(ERROR, "Shard %d is greater or equal than number of shards %d",
+				 shard_no, num_shards);
+
+	/*
+	 * If any of the connection strings changed, reset all connections.
+	 */
+	if (pagestore_local_counter != end_update_counter)
+	{
+		for (shardno_t i = 0; i < MAX_SHARDS; i++)
+		{
+			if (page_servers[i].conn)
+				pageserver_disconnect(i);
+		}
+		pagestore_local_counter = end_update_counter;
+	}
+
+	if (num_shards_p)
+		*num_shards_p = num_shards;
+}
+
+#define MB (1024*1024)
+
+shardno_t
+get_shard_number(BufferTag *tag)
+{
+	shardno_t	n_shards;
+	uint32		hash;
+
+	load_shard_map(0, NULL, &n_shards);
+
+#if PG_MAJORVERSION_NUM < 16
+	hash = murmurhash32(tag->rnode.relNode);
+	hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
+#else
+	hash = murmurhash32(tag->relNumber);
+	hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
+#endif
+
+	return hash % n_shards;
 }

 static bool
-pageserver_connect(int elevel)
+pageserver_connect(shardno_t shard_no, int elevel)
 {
 	char	   *query;
 	int			ret;
 	const char *keywords[3];
 	const char *values[3];
 	int			n;
+	PGconn	   *conn;
+	WaitEventSet *wes;
+	char		connstr[MAX_PAGESERVER_CONNSTRING_SIZE];

 	static TimestampTz last_connect_time = 0;
 	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
 	TimestampTz now;
 	uint64_t	us_since_last_connect;

-	Assert(!connected);
+	Assert(page_servers[shard_no].conn == NULL);

-	if (CheckConnstringUpdated())
-	{
-		ReloadConnstring();
-	}
+	/*
+	 * Get the connection string for this shard. If the shard map has been
+	 * updated since we last looked, this will also disconnect any existing
+	 * pageserver connections as a side effect.
+	 */
+	load_shard_map(shard_no, connstr, NULL);

 	now = GetCurrentTimestamp();
 	us_since_last_connect = now - last_connect_time;
@@ -223,76 +360,84 @@ pageserver_connect(int elevel)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = local_pageserver_connstring;
+	values[n] = connstr;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
 	n++;
-	pageserver_conn = PQconnectdbParams(keywords, values, 1);
+	conn = PQconnectdbParams(keywords, values, 1);

-	if (PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (PQstatus(conn) == CONNECTION_BAD)
 	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		char	   *msg = pchomp(PQerrorMessage(conn));

-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
+		PQfinish(conn);

 		ereport(elevel,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-				 errmsg(NEON_TAG "could not establish connection to pageserver"),
+				 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
 				 errdetail_internal("%s", msg)));
+		pfree(msg);
 		return false;
 	}
-
 	query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
-	ret = PQsendQuery(pageserver_conn, query);
+	ret = PQsendQuery(conn, query);
+	pfree(query);
 	if (ret != 1)
 	{
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		neon_log(elevel, "could not send pagestream command to pageserver");
+		PQfinish(conn);
+		neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
 		return false;
 	}

-	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
+	wes = CreateWaitEventSet(TopMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);

-	while (PQisBusy(pageserver_conn))
+	PG_TRY();
 	{
-		WaitEvent	event;
-
-		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
-		ResetLatch(MyLatch);
-
-		CHECK_FOR_INTERRUPTS();
-
-		/* Data available in socket? */
-		if (event.events & WL_SOCKET_READABLE)
+		while (PQisBusy(conn))
 		{
-			if (!PQconsumeInput(pageserver_conn))
+			WaitEvent	event;
+
+			/* Sleep until there's something to do */
+			(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+			ResetLatch(MyLatch);
+
+			CHECK_FOR_INTERRUPTS();
+
+			/* Data available in socket? */
+			if (event.events & WL_SOCKET_READABLE)
 			{
-				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+				if (!PQconsumeInput(conn))
+				{
+					char	   *msg = pchomp(PQerrorMessage(conn));

-				PQfinish(pageserver_conn);
-				pageserver_conn = NULL;
-				FreeWaitEventSet(pageserver_conn_wes);
-				pageserver_conn_wes = NULL;
+					PQfinish(conn);
+					FreeWaitEventSet(wes);

-				neon_log(elevel, "could not complete handshake with pageserver: %s",
-						 msg);
-				return false;
+					neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
+								   msg);
+					return false;
+				}
 			}
 		}
 	}
+	PG_CATCH();
+	{
+		PQfinish(conn);
+		FreeWaitEventSet(wes);
+		PG_RE_THROW();
+	}
+	PG_END_TRY();

-	neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
+	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
+	page_servers[shard_no].conn = conn;
+	page_servers[shard_no].wes = wes;

-	connected = true;
 	return true;
 }

@@ -300,9 +445,10 @@ pageserver_connect(int elevel)
 * A wrapper around PQgetCopyData that checks for interrupts while sleeping.
 */
 static int
-call_PQgetCopyData(char **buffer)
+call_PQgetCopyData(shardno_t shard_no, char **buffer)
 {
 	int			ret;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -312,7 +458,7 @@ retry:
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -324,7 +470,7 @@ retry:
 			{
 				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-				neon_log(LOG, "could not get response from pageserver: %s", msg);
+				neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg);
 				pfree(msg);
 				return -1;
 			}
@@ -338,7 +484,7 @@ retry:


 static void
-pageserver_disconnect(void)
+pageserver_disconnect(shardno_t shard_no)
 {
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
@@ -347,38 +493,38 @@ pageserver_disconnect(void)
 	 * time later after we have already sent a new unrelated request. Close
 	 * the connection to avoid getting confused.
 	 */
-	if (connected)
+	if (page_servers[shard_no].conn)
 	{
-		neon_log(LOG, "dropping connection to page server due to error");
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		connected = false;
+		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
+		PQfinish(page_servers[shard_no].conn);
+		page_servers[shard_no].conn = NULL;

+		/*
+		 * If the connection to any pageserver is lost, we throw away the
+		 * whole prefetch queue, even for other pageservers. It should not
+		 * cause big problems, because connection loss is supposed to be a
+		 * rare event.
+		 */
 		prefetch_on_ps_disconnect();
 	}
-	if (pageserver_conn_wes != NULL)
+	if (page_servers[shard_no].wes != NULL)
 	{
-		FreeWaitEventSet(pageserver_conn_wes);
-		pageserver_conn_wes = NULL;
+		FreeWaitEventSet(page_servers[shard_no].wes);
+		page_servers[shard_no].wes = NULL;
 	}
 }

 static bool
-pageserver_send(NeonRequest *request)
+pageserver_send(shardno_t shard_no, NeonRequest *request)
 {
 	StringInfoData req_buff;
-
-	if (CheckConnstringUpdated())
-	{
-		pageserver_disconnect();
-		ReloadConnstring();
-	}
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

 	/* If the connection was lost for some reason, reconnect */
-	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
-		neon_log(LOG, "pageserver_send disconnect bad connection");
-		pageserver_disconnect();
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
+		pageserver_disconnect(shard_no);
 	}

 	req_buff = nm_pack_request(request);
@@ -392,9 +538,9 @@ pageserver_send(NeonRequest *request)
 	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
 	 * connection in case of failure.
 	 */
-	if (!connected)
+	if (!page_servers[shard_no].conn)
 	{
-		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
 			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
@@ -402,6 +548,8 @@ pageserver_send(NeonRequest *request)
 		n_reconnect_attempts = 0;
 	}

+	pageserver_conn = page_servers[shard_no].conn;
+
 	/*
 	 * Send request.
 	 *
@@ -414,8 +562,8 @@ pageserver_send(NeonRequest *request)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-		pageserver_disconnect();
-		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -427,19 +575,20 @@ pageserver_send(NeonRequest *request)
 	{
 		char	   *msg = nm_to_string((NeonMessage *) request);

-		neon_log(PageStoreTrace, "sent request: %s", msg);
+		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
 	return true;
 }

 static NeonResponse *
-pageserver_receive(void)
+pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

-	if (!connected)
+	if (!pageserver_conn)
 		return NULL;

 	PG_TRY();
@@ -447,7 +596,7 @@ pageserver_receive(void)
 		/* read response */
 		int			rc;

-		rc = call_PQgetCopyData(&resp_buff.data);
+		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
 		if (rc >= 0)
 		{
 			resp_buff.len = rc;
@@ -459,33 +608,33 @@ pageserver_receive(void)
 			{
 				char	   *msg = nm_to_string((NeonMessage *) resp);

-				neon_log(PageStoreTrace, "got response: %s", msg);
+				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
 				pfree(msg);
 			}
 		}
 		else if (rc == -1)
 		{
-			neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
-			pageserver_disconnect();
+			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
+			pageserver_disconnect(shard_no);
 			resp = NULL;
 		}
 		else if (rc == -2)
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
 		}
 		else
 		{
-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
 		}
 	}
 	PG_CATCH();
 	{
-		neon_log(LOG, "pageserver_receive disconnect due to caught exception");
-		pageserver_disconnect();
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
+		pageserver_disconnect(shard_no);
 		PG_RE_THROW();
 	}
 	PG_END_TRY();
@@ -495,11 +644,13 @@ pageserver_receive(void)


 static bool
-pageserver_flush(void)
+pageserver_flush(shardno_t shard_no)
 {
-	if (!connected)
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+
+	if (!pageserver_conn)
 	{
-		neon_log(WARNING, "Tried to flush while disconnected");
+		neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
 	}
 	else
 	{
@@ -507,8 +658,8 @@ pageserver_flush(void)
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-			pageserver_disconnect();
-			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
 			pfree(msg);
 			return false;
 		}
@@ -550,6 +701,7 @@ PagestoreShmemInit(void)
 	{
 		pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
 		pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
+		memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
 		AssignPageserverConnstring(page_server_connstring, NULL);
 	}
 	LWLockRelease(AddinShmemInitLock);
@@ -624,6 +776,15 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);

+	DefineCustomIntVariable("neon.stripe_size",
+							"sharding stripe size",
+							NULL,
+							&stripe_size,
+							32768, 1, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_BLOCKS,
+							NULL, NULL, NULL);
+
 	DefineCustomIntVariable("neon.max_cluster_size",
 							"cluster size limit",
 							NULL,
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -20,9 +20,13 @@
 #include "lib/stringinfo.h"
 #include "libpq/pqformat.h"
 #include "storage/block.h"
+#include "storage/buf_internals.h"
 #include "storage/smgr.h"
 #include "utils/memutils.h"

+#define MAX_SHARDS 128
+#define MAX_PAGESERVER_CONNSTRING_SIZE 256
+
 typedef enum
 {
 	/* pagestore_client -> pagestore */
@@ -51,6 +55,9 @@ typedef struct
 #define neon_log(tag, fmt, ...) ereport(tag,                                  \
 										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
 										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag,	\
+														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
+														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))

 /*
 * supertype of all the Neon*Request structs below
@@ -141,11 +148,13 @@ extern char *nm_to_string(NeonMessage *msg);
 * API
 */

+typedef unsigned shardno_t;
+
 typedef struct
 {
-	bool		(*send) (NeonRequest *request);
-	NeonResponse *(*receive) (void);
-	bool		(*flush) (void);
+	bool		(*send) (shardno_t  shard_no, NeonRequest * request);
+	NeonResponse *(*receive) (shardno_t shard_no);
+	bool		(*flush) (shardno_t shard_no);
 } page_server_api;

 extern void prefetch_on_ps_disconnect(void);
@@ -159,6 +168,8 @@ extern char *neon_timeline;
 extern char *neon_tenant;
 extern int32 max_cluster_size;

+extern shardno_t get_shard_number(BufferTag* tag);
+
 extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -172,6 +172,7 @@ typedef struct PrefetchRequest
 	XLogRecPtr	actual_request_lsn;
 	NeonResponse *response;		/* may be null */
 	PrefetchStatus status;
+	shardno_t   shard_no;
 	uint64		my_ring_index;
 } PrefetchRequest;

@@ -239,10 +240,17 @@ typedef struct PrefetchState
 								 * also unused */

 	/* the buffers */
-	prfh_hash  *prf_hash;
+	prfh_hash	*prf_hash;
+	int			max_shard_no;
+	/* Mark shards involved in prefetch */
+	uint8		shard_bitmap[(MAX_SHARDS + 7)/8];
 	PrefetchRequest prf_buffer[];	/* prefetch buffers */
 } PrefetchState;

+#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))
+#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7))
+#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7))
+
 static PrefetchState *MyPState;

 #define GetPrfSlot(ring_index) ( \
@@ -327,6 +335,7 @@ compact_prefetch_buffers(void)
 		Assert(target_slot->status == PRFS_UNUSED);

 		target_slot->buftag = source_slot->buftag;
+		target_slot->shard_no = source_slot->shard_no;
 		target_slot->status = source_slot->status;
 		target_slot->response = source_slot->response;
 		target_slot->effective_request_lsn = source_slot->effective_request_lsn;
@@ -494,6 +503,23 @@ prefetch_cleanup_trailing_unused(void)
 	}
 }

+
+static bool
+prefetch_flush_requests(void)
+{
+	for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
+	{
+		if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no))
+		{
+			if (!page_server->flush(shard_no))
+				return false;
+			BITMAP_CLR(MyPState->shard_bitmap, shard_no);
+		}
+	}
+	MyPState->max_shard_no = 0;
+	return true;
+}
+
 /*
 * Wait for slot of ring_index to have received its response.
 * The caller is responsible for making sure the request buffer is flushed.
@@ -509,7 +535,7 @@ prefetch_wait_for(uint64 ring_index)
 	if (MyPState->ring_flush <= ring_index &&
 		MyPState->ring_unused > MyPState->ring_flush)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 			return false;
 		MyPState->ring_flush = MyPState->ring_unused;
 	}
@@ -547,7 +573,7 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->my_ring_index == MyPState->ring_receive);

 	old = MemoryContextSwitchTo(MyPState->errctx);
-	response = (NeonResponse *) page_server->receive();
+	response = (NeonResponse *) page_server->receive(slot->shard_no);
 	MemoryContextSwitchTo(old);
 	if (response)
 	{
@@ -704,12 +730,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);

-	while (!page_server->send((NeonRequest *) &request));
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request));

 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
 	MyPState->n_unused -= 1;
 	MyPState->ring_unused += 1;
+	BITMAP_SET(MyPState->shard_bitmap, slot->shard_no);
+	MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);

 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
@@ -880,6 +908,7 @@ Retry:
 	 * function reads the buffer tag from the slot.
 	 */
 	slot->buftag = tag;
+	slot->shard_no = get_shard_number(&tag);
 	slot->my_ring_index = ring_index;

 	prefetch_do_request(slot, force_latest, force_lsn);
@@ -890,7 +919,7 @@ Retry:
 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 		{
 			/*
 			 * Prefetch set is reset in case of error, so we should try to
@@ -908,13 +937,44 @@ static NeonResponse *
 page_server_request(void const *req)
 {
 	NeonResponse *resp;
+	BufferTag tag = {0};
+	shardno_t shard_no;
+
+	switch (((NeonRequest *) req)->tag)
+	{
+		case T_NeonExistsRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
+			break;
+		case T_NeonNblocksRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
+			break;
+		case T_NeonDbSizeRequest:
+			NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
+			break;
+		case T_NeonGetPageRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
+			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
+			break;
+		default:
+			neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
+	}
+	shard_no = get_shard_number(&tag);
+
+
+	/*
+	 * Current sharding model assumes that all metadata is present only at shard 0.
+	 * We still need to call get_shard_no() to check if shard map is up-to-date.
+	 */
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
+	{
+		shard_no = 0;
+	}

 	do
 	{
-		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
-		MyPState->ring_flush = MyPState->ring_unused;
+		while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
 		consume_prefetch_responses();
-		resp = page_server->receive();
+		resp = page_server->receive(shard_no);
 	} while (resp == NULL);
 	return resp;

@@ -2098,8 +2158,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		case T_NeonErrorResponse:
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
-					 errmsg(NEON_TAG "could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
-							blkno,
+					 errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							slot->shard_no, blkno,
 							RelFileInfoFmt(rinfo),
 							forkNum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -3,7 +3,6 @@ mod hacks;
 mod link;

 pub use link::LinkAuthError;
-use smol_str::SmolStr;
 use tokio_postgres::config::AuthKeys;

 use crate::auth::credentials::check_peer_addr_is_in_list;
@@ -16,7 +15,6 @@ use crate::context::RequestMonitoring;
 use crate::proxy::connect_compute::handle_try_wake;
 use crate::proxy::retry::retry_after;
 use crate::proxy::NeonOptions;
-use crate::scram;
 use crate::stream::Stream;
 use crate::{
    auth::{self, ComputeUserInfoMaybeEndpoint},
@@ -28,6 +26,7 @@ use crate::{
    },
    stream, url,
 };
+use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
 use futures::TryFutureExt;
 use std::borrow::Cow;
 use std::ops::ControlFlow;
@@ -130,19 +129,19 @@ pub struct ComputeCredentials<T> {

 #[derive(Debug, Clone)]
 pub struct ComputeUserInfoNoEndpoint {
-    pub user: SmolStr,
+    pub user: RoleName,
    pub options: NeonOptions,
 }

 #[derive(Debug, Clone)]
 pub struct ComputeUserInfo {
-    pub endpoint: SmolStr,
-    pub user: SmolStr,
+    pub endpoint: EndpointId,
+    pub user: RoleName,
    pub options: NeonOptions,
 }

 impl ComputeUserInfo {
-    pub fn endpoint_cache_key(&self) -> SmolStr {
+    pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
        self.options.get_cache_key(&self.endpoint)
    }
 }
@@ -158,7 +157,7 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
    type Error = ComputeUserInfoNoEndpoint;

    fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result<Self, Self::Error> {
-        match user_info.project {
+        match user_info.endpoint_id {
            None => Err(ComputeUserInfoNoEndpoint {
                user: user_info.user,
                options: user_info.options,
@@ -191,7 +190,10 @@ async fn auth_quirks(
        Err(info) => {
            let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
                .await?;
-            ctx.set_endpoint_id(Some(res.info.endpoint.clone()));
+
+            ctx.set_endpoint_id(res.info.endpoint.clone());
+            tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
+
            (res.info, Some(res.keys))
        }
        Ok(info) => (info, None),
@@ -272,19 +274,12 @@ async fn authenticate_with_secret(
    classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
 }

-/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
-/// only if authentication was successfuly.
-async fn auth_and_wake_compute(
+/// wake a compute (or retrieve an existing compute session from cache)
+async fn wake_compute(
    ctx: &mut RequestMonitoring,
    api: &impl console::Api,
-    user_info: ComputeUserInfoMaybeEndpoint,
-    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
-    allow_cleartext: bool,
-    config: &'static AuthenticationConfig,
+    compute_credentials: ComputeCredentials<ComputeCredentialKeys>,
 ) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
-    let compute_credentials =
-        auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?;
-
    let mut num_retries = 0;
    let mut node = loop {
        let wake_res = api.wake_compute(ctx, &compute_credentials.info).await;
@@ -317,11 +312,11 @@ async fn auth_and_wake_compute(

 impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
    /// Get compute endpoint name from the credentials.
-    pub fn get_endpoint(&self) -> Option<SmolStr> {
+    pub fn get_endpoint(&self) -> Option<EndpointId> {
        use BackendType::*;

        match self {
-            Console(_, user_info) => user_info.project.clone(),
+            Console(_, user_info) => user_info.endpoint_id.clone(),
            Link(_) => Some("link".into()),
            #[cfg(test)]
            Test(_) => Some("test".into()),
@@ -355,20 +350,20 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
            Console(api, user_info) => {
                info!(
                    user = &*user_info.user,
-                    project = user_info.project(),
+                    project = user_info.endpoint(),
                    "performing authentication using the console"
                );

-                let (cache_info, user_info) =
-                    auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
-                        .await?;
+                let compute_credentials =
+                    auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
+                let (cache_info, user_info) = wake_compute(ctx, &*api, compute_credentials).await?;
                (cache_info, BackendType::Console(api, user_info))
            }
            // NOTE: this auth backend doesn't use client credentials.
            Link(url) => {
                info!("performing link authentication");

-                let node_info = link::authenticate(&url, client).await?;
+                let node_info = link::authenticate(ctx, &url, client).await?;

                (
                    CachedNodeInfo::new_uncached(node_info),
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -1,6 +1,7 @@
 use crate::{
    auth, compute,
    console::{self, provider::NodeInfo},
+    context::RequestMonitoring,
    error::UserFacingError,
    stream::PqStream,
    waiters,
@@ -54,6 +55,7 @@ pub fn new_psql_session_id() -> String {
 }

 pub(super) async fn authenticate(
+    ctx: &mut RequestMonitoring,
    link_uri: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<NodeInfo> {
@@ -94,6 +96,10 @@ pub(super) async fn authenticate(
        .dbname(&db_info.dbname)
        .user(&db_info.user);

+    ctx.set_user(db_info.user.into());
+    ctx.set_project(db_info.aux.clone());
+    tracing::Span::current().record("ep", &tracing::field::display(&db_info.aux.endpoint_id));
+
    // Backwards compatibility. pg_sni_proxy uses "--" in domain names
    // while direct connections do not. Once we migrate to pg_sni_proxy
    // everywhere, we can remove this.
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -2,7 +2,8 @@

 use crate::{
    auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
-    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI,
+    EndpointId, RoleName,
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -21,7 +22,10 @@ pub enum ComputeUserInfoParseError {
         SNI ('{}') and project option ('{}').",
        .domain, .option,
    )]
-    InconsistentProjectNames { domain: SmolStr, option: SmolStr },
+    InconsistentProjectNames {
+        domain: EndpointId,
+        option: EndpointId,
+    },

    #[error(
        "Common name inferred from SNI ('{}') is not known",
@@ -30,7 +34,7 @@ pub enum ComputeUserInfoParseError {
    UnknownCommonName { cn: String },

    #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
-    MalformedProjectName(SmolStr),
+    MalformedProjectName(EndpointId),
 }

 impl UserFacingError for ComputeUserInfoParseError {}
@@ -39,24 +43,22 @@ impl UserFacingError for ComputeUserInfoParseError {}
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ComputeUserInfoMaybeEndpoint {
-    pub user: SmolStr,
-    // TODO: this is a severe misnomer! We should think of a new name ASAP.
-    pub project: Option<SmolStr>,
-
+    pub user: RoleName,
+    pub endpoint_id: Option<EndpointId>,
    pub options: NeonOptions,
 }

 impl ComputeUserInfoMaybeEndpoint {
    #[inline]
-    pub fn project(&self) -> Option<&str> {
-        self.project.as_deref()
+    pub fn endpoint(&self) -> Option<&str> {
+        self.endpoint_id.as_deref()
    }
 }

-pub fn endpoint_sni<'a>(
-    sni: &'a str,
+pub fn endpoint_sni(
+    sni: &str,
    common_names: &HashSet<String>,
-) -> Result<&'a str, ComputeUserInfoParseError> {
+) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
    let Some((subdomain, common_name)) = sni.split_once('.') else {
        return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
    };
@@ -65,7 +67,10 @@ pub fn endpoint_sni<'a>(
            cn: common_name.into(),
        });
    }
-    Ok(subdomain)
+    if subdomain == SERVERLESS_DRIVER_SNI {
+        return Ok(None);
+    }
+    Ok(Some(EndpointId::from(subdomain)))
 }

 impl ComputeUserInfoMaybeEndpoint {
@@ -79,15 +84,14 @@ impl ComputeUserInfoMaybeEndpoint {

        // Some parameters are stored in the startup message.
        let get_param = |key| params.get(key).ok_or(MissingKey(key));
-        let user: SmolStr = get_param("user")?.into();
+        let user: RoleName = get_param("user")?.into();

        // record the values if we have them
        ctx.set_application(params.get("application_name").map(SmolStr::from));
        ctx.set_user(user.clone());
-        ctx.set_endpoint_id(sni.map(SmolStr::from));

        // Project name might be passed via PG's command-line options.
-        let project_option = params
+        let endpoint_option = params
            .options_raw()
            .and_then(|options| {
                // We support both `project` (deprecated) and `endpoint` options for backward compatibility.
@@ -100,9 +104,9 @@ impl ComputeUserInfoMaybeEndpoint {
            })
            .map(|name| name.into());

-        let project_from_domain = if let Some(sni_str) = sni {
+        let endpoint_from_domain = if let Some(sni_str) = sni {
            if let Some(cn) = common_names {
-                Some(SmolStr::from(endpoint_sni(sni_str, cn)?))
+                endpoint_sni(sni_str, cn)?
            } else {
                None
            }
@@ -110,26 +114,31 @@ impl ComputeUserInfoMaybeEndpoint {
            None
        };

-        let project = match (project_option, project_from_domain) {
+        let endpoint = match (endpoint_option, endpoint_from_domain) {
            // Invariant: if we have both project name variants, they should match.
            (Some(option), Some(domain)) if option != domain => {
                Some(Err(InconsistentProjectNames { domain, option }))
            }
            // Invariant: project name may not contain certain characters.
-            (a, b) => a.or(b).map(|name| match project_name_valid(&name) {
+            (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
                false => Err(MalformedProjectName(name)),
                true => Ok(name),
            }),
        }
        .transpose()?;

-        info!(%user, project = project.as_deref(), "credentials");
+        if let Some(ep) = &endpoint {
+            ctx.set_endpoint_id(ep.clone());
+            tracing::Span::current().record("ep", &tracing::field::display(ep));
+        }
+
+        info!(%user, project = endpoint.as_deref(), "credentials");
        if sni.is_some() {
            info!("Connection with sni");
            NUM_CONNECTION_ACCEPTED_BY_SNI
                .with_label_values(&["sni"])
                .inc();
-        } else if project.is_some() {
+        } else if endpoint.is_some() {
            NUM_CONNECTION_ACCEPTED_BY_SNI
                .with_label_values(&["no_sni"])
                .inc();
@@ -145,7 +154,7 @@ impl ComputeUserInfoMaybeEndpoint {

        Ok(Self {
            user,
-            project,
+            endpoint_id: endpoint,
            options,
        })
    }
@@ -238,7 +247,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project, None);
+        assert_eq!(user_info.endpoint_id, None);

        Ok(())
    }
@@ -253,7 +262,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project, None);
+        assert_eq!(user_info.endpoint_id, None);

        Ok(())
    }
@@ -269,7 +278,7 @@ mod tests {
        let user_info =
            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("foo"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
        assert_eq!(user_info.options.get_cache_key("foo"), "foo");

        Ok(())
@@ -285,7 +294,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("bar"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));

        Ok(())
    }
@@ -300,7 +309,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("bar"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));

        Ok(())
    }
@@ -318,7 +327,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert!(user_info.project.is_none());
+        assert!(user_info.endpoint_id.is_none());

        Ok(())
    }
@@ -333,7 +342,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert!(user_info.project.is_none());
+        assert!(user_info.endpoint_id.is_none());

        Ok(())
    }
@@ -349,7 +358,7 @@ mod tests {
        let user_info =
            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("baz"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));

        Ok(())
    }
@@ -363,14 +372,14 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info =
            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("p1"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.b.com");
        let mut ctx = RequestMonitoring::test();
        let user_info =
            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("p1"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));

        Ok(())
    }
@@ -427,7 +436,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info =
            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("project"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
        assert_eq!(
            user_info.options.get_cache_key("project"),
            "project endpoint_type:read_write lsn:0/2"
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -4,10 +4,11 @@
 //! UPDATE (Mon Aug  8 13:20:34 UTC 2022): the payload format has been simplified.

 use bstr::ByteSlice;
-use smol_str::SmolStr;
+
+use crate::EndpointId;

 pub struct PasswordHackPayload {
-    pub endpoint: SmolStr,
+    pub endpoint: EndpointId,
    pub password: Vec<u8>,
 }

--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -272,5 +272,5 @@ async fn handle_client(
    let client = tokio::net::TcpStream::connect(destination).await?;

    let metrics_aux: MetricsAuxInfo = Default::default();
-    proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await
+    proxy::proxy::passthrough::proxy_pass(ctx, tls_stream, client, metrics_aux).await
 }
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -11,13 +11,16 @@ use smol_str::SmolStr;
 use tokio::time::Instant;
 use tracing::{debug, info};

-use crate::{auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret};
+use crate::{
+    auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId,
+    RoleName,
+};

 use super::{Cache, Cached};

 pub trait ProjectInfoCache {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr);
-    fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr);
+    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId);
+    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName);
    fn enable_ttl(&self);
    fn disable_ttl(&self);
 }
@@ -44,7 +47,7 @@ impl<T> From<T> for Entry<T> {

 #[derive(Default)]
 struct EndpointInfo {
-    secret: std::collections::HashMap<SmolStr, Entry<Option<AuthSecret>>>,
+    secret: std::collections::HashMap<RoleName, Entry<Option<AuthSecret>>>,
    allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
 }

@@ -57,7 +60,7 @@ impl EndpointInfo {
    }
    pub fn get_role_secret(
        &self,
-        role_name: &SmolStr,
+        role_name: &RoleName,
        valid_since: Instant,
        ignore_cache_since: Option<Instant>,
    ) -> Option<(Option<AuthSecret>, bool)> {
@@ -90,7 +93,7 @@ impl EndpointInfo {
    pub fn invalidate_allowed_ips(&mut self) {
        self.allowed_ips = None;
    }
-    pub fn invalidate_role_secret(&mut self, role_name: &SmolStr) {
+    pub fn invalidate_role_secret(&mut self, role_name: &RoleName) {
        self.secret.remove(role_name);
    }
 }
@@ -103,9 +106,9 @@ impl EndpointInfo {
 /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
 /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
 pub struct ProjectInfoCacheImpl {
-    cache: DashMap<SmolStr, EndpointInfo>,
+    cache: DashMap<EndpointId, EndpointInfo>,

-    project2ep: DashMap<SmolStr, HashSet<SmolStr>>,
+    project2ep: DashMap<ProjectId, HashSet<EndpointId>>,
    config: ProjectInfoCacheOptions,

    start_time: Instant,
@@ -113,7 +116,7 @@ pub struct ProjectInfoCacheImpl {
 }

 impl ProjectInfoCache for ProjectInfoCacheImpl {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr) {
+    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) {
        info!("invalidating allowed ips for project `{}`", project_id);
        let endpoints = self
            .project2ep
@@ -126,7 +129,7 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
            }
        }
    }
-    fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr) {
+    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) {
        info!(
            "invalidating role secret for project_id `{}` and role_name `{}`",
            project_id, role_name
@@ -167,8 +170,8 @@ impl ProjectInfoCacheImpl {

    pub fn get_role_secret(
        &self,
-        endpoint_id: &SmolStr,
-        role_name: &SmolStr,
+        endpoint_id: &EndpointId,
+        role_name: &RoleName,
    ) -> Option<Cached<&Self, Option<AuthSecret>>> {
        let (valid_since, ignore_cache_since) = self.get_cache_times();
        let endpoint_info = self.cache.get(endpoint_id)?;
@@ -188,7 +191,7 @@ impl ProjectInfoCacheImpl {
    }
    pub fn get_allowed_ips(
        &self,
-        endpoint_id: &SmolStr,
+        endpoint_id: &EndpointId,
    ) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
        let (valid_since, ignore_cache_since) = self.get_cache_times();
        let endpoint_info = self.cache.get(endpoint_id)?;
@@ -205,9 +208,9 @@ impl ProjectInfoCacheImpl {
    }
    pub fn insert_role_secret(
        &self,
-        project_id: &SmolStr,
-        endpoint_id: &SmolStr,
-        role_name: &SmolStr,
+        project_id: &ProjectId,
+        endpoint_id: &EndpointId,
+        role_name: &RoleName,
        secret: Option<AuthSecret>,
    ) {
        if self.cache.len() >= self.config.size {
@@ -222,8 +225,8 @@ impl ProjectInfoCacheImpl {
    }
    pub fn insert_allowed_ips(
        &self,
-        project_id: &SmolStr,
-        endpoint_id: &SmolStr,
+        project_id: &ProjectId,
+        endpoint_id: &EndpointId,
        allowed_ips: Arc<Vec<IpPattern>>,
    ) {
        if self.cache.len() >= self.config.size {
@@ -236,7 +239,7 @@ impl ProjectInfoCacheImpl {
            .or_default()
            .allowed_ips = Some(allowed_ips.into());
    }
-    fn inser_project2endpoint(&self, project_id: &SmolStr, endpoint_id: &SmolStr) {
+    fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) {
        if let Some(mut endpoints) = self.project2ep.get_mut(project_id) {
            endpoints.insert(endpoint_id.clone());
        } else {
@@ -297,18 +300,18 @@ impl ProjectInfoCacheImpl {
 /// This is used to invalidate cache entries.
 pub struct CachedLookupInfo {
    /// Search by this key.
-    endpoint_id: SmolStr,
+    endpoint_id: EndpointId,
    lookup_type: LookupType,
 }

 impl CachedLookupInfo {
-    pub(self) fn new_role_secret(endpoint_id: SmolStr, role_name: SmolStr) -> Self {
+    pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self {
        Self {
            endpoint_id,
            lookup_type: LookupType::RoleSecret(role_name),
        }
    }
-    pub(self) fn new_allowed_ips(endpoint_id: SmolStr) -> Self {
+    pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self {
        Self {
            endpoint_id,
            lookup_type: LookupType::AllowedIps,
@@ -317,7 +320,7 @@ impl CachedLookupInfo {
 }

 enum LookupType {
-    RoleSecret(SmolStr),
+    RoleSecret(RoleName),
    AllowedIps,
 }

@@ -348,7 +351,6 @@ impl Cache for ProjectInfoCacheImpl {
 mod tests {
    use super::*;
    use crate::{console::AuthSecret, scram::ServerSecret};
-    use smol_str::SmolStr;
    use std::{sync::Arc, time::Duration};

    #[tokio::test]
@@ -362,8 +364,8 @@ mod tests {
        });
        let project_id = "project".into();
        let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
            user1.as_str(),
            [1; 32],
@@ -385,7 +387,7 @@ mod tests {
        assert_eq!(cached.value, secret2);

        // Shouldn't add more than 2 roles.
-        let user3: SmolStr = "user3".into();
+        let user3: RoleName = "user3".into();
        let secret3 = Some(AuthSecret::Scram(ServerSecret::mock(
            user3.as_str(),
            [3; 32],
@@ -420,8 +422,8 @@ mod tests {

        let project_id = "project".into();
        let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
            user1.as_str(),
            [1; 32],
@@ -475,8 +477,8 @@ mod tests {

        let project_id = "project".into();
        let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
            user1.as_str(),
            [1; 32],
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,7 +1,7 @@
-use anyhow::{bail, Context};
+use anyhow::Context;
 use dashmap::DashMap;
 use pq_proto::CancelKeyData;
-use std::net::SocketAddr;
+use std::{net::SocketAddr, sync::Arc};
 use tokio::net::TcpStream;
 use tokio_postgres::{CancelToken, NoTls};
 use tracing::info;
@@ -25,39 +25,31 @@ impl CancelMap {
    }

    /// Run async action within an ephemeral session identified by [`CancelKeyData`].
-    pub async fn with_session<'a, F, R, V>(&'a self, f: F) -> anyhow::Result<V>
-    where
-        F: FnOnce(Session<'a>) -> R,
-        R: std::future::Future<Output = anyhow::Result<V>>,
-    {
+    pub fn get_session(self: Arc<Self>) -> Session {
        // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
        // expose it and we don't want to do another roundtrip to query
        // for it. The client will be able to notice that this is not the
        // actual backend_pid, but backend_pid is not used for anything
        // so it doesn't matter.
-        let key = rand::random();
+        let key = loop {
+            let key = rand::random();

-        // Random key collisions are unlikely to happen here, but they're still possible,
-        // which is why we have to take care not to rewrite an existing key.
-        match self.0.entry(key) {
-            dashmap::mapref::entry::Entry::Occupied(_) => {
-                bail!("query cancellation key already exists: {key}")
+            // Random key collisions are unlikely to happen here, but they're still possible,
+            // which is why we have to take care not to rewrite an existing key.
+            match self.0.entry(key) {
+                dashmap::mapref::entry::Entry::Occupied(_) => continue,
+                dashmap::mapref::entry::Entry::Vacant(e) => {
+                    e.insert(None);
+                }
            }
-            dashmap::mapref::entry::Entry::Vacant(e) => {
-                e.insert(None);
-            }
-        }
-
-        // This will guarantee that the session gets dropped
-        // as soon as the future is finished.
-        scopeguard::defer! {
-            self.0.remove(&key);
-            info!("dropped query cancellation key {key}");
-        }
+            break key;
+        };

        info!("registered new query cancellation key {key}");
-        let session = Session::new(key, self);
-        f(session).await
+        Session {
+            key,
+            cancel_map: self,
+        }
    }

    #[cfg(test)]
@@ -98,23 +90,17 @@ impl CancelClosure {
 }

 /// Helper for registering query cancellation tokens.
-pub struct Session<'a> {
+pub struct Session {
    /// The user-facing key identifying this session.
    key: CancelKeyData,
    /// The [`CancelMap`] this session belongs to.
-    cancel_map: &'a CancelMap,
+    cancel_map: Arc<CancelMap>,
 }

-impl<'a> Session<'a> {
-    fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self {
-        Self { key, cancel_map }
-    }
-}
-
-impl Session<'_> {
+impl Session {
    /// Store the cancel token for the given session.
    /// This enables query cancellation in `crate::proxy::prepare_client_connection`.
-    pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
+    pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
        info!("enabling query cancellation for this session");
        self.cancel_map.0.insert(self.key, Some(cancel_closure));

@@ -122,37 +108,26 @@ impl Session<'_> {
    }
 }

+impl Drop for Session {
+    fn drop(&mut self) {
+        self.cancel_map.0.remove(&self.key);
+        info!("dropped query cancellation key {}", &self.key);
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
-    use once_cell::sync::Lazy;

    #[tokio::test]
    async fn check_session_drop() -> anyhow::Result<()> {
-        static CANCEL_MAP: Lazy<CancelMap> = Lazy::new(Default::default);
-
-        let (tx, rx) = tokio::sync::oneshot::channel();
-        let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move {
-            assert!(CANCEL_MAP.contains(&session));
-
-            tx.send(()).expect("failed to send");
-            futures::future::pending::<()>().await; // sleep forever
-
-            Ok(())
-        }));
-
-        // Wait until the task has been spawned.
-        rx.await.context("failed to hear from the task")?;
-
-        // Drop the session's entry by cancelling the task.
-        task.abort();
-        let error = task.await.expect_err("task should have failed");
-        if !error.is_cancelled() {
-            anyhow::bail!(error);
-        }
+        let cancel_map: Arc<CancelMap> = Default::default();

+        let session = cancel_map.clone().get_session();
+        assert!(cancel_map.contains(&session));
+        drop(session);
        // Check that the session has been dropped.
-        assert!(CANCEL_MAP.is_empty());
+        assert!(cancel_map.is_empty());

        Ok(())
    }
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,9 +1,10 @@
 use serde::Deserialize;
-use smol_str::SmolStr;
 use std::fmt;

 use crate::auth::IpPattern;

+use crate::{BranchId, EndpointId, ProjectId};
+
 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
 #[derive(Debug, Deserialize)]
@@ -17,7 +18,7 @@ pub struct ConsoleError {
 pub struct GetRoleSecret {
    pub role_secret: Box<str>,
    pub allowed_ips: Option<Vec<IpPattern>>,
-    pub project_id: Option<Box<str>>,
+    pub project_id: Option<ProjectId>,
 }

 // Manually implement debug to omit sensitive info.
@@ -94,9 +95,9 @@ impl fmt::Debug for DatabaseInfo {
 /// Also known as `ProxyMetricsAuxInfo` in the console.
 #[derive(Debug, Deserialize, Clone, Default)]
 pub struct MetricsAuxInfo {
-    pub endpoint_id: SmolStr,
-    pub project_id: SmolStr,
-    pub branch_id: SmolStr,
+    pub endpoint_id: EndpointId,
+    pub project_id: ProjectId,
+    pub branch_id: BranchId,
 }

 impl MetricsAuxInfo {
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -9,11 +9,10 @@ use crate::{
    compute,
    config::{CacheOptions, ProjectInfoCacheOptions},
    context::RequestMonitoring,
-    scram,
+    scram, EndpointCacheKey, ProjectId,
 };
 use async_trait::async_trait;
 use dashmap::DashMap;
-use smol_str::SmolStr;
 use std::{sync::Arc, time::Duration};
 use tokio::sync::{OwnedSemaphorePermit, Semaphore};
 use tokio::time::Instant;
@@ -214,7 +213,7 @@ pub struct AuthInfo {
    /// List of IP addresses allowed for the autorization.
    pub allowed_ips: Vec<IpPattern>,
    /// Project ID. This is used for cache invalidation.
-    pub project_id: Option<SmolStr>,
+    pub project_id: Option<ProjectId>,
 }

 /// Info for establishing a connection to a compute node.
@@ -233,7 +232,7 @@ pub struct NodeInfo {
    pub allow_self_signed_compute: bool,
 }

-pub type NodeInfoCache = TimedLru<SmolStr, NodeInfo>;
+pub type NodeInfoCache = TimedLru<EndpointCacheKey, NodeInfo>;
 pub type CachedNodeInfo = Cached<&'static NodeInfoCache>;
 pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
 pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
@@ -345,7 +344,7 @@ impl ApiCaches {
 /// Various caches for [`console`](super).
 pub struct ApiLocks {
    name: &'static str,
-    node_locks: DashMap<SmolStr, Arc<Semaphore>>,
+    node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>,
    permits: usize,
    timeout: Duration,
    registered: prometheus::IntCounter,
@@ -413,7 +412,7 @@ impl ApiLocks {

    pub async fn get_wake_compute_permit(
        &self,
-        key: &SmolStr,
+        key: &EndpointCacheKey,
    ) -> Result<WakeComputePermit, errors::WakeComputeError> {
        if self.permits == 0 {
            return Ok(WakeComputePermit { permit: None });
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -14,7 +14,6 @@ use crate::{
 };
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use smol_str::SmolStr;
 use std::sync::Arc;
 use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
@@ -98,7 +97,7 @@ impl Api {
            Ok(AuthInfo {
                secret,
                allowed_ips,
-                project_id: body.project_id.map(SmolStr::from),
+                project_id: body.project_id,
            })
        }
        .map_err(crate::error::log_error)
@@ -239,7 +238,7 @@ impl super::Api for Api {
        // for some time (highly depends on the console's scale-to-zero policy);
        // The connection info remains the same during that period of time,
        // which means that we might cache it to reduce the load and latency.
-        if let Some(cached) = self.caches.node_info.get(&*key) {
+        if let Some(cached) = self.caches.node_info.get(&key) {
            info!(key = &*key, "found cached compute node info");
            return Ok(cached);
        }
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -7,7 +7,10 @@ use std::net::IpAddr;
 use tokio::sync::mpsc;
 use uuid::Uuid;

-use crate::{console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer};
+use crate::{
+    console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId,
+    EndpointId, ProjectId, RoleName,
+};

 pub mod parquet;

@@ -26,10 +29,10 @@ pub struct RequestMonitoring {
    region: &'static str,

    // filled in as they are discovered
-    project: Option<SmolStr>,
-    branch: Option<SmolStr>,
-    endpoint_id: Option<SmolStr>,
-    user: Option<SmolStr>,
+    project: Option<ProjectId>,
+    branch: Option<BranchId>,
+    endpoint_id: Option<EndpointId>,
+    user: Option<RoleName>,
    application: Option<SmolStr>,
    error_kind: Option<ErrorKind>,
    success: bool,
@@ -86,15 +89,18 @@ impl RequestMonitoring {
        self.project = Some(x.project_id);
    }

-    pub fn set_endpoint_id(&mut self, endpoint_id: Option<SmolStr>) {
-        self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
+    pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
+        crate::metrics::CONNECTING_ENDPOINTS
+            .with_label_values(&[self.protocol])
+            .measure(&endpoint_id);
+        self.endpoint_id = Some(endpoint_id);
    }

    pub fn set_application(&mut self, app: Option<SmolStr>) {
        self.application = app.or_else(|| self.application.clone());
    }

-    pub fn set_user(&mut self, user: SmolStr) {
+    pub fn set_user(&mut self, user: RoleName) {
        self.user = Some(user);
    }

--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -62,3 +62,79 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallib
 pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
    r.context("join error").and_then(|x| x)
 }
+
+macro_rules! smol_str_wrapper {
+    ($name:ident) => {
+        #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
+        pub struct $name(smol_str::SmolStr);
+
+        impl $name {
+            pub fn as_str(&self) -> &str {
+                self.0.as_str()
+            }
+        }
+
+        impl std::fmt::Display for $name {
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                self.0.fmt(f)
+            }
+        }
+
+        impl<T> std::cmp::PartialEq<T> for $name
+        where
+            smol_str::SmolStr: std::cmp::PartialEq<T>,
+        {
+            fn eq(&self, other: &T) -> bool {
+                self.0.eq(other)
+            }
+        }
+
+        impl<T> From<T> for $name
+        where
+            smol_str::SmolStr: From<T>,
+        {
+            fn from(x: T) -> Self {
+                Self(x.into())
+            }
+        }
+
+        impl AsRef<str> for $name {
+            fn as_ref(&self) -> &str {
+                self.0.as_ref()
+            }
+        }
+
+        impl std::ops::Deref for $name {
+            type Target = str;
+            fn deref(&self) -> &str {
+                &*self.0
+            }
+        }
+
+        impl<'de> serde::de::Deserialize<'de> for $name {
+            fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
+                <smol_str::SmolStr as serde::de::Deserialize<'de>>::deserialize(d).map(Self)
+            }
+        }
+
+        impl serde::Serialize for $name {
+            fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
+                self.0.serialize(s)
+            }
+        }
+    };
+}
+
+// 90% of role name strings are 20 characters or less.
+smol_str_wrapper!(RoleName);
+// 50% of endpoint strings are 23 characters or less.
+smol_str_wrapper!(EndpointId);
+// 50% of branch strings are 23 characters or less.
+smol_str_wrapper!(BranchId);
+// 90% of project strings are 23 characters or less.
+smol_str_wrapper!(ProjectId);
+
+// will usually equal endpoint ID
+smol_str_wrapper!(EndpointCacheKey);
+
+smol_str_wrapper!(DbName);
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,10 +1,7 @@
 use ::metrics::{
-    exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
-    IntCounterPairVec, IntCounterVec,
-};
-use prometheus::{
-    register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
-    IntGaugeVec,
+    exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
+    register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge_vec, Histogram,
+    HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGaugeVec,
 };

 use once_cell::sync::Lazy;
@@ -236,3 +233,13 @@ pub const fn bool_to_str(x: bool) -> &'static str {
        "false"
    }
 }
+
+pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
+    register_hll_vec!(
+        32,
+        "proxy_connecting_endpoints",
+        "HLL approximate cardinality of endpoints that are connecting",
+        &["protocol"],
+    )
+    .unwrap()
+});
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -2,36 +2,34 @@
 mod tests;

 pub mod connect_compute;
+pub mod handshake;
+pub mod passthrough;
 pub mod retry;

 use crate::{
    auth,
    cancellation::{self, CancelMap},
    compute,
-    config::{AuthenticationConfig, ProxyConfig, TlsConfig},
-    console::messages::MetricsAuxInfo,
+    config::{ProxyConfig, TlsConfig},
    context::RequestMonitoring,
-    metrics::{
-        NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
-        NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
-    },
+    metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE},
    protocol2::WithClientIp,
+    proxy::{handshake::handshake, passthrough::proxy_pass},
    rate_limiter::EndpointRateLimiter,
    stream::{PqStream, Stream},
-    usage_metrics::{Ids, USAGE_METRICS},
+    EndpointCacheKey,
 };
 use anyhow::{bail, Context};
 use futures::TryFutureExt;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
-use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
+use pq_proto::{BeMessage as Be, StartupMessageParams};
 use regex::Regex;
-use smol_str::SmolStr;
+use smol_str::{format_smolstr, SmolStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, Instrument};
-use utils::measured_stream::MeasuredStream;

 use self::connect_compute::{connect_to_compute, TcpMechanism};

@@ -79,6 +77,13 @@ pub async fn task_main(
        let cancel_map = Arc::clone(&cancel_map);
        let endpoint_rate_limiter = endpoint_rate_limiter.clone();

+        let session_span = info_span!(
+            "handle_client",
+            ?session_id,
+            peer_addr = tracing::field::Empty,
+            ep = tracing::field::Empty,
+        );
+
        connections.spawn(
            async move {
                info!("accepted postgres client connection");
@@ -102,22 +107,18 @@ pub async fn task_main(
                handle_client(
                    config,
                    &mut ctx,
-                    &cancel_map,
+                    cancel_map,
                    socket,
                    ClientMode::Tcp,
                    endpoint_rate_limiter,
                )
                .await
            }
-            .instrument(info_span!(
-                "handle_client",
-                ?session_id,
-                peer_addr = tracing::field::Empty
-            ))
            .unwrap_or_else(move |e| {
                // Acknowledge that the task has finished with an error.
-                error!(?session_id, "per-client task finished with an error: {e:#}");
-            }),
+                error!("per-client task finished with an error: {e:#}");
+            })
+            .instrument(session_span),
        );
    }

@@ -170,7 +171,7 @@ impl ClientMode {
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
    ctx: &mut RequestMonitoring,
-    cancel_map: &CancelMap,
+    cancel_map: Arc<CancelMap>,
    stream: S,
    mode: ClientMode,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -191,138 +192,88 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let tls = config.tls_config.as_ref();

    let pause = ctx.latency_timer.pause();
-    let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map);
+    let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map);
    let (mut stream, params) = match do_handshake.await? {
        Some(x) => x,
        None => return Ok(()), // it's a cancellation request
    };
    drop(pause);

+    let hostname = mode.hostname(stream.get_ref());
+
+    let common_names = tls.map(|tls| &tls.common_names);
+
    // Extract credentials which we're going to use for auth.
-    let user_info = {
-        let hostname = mode.hostname(stream.get_ref());
+    let result = config
+        .auth_backend
+        .as_ref()
+        .map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names))
+        .transpose();

-        let common_names = tls.map(|tls| &tls.common_names);
-        let result = config
-            .auth_backend
-            .as_ref()
-            .map(|_| {
-                auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names)
-            })
-            .transpose();
+    let user_info = match result {
+        Ok(user_info) => user_info,
+        Err(e) => stream.throw_error(e).await?,
+    };

-        match result {
-            Ok(user_info) => user_info,
-            Err(e) => stream.throw_error(e).await?,
+    // check rate limit
+    if let Some(ep) = user_info.get_endpoint() {
+        if !endpoint_rate_limiter.check(ep) {
+            return stream
+                .throw_error(auth::AuthError::too_many_connections())
+                .await;
+        }
+    }
+
+    let user = user_info.get_user().to_owned();
+    let (mut node_info, user_info) = match user_info
+        .authenticate(
+            ctx,
+            &mut stream,
+            mode.allow_cleartext(),
+            &config.authentication_config,
+        )
+        .await
+    {
+        Ok(auth_result) => auth_result,
+        Err(e) => {
+            let db = params.get("database");
+            let app = params.get("application_name");
+            let params_span = tracing::info_span!("", ?user, ?db, ?app);
+
+            return stream.throw_error(e).instrument(params_span).await;
        }
    };

-    ctx.set_endpoint_id(user_info.get_endpoint());
+    node_info.allow_self_signed_compute = mode.allow_self_signed_compute(config);

-    let client = Client::new(
-        stream,
-        user_info,
-        &params,
-        mode.allow_self_signed_compute(config),
-        endpoint_rate_limiter,
-    );
-    cancel_map
-        .with_session(|session| {
-            client.connect_to_db(ctx, session, mode, &config.authentication_config)
-        })
-        .await
-}
+    let aux = node_info.aux.clone();
+    let mut node = connect_to_compute(
+        ctx,
+        &TcpMechanism { params: &params },
+        node_info,
+        &user_info,
+    )
+    .or_else(|e| stream.throw_error(e))
+    .await?;

-/// Establish a (most probably, secure) connection with the client.
-/// For better testing experience, `stream` can be any object satisfying the traits.
-/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
-/// we also take an extra care of propagating only the select handshake errors to client.
-#[tracing::instrument(skip_all)]
-async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
-    stream: S,
-    mut tls: Option<&TlsConfig>,
-    cancel_map: &CancelMap,
-) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
-    // Client may try upgrading to each protocol only once
-    let (mut tried_ssl, mut tried_gss) = (false, false);
+    let session = cancel_map.get_session();
+    prepare_client_connection(&node, &session, &mut stream).await?;

-    let mut stream = PqStream::new(Stream::from_raw(stream));
-    loop {
-        let msg = stream.read_startup_packet().await?;
-        info!("received {msg:?}");
+    // Before proxy passing, forward to compute whatever data is left in the
+    // PqStream input buffer. Normally there is none, but our serverless npm
+    // driver in pipeline mode sends startup, password and first query
+    // immediately after opening the connection.
+    let (stream, read_buf) = stream.into_inner();
+    node.stream.write_all(&read_buf).await?;

-        use FeStartupPacket::*;
-        match msg {
-            SslRequest => match stream.get_ref() {
-                Stream::Raw { .. } if !tried_ssl => {
-                    tried_ssl = true;
-
-                    // We can't perform TLS handshake without a config
-                    let enc = tls.is_some();
-                    stream.write_message(&Be::EncryptionResponse(enc)).await?;
-                    if let Some(tls) = tls.take() {
-                        // Upgrade raw stream into a secure TLS-backed stream.
-                        // NOTE: We've consumed `tls`; this fact will be used later.
-
-                        let (raw, read_buf) = stream.into_inner();
-                        // TODO: Normally, client doesn't send any data before
-                        // server says TLS handshake is ok and read_buf is empy.
-                        // However, you could imagine pipelining of postgres
-                        // SSLRequest + TLS ClientHello in one hunk similar to
-                        // pipelining in our node js driver. We should probably
-                        // support that by chaining read_buf with the stream.
-                        if !read_buf.is_empty() {
-                            bail!("data is sent before server replied with EncryptionResponse");
-                        }
-                        let tls_stream = raw.upgrade(tls.to_server_config()).await?;
-
-                        let (_, tls_server_end_point) = tls
-                            .cert_resolver
-                            .resolve(tls_stream.get_ref().1.server_name())
-                            .context("missing certificate")?;
-
-                        stream = PqStream::new(Stream::Tls {
-                            tls: Box::new(tls_stream),
-                            tls_server_end_point,
-                        });
-                    }
-                }
-                _ => bail!(ERR_PROTO_VIOLATION),
-            },
-            GssEncRequest => match stream.get_ref() {
-                Stream::Raw { .. } if !tried_gss => {
-                    tried_gss = true;
-
-                    // Currently, we don't support GSSAPI
-                    stream.write_message(&Be::EncryptionResponse(false)).await?;
-                }
-                _ => bail!(ERR_PROTO_VIOLATION),
-            },
-            StartupMessage { params, .. } => {
-                // Check that the config has been consumed during upgrade
-                // OR we didn't provide it at all (for dev purposes).
-                if tls.is_some() {
-                    stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
-                }
-
-                info!(session_type = "normal", "successful handshake");
-                break Ok(Some((stream, params)));
-            }
-            CancelRequest(cancel_key_data) => {
-                cancel_map.cancel_session(cancel_key_data).await?;
-
-                info!(session_type = "cancellation", "successful handshake");
-                break Ok(None);
-            }
-        }
-    }
+    proxy_pass(ctx, stream, node.stream, aux).await
 }

 /// Finish client connection initialization: confirm auth success, send params, etc.
 #[tracing::instrument(skip_all)]
 async fn prepare_client_connection(
    node: &compute::PostgresConnection,
-    session: cancellation::Session<'_>,
+    session: &cancellation::Session,
    stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> anyhow::Result<()> {
    // Register compute's query cancellation token and produce a new, unique one.
@@ -348,151 +299,6 @@ async fn prepare_client_connection(
    Ok(())
 }

-/// Forward bytes in both directions (client <-> compute).
-#[tracing::instrument(skip_all)]
-pub async fn proxy_pass(
-    ctx: &mut RequestMonitoring,
-    client: impl AsyncRead + AsyncWrite + Unpin,
-    compute: impl AsyncRead + AsyncWrite + Unpin,
-    aux: MetricsAuxInfo,
-) -> anyhow::Result<()> {
-    ctx.set_success();
-    ctx.log();
-
-    let usage = USAGE_METRICS.register(Ids {
-        endpoint_id: aux.endpoint_id.clone(),
-        branch_id: aux.branch_id.clone(),
-    });
-
-    let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
-    let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
-    let mut client = MeasuredStream::new(
-        client,
-        |_| {},
-        |cnt| {
-            // Number of bytes we sent to the client (outbound).
-            m_sent.inc_by(cnt as u64);
-            m_sent2.inc_by(cnt as u64);
-            usage.record_egress(cnt as u64);
-        },
-    );
-
-    let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
-    let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
-    let mut compute = MeasuredStream::new(
-        compute,
-        |_| {},
-        |cnt| {
-            // Number of bytes the client sent to the compute node (inbound).
-            m_recv.inc_by(cnt as u64);
-            m_recv2.inc_by(cnt as u64);
-        },
-    );
-
-    // Starting from here we only proxy the client's traffic.
-    info!("performing the proxy pass...");
-    let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
-
-    Ok(())
-}
-
-/// Thin connection context.
-struct Client<'a, S> {
-    /// The underlying libpq protocol stream.
-    stream: PqStream<Stream<S>>,
-    /// Client credentials that we care about.
-    user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
-    /// KV-dictionary with PostgreSQL connection params.
-    params: &'a StartupMessageParams,
-    /// Allow self-signed certificates (for testing).
-    allow_self_signed_compute: bool,
-    /// Rate limiter for endpoints
-    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-}
-
-impl<'a, S> Client<'a, S> {
-    /// Construct a new connection context.
-    fn new(
-        stream: PqStream<Stream<S>>,
-        user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
-        params: &'a StartupMessageParams,
-        allow_self_signed_compute: bool,
-        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-    ) -> Self {
-        Self {
-            stream,
-            user_info,
-            params,
-            allow_self_signed_compute,
-            endpoint_rate_limiter,
-        }
-    }
-}
-
-impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
-    /// Let the client authenticate and connect to the designated compute node.
-    // Instrumentation logs endpoint name everywhere. Doesn't work for link
-    // auth; strictly speaking we don't know endpoint name in its case.
-    #[tracing::instrument(name = "", fields(ep = %self.user_info.get_endpoint().unwrap_or_default()), skip_all)]
-    async fn connect_to_db(
-        self,
-        ctx: &mut RequestMonitoring,
-        session: cancellation::Session<'_>,
-        mode: ClientMode,
-        config: &'static AuthenticationConfig,
-    ) -> anyhow::Result<()> {
-        let Self {
-            mut stream,
-            user_info,
-            params,
-            allow_self_signed_compute,
-            endpoint_rate_limiter,
-        } = self;
-
-        // check rate limit
-        if let Some(ep) = user_info.get_endpoint() {
-            if !endpoint_rate_limiter.check(ep) {
-                return stream
-                    .throw_error(auth::AuthError::too_many_connections())
-                    .await;
-            }
-        }
-
-        let user = user_info.get_user().to_owned();
-        let auth_result = match user_info
-            .authenticate(ctx, &mut stream, mode.allow_cleartext(), config)
-            .await
-        {
-            Ok(auth_result) => auth_result,
-            Err(e) => {
-                let db = params.get("database");
-                let app = params.get("application_name");
-                let params_span = tracing::info_span!("", ?user, ?db, ?app);
-
-                return stream.throw_error(e).instrument(params_span).await;
-            }
-        };
-
-        let (mut node_info, user_info) = auth_result;
-
-        node_info.allow_self_signed_compute = allow_self_signed_compute;
-
-        let aux = node_info.aux.clone();
-        let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &user_info)
-            .or_else(|e| stream.throw_error(e))
-            .await?;
-
-        prepare_client_connection(&node, session, &mut stream).await?;
-        // Before proxy passing, forward to compute whatever data is left in the
-        // PqStream input buffer. Normally there is none, but our serverless npm
-        // driver in pipeline mode sends startup, password and first query
-        // immediately after opening the connection.
-        let (stream, read_buf) = stream.into_inner();
-        node.stream.write_all(&read_buf).await?;
-        proxy_pass(ctx, stream, node.stream, aux).await
-    }
-}
-
 #[derive(Debug, Clone, PartialEq, Eq, Default)]
 pub struct NeonOptions(Vec<(SmolStr, SmolStr)>);

@@ -516,20 +322,21 @@ impl NeonOptions {
        Self(options)
    }

-    pub fn get_cache_key(&self, prefix: &str) -> SmolStr {
+    pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
        // prefix + format!(" {k}:{v}")
        // kinda jank because SmolStr is immutable
        std::iter::once(prefix)
            .chain(self.0.iter().flat_map(|(k, v)| [" ", &**k, ":", &**v]))
-            .collect()
+            .collect::<SmolStr>()
+            .into()
    }

    /// <https://swagger.io/docs/specification/serialization/> DeepObject format
    /// `paramName[prop1]=value1&paramName[prop2]=value2&...`
-    pub fn to_deep_object(&self) -> Vec<(String, SmolStr)> {
+    pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> {
        self.0
            .iter()
-            .map(|(k, v)| (format!("options[{}]", k), v.clone()))
+            .map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone()))
            .collect()
    }
 }
--- a/proxy/src/proxy/handshake.rs
+++ b/proxy/src/proxy/handshake.rs
@@ -0,0 +1,96 @@
+use anyhow::{bail, Context};
+use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::info;
+
+use crate::{
+    cancellation::CancelMap,
+    config::TlsConfig,
+    proxy::{ERR_INSECURE_CONNECTION, ERR_PROTO_VIOLATION},
+    stream::{PqStream, Stream},
+};
+
+/// Establish a (most probably, secure) connection with the client.
+/// For better testing experience, `stream` can be any object satisfying the traits.
+/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
+/// we also take an extra care of propagating only the select handshake errors to client.
+#[tracing::instrument(skip_all)]
+pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
+    stream: S,
+    mut tls: Option<&TlsConfig>,
+    cancel_map: &CancelMap,
+) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
+    // Client may try upgrading to each protocol only once
+    let (mut tried_ssl, mut tried_gss) = (false, false);
+
+    let mut stream = PqStream::new(Stream::from_raw(stream));
+    loop {
+        let msg = stream.read_startup_packet().await?;
+        info!("received {msg:?}");
+
+        use FeStartupPacket::*;
+        match msg {
+            SslRequest => match stream.get_ref() {
+                Stream::Raw { .. } if !tried_ssl => {
+                    tried_ssl = true;
+
+                    // We can't perform TLS handshake without a config
+                    let enc = tls.is_some();
+                    stream.write_message(&Be::EncryptionResponse(enc)).await?;
+                    if let Some(tls) = tls.take() {
+                        // Upgrade raw stream into a secure TLS-backed stream.
+                        // NOTE: We've consumed `tls`; this fact will be used later.
+
+                        let (raw, read_buf) = stream.into_inner();
+                        // TODO: Normally, client doesn't send any data before
+                        // server says TLS handshake is ok and read_buf is empy.
+                        // However, you could imagine pipelining of postgres
+                        // SSLRequest + TLS ClientHello in one hunk similar to
+                        // pipelining in our node js driver. We should probably
+                        // support that by chaining read_buf with the stream.
+                        if !read_buf.is_empty() {
+                            bail!("data is sent before server replied with EncryptionResponse");
+                        }
+                        let tls_stream = raw.upgrade(tls.to_server_config()).await?;
+
+                        let (_, tls_server_end_point) = tls
+                            .cert_resolver
+                            .resolve(tls_stream.get_ref().1.server_name())
+                            .context("missing certificate")?;
+
+                        stream = PqStream::new(Stream::Tls {
+                            tls: Box::new(tls_stream),
+                            tls_server_end_point,
+                        });
+                    }
+                }
+                _ => bail!(ERR_PROTO_VIOLATION),
+            },
+            GssEncRequest => match stream.get_ref() {
+                Stream::Raw { .. } if !tried_gss => {
+                    tried_gss = true;
+
+                    // Currently, we don't support GSSAPI
+                    stream.write_message(&Be::EncryptionResponse(false)).await?;
+                }
+                _ => bail!(ERR_PROTO_VIOLATION),
+            },
+            StartupMessage { params, .. } => {
+                // Check that the config has been consumed during upgrade
+                // OR we didn't provide it at all (for dev purposes).
+                if tls.is_some() {
+                    stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
+                }
+
+                info!(session_type = "normal", "successful handshake");
+                break Ok(Some((stream, params)));
+            }
+            CancelRequest(cancel_key_data) => {
+                cancel_map.cancel_session(cancel_key_data).await?;
+
+                info!(session_type = "cancellation", "successful handshake");
+                break Ok(None);
+            }
+        }
+    }
+}
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -0,0 +1,57 @@
+use crate::{
+    console::messages::MetricsAuxInfo,
+    context::RequestMonitoring,
+    metrics::{NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER},
+    usage_metrics::{Ids, USAGE_METRICS},
+};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::info;
+use utils::measured_stream::MeasuredStream;
+
+/// Forward bytes in both directions (client <-> compute).
+#[tracing::instrument(skip_all)]
+pub async fn proxy_pass(
+    ctx: &mut RequestMonitoring,
+    client: impl AsyncRead + AsyncWrite + Unpin,
+    compute: impl AsyncRead + AsyncWrite + Unpin,
+    aux: MetricsAuxInfo,
+) -> anyhow::Result<()> {
+    ctx.set_success();
+    ctx.log();
+
+    let usage = USAGE_METRICS.register(Ids {
+        endpoint_id: aux.endpoint_id.clone(),
+        branch_id: aux.branch_id.clone(),
+    });
+
+    let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
+    let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
+    let mut client = MeasuredStream::new(
+        client,
+        |_| {},
+        |cnt| {
+            // Number of bytes we sent to the client (outbound).
+            m_sent.inc_by(cnt as u64);
+            m_sent2.inc_by(cnt as u64);
+            usage.record_egress(cnt as u64);
+        },
+    );
+
+    let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
+    let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
+    let mut compute = MeasuredStream::new(
+        compute,
+        |_| {},
+        |cnt| {
+            // Number of bytes the client sent to the compute node (inbound).
+            m_recv.inc_by(cnt as u64);
+            m_recv2.inc_by(cnt as u64);
+        },
+    );
+
+    // Starting from here we only proxy the client's traffic.
+    info!("performing the proxy pass...");
+    let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
+
+    Ok(())
+}
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -11,11 +11,12 @@ use anyhow::bail;
 use dashmap::DashMap;
 use itertools::Itertools;
 use rand::{rngs::StdRng, Rng, SeedableRng};
-use smol_str::SmolStr;
 use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
 use tokio::time::{timeout, Duration, Instant};
 use tracing::info;

+use crate::EndpointId;
+
 use super::{
    limit_algorithm::{LimitAlgorithm, Sample},
    RateLimiterConfig,
@@ -33,7 +34,7 @@ use super::{
 // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
 // I went with a more expensive way that yields user-friendlier error messages.
 pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
-    map: DashMap<SmolStr, Vec<RateBucket>, Hasher>,
+    map: DashMap<EndpointId, Vec<RateBucket>, Hasher>,
    info: &'static [RateBucketInfo],
    access_count: AtomicUsize,
    rand: Mutex<Rand>,
@@ -146,7 +147,7 @@ impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
    }

    /// Check that number of connections to the endpoint is below `max_rps` rps.
-    pub fn check(&self, endpoint: SmolStr) -> bool {
+    pub fn check(&self, endpoint: EndpointId) -> bool {
        // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
        // worst case memory usage is about:
        //    = 2 * 2048 * 64 * (48B + 72B)
@@ -493,11 +494,13 @@ mod tests {
    use futures::{task::noop_waker_ref, Future};
    use rand::SeedableRng;
    use rustc_hash::FxHasher;
-    use smol_str::SmolStr;
    use tokio::time;

    use super::{EndpointRateLimiter, Limiter, Outcome};
-    use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm};
+    use crate::{
+        rate_limiter::{RateBucketInfo, RateLimitAlgorithm},
+        EndpointId,
+    };

    #[tokio::test]
    async fn it_works() {
@@ -654,7 +657,7 @@ mod tests {
        RateBucketInfo::validate(&mut rates).unwrap();
        let limiter = EndpointRateLimiter::new(Vec::leak(rates));

-        let endpoint = SmolStr::from("ep-my-endpoint-1234");
+        let endpoint = EndpointId::from("ep-my-endpoint-1234");

        time::pause();

--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -3,9 +3,8 @@ use std::{convert::Infallible, sync::Arc};
 use futures::StreamExt;
 use redis::aio::PubSub;
 use serde::Deserialize;
-use smol_str::SmolStr;

-use crate::cache::project_info::ProjectInfoCache;
+use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName};

 const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
 const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
@@ -46,12 +45,12 @@ enum Notification {
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct AllowedIpsUpdate {
-    project_id: SmolStr,
+    project_id: ProjectId,
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct PasswordUpdate {
-    project_id: SmolStr,
-    role_name: SmolStr,
+    project_id: ProjectId,
+    role_name: RoleName,
 }
 fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
 where
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -41,6 +41,8 @@ use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};

+pub const SERVERLESS_DRIVER_SNI: &str = "api";
+
 pub async fn task_main(
    config: &'static ProxyConfig,
    ws_listener: TcpListener,
@@ -228,7 +230,7 @@ async fn request_handler(
                    config,
                    &mut ctx,
                    websocket,
-                    &cancel_map,
+                    cancel_map,
                    host,
                    endpoint_rate_limiter,
                )
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -31,6 +31,7 @@ use crate::{
    metrics::NUM_DB_CONNECTIONS_GAUGE,
    proxy::connect_compute::ConnectMechanism,
    usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
+    DbName, EndpointCacheKey, RoleName,
 };
 use crate::{compute, config};

@@ -42,17 +43,17 @@ pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http");
 #[derive(Debug, Clone)]
 pub struct ConnInfo {
    pub user_info: ComputeUserInfo,
-    pub dbname: SmolStr,
+    pub dbname: DbName,
    pub password: SmolStr,
 }

 impl ConnInfo {
    // hm, change to hasher to avoid cloning?
-    pub fn db_and_user(&self) -> (SmolStr, SmolStr) {
+    pub fn db_and_user(&self) -> (DbName, RoleName) {
        (self.dbname.clone(), self.user_info.user.clone())
    }

-    pub fn endpoint_cache_key(&self) -> SmolStr {
+    pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
        self.user_info.endpoint_cache_key()
    }
 }
@@ -79,14 +80,14 @@ struct ConnPoolEntry {
 // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
 pub struct EndpointConnPool {
-    pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
+    pools: HashMap<(DbName, RoleName), DbUserConnPool>,
    total_conns: usize,
    max_conns: usize,
    _guard: IntCounterPairGuard,
 }

 impl EndpointConnPool {
-    fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option<ConnPoolEntry> {
+    fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry> {
        let Self {
            pools, total_conns, ..
        } = self;
@@ -95,7 +96,7 @@ impl EndpointConnPool {
            .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
    }

-    fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool {
+    fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool {
        let Self {
            pools, total_conns, ..
        } = self;
@@ -196,7 +197,7 @@ pub struct GlobalConnPool {
    //
    // That should be a fairly conteded map, so return reference to the per-endpoint
    // pool as early as possible and release the lock.
-    global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
+    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool>>>,

    /// Number of endpoint-connection pools
    ///
@@ -440,7 +441,10 @@ impl GlobalConnPool {
        Ok(Client::new(new_client, conn_info, endpoint_pool).await)
    }

-    fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
+    fn get_or_create_endpoint_pool(
+        &self,
+        endpoint: &EndpointCacheKey,
+    ) -> Arc<RwLock<EndpointConnPool>> {
        // fast path
        if let Some(pool) = self.global_pool.get(endpoint) {
            return pool.clone();
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;

 use anyhow::bail;
+use anyhow::Context;
 use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
@@ -13,7 +14,6 @@ use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Map;
 use serde_json::Value;
-use smol_str::SmolStr;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
 use tokio_postgres::types::Kind;
@@ -36,9 +36,11 @@ use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
+use crate::RoleName;

 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
+use super::SERVERLESS_DRIVER_SNI;

 #[derive(serde::Deserialize)]
 struct QueryData {
@@ -60,7 +62,6 @@ enum Payload {

 const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
 const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
-const SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART: &str = "api";

 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -155,7 +156,7 @@ fn get_conn_info(
        .next()
        .ok_or(anyhow::anyhow!("invalid database name"))?;

-    let username = SmolStr::from(connection_url.username());
+    let username = RoleName::from(connection_url.username());
    if username.is_empty() {
        return Err(anyhow::anyhow!("missing username"));
    }
@@ -187,10 +188,8 @@ fn get_conn_info(
        }
    }

-    let endpoint = endpoint_sni(hostname, &tls.common_names)?;
-
-    let endpoint: SmolStr = endpoint.into();
-    ctx.set_endpoint_id(Some(endpoint.clone()));
+    let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?;
+    ctx.set_endpoint_id(endpoint.clone());

    let pairs = connection_url.query_pairs();

@@ -226,8 +225,7 @@ fn check_matches(sni_hostname: &str, hostname: &str) -> Result<bool, anyhow::Err
    let (_, hostname_rest) = hostname
        .split_once('.')
        .ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?;
-    Ok(sni_hostname_rest == hostname_rest
-        && sni_hostname_first == SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART)
+    Ok(sni_hostname_rest == hostname_rest && sni_hostname_first == SERVERLESS_DRIVER_SNI)
 }

 // TODO: return different http error codes
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -133,7 +133,7 @@ pub async fn serve_websocket(
    config: &'static ProxyConfig,
    ctx: &mut RequestMonitoring,
    websocket: HyperWebsocket,
-    cancel_map: &CancelMap,
+    cancel_map: Arc<CancelMap>,
    hostname: Option<String>,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -1,12 +1,11 @@
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
-use crate::{config::MetricCollectionConfig, http};
+use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use dashmap::{mapref::entry::Entry, DashMap};
 use once_cell::sync::Lazy;
 use serde::{Deserialize, Serialize};
-use smol_str::SmolStr;
 use std::{
    convert::Infallible,
    sync::{
@@ -30,8 +29,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// because we enrich the event with project_id in the control-plane endpoint.
 #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
 pub struct Ids {
-    pub endpoint_id: SmolStr,
-    pub branch_id: SmolStr,
+    pub endpoint_id: EndpointId,
+    pub branch_id: BranchId,
 }

 #[derive(Debug)]
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -28,7 +28,7 @@ use crate::safekeeper::Term;
 use crate::safekeeper::{ServerInfo, TermLsn};
 use crate::send_wal::WalSenderState;
 use crate::timeline::PeerInfo;
-use crate::{copy_timeline, debug_dump, pull_timeline};
+use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline};

 use crate::timelines_global_map::TimelineDeleteForceResult;
 use crate::GlobalTimelines;
@@ -465,6 +465,26 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
    Ok(response)
 }

+async fn patch_control_file_handler(
+    mut request: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+
+    let patch_request: patch_control_file::Request = json_request(&mut request).await?;
+    let response = patch_control_file::handle_request(tli, patch_request)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, response)
+}
+
 /// Safekeeper http router.
 pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
    let mut router = endpoint::make_router();
@@ -526,6 +546,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
            "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
            |r| request_span(r, timeline_copy_handler),
        )
+        .patch(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/control_file",
+            |r| request_span(r, patch_control_file_handler),
+        )
        // for tests
        .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
            request_span(r, record_safekeeper_info)
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -22,6 +22,7 @@ pub mod handler;
 pub mod http;
 pub mod json_ctrl;
 pub mod metrics;
+pub mod patch_control_file;
 pub mod pull_timeline;
 pub mod receive_wal;
 pub mod recovery;
--- a/Show More
+++ b/Show More