diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml
index e401b2f418..892e21114b 100644
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -69,7 +69,15 @@ jobs:
         run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
 
       - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
+        run: |
+          /kaniko/executor \
+            --reproducible \
+            --snapshotMode=redo \
+            --skip-unused-stages \
+            --dockerfile ${{ inputs.dockerfile-path }} \
+            --cache=true \
+            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
+            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
 
   kaniko-arm:
     if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -85,7 +93,15 @@ jobs:
         run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
 
       - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+        run: |
+          /kaniko/executor \
+            --reproducible \
+            --snapshotMode=redo \
+            --skip-unused-stages \
+            --dockerfile ${{ inputs.dockerfile-path }} \
+            --cache=true \
+            --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
+            --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
 
   manifest:
     if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
@@ -99,7 +115,10 @@ jobs:
 
     steps:
       - name: Create manifest
-        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
+        run: |
+          docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} \
+                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 \
+                         --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
 
       - name: Push manifest
         run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 2b88f09b3d..147d5cae2d 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -21,6 +21,8 @@ env:
   COPT: '-Werror'
   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
   AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+  # Concurrency group used for e2e-tests runs; it matches `concurrency.group` above, with `github.repository` as a prefix
+  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
 
 jobs:
   check-permissions:
@@ -44,6 +46,20 @@ jobs:
 
         exit 1
 
+  cancel-previous-e2e-tests:
+    needs: [ check-permissions ]
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Cancel previous e2e-tests runs for this PR
+        env:  # E2E_CONCURRENCY_GROUP is inherited from the workflow-level `env`; the script reads it as a shell variable
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        run: |
+          gh workflow --repo neondatabase/cloud \
+            run cancel-previous-in-concurrency-group.yml \
+              --field concurrency_group="${E2E_CONCURRENCY_GROUP}"
+
   tag:
     needs: [ check-permissions ]
     runs-on: [ self-hosted, gen3, small ]
@@ -186,7 +202,11 @@ jobs:
     runs-on: [ self-hosted, gen3, large ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      options: --init
+      # Raise the locked memory limit to 64 MiB (67108864 bytes) for tokio-epoll-uring.
+      # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
+      # io_uring accounts the memory of its CQ and SQ as locked memory.
+      # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
     strategy:
       fail-fast: false
       matrix:
@@ -340,8 +360,12 @@ jobs:
           ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
 
       - name: Run rust tests
+        env:
+          NEXTEST_RETRIES: 3
         run: |
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          for io_engine in std-fs tokio-epoll-uring ; do
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          done
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
@@ -419,8 +443,8 @@ jobs:
     runs-on: [ self-hosted, gen3, large ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      # Default shared memory is 64mb
-      options: --init --shm-size=512mb
+      # Default shared memory is 64mb; for the memlock limit, see the comments on `options:` earlier in this file
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
     strategy:
       fail-fast: false
       matrix:
@@ -448,6 +472,7 @@ jobs:
           TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
           CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
           BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
 
       - name: Merge and upload coverage data
         if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
@@ -458,12 +483,13 @@ jobs:
     runs-on: [ self-hosted, gen3, small ]
     container:
       image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      # Default shared memory is 64mb
-      options: --init --shm-size=512mb
+      # Default shared memory is 64mb; for the memlock limit, see the comments on `options:` earlier in this file
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
     if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
     strategy:
       fail-fast: false
       matrix:
+        # the number of groups (N) must match `extra_params: --splits N ...` below
         pytest_split_group: [ 1, 2, 3, 4 ]
         build_type: [ release ]
     steps:
@@ -477,11 +503,12 @@ jobs:
           test_selection: performance
           run_in_parallel: false
           save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
+          extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
         env:
           VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
           PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
           TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
+          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
 
@@ -695,7 +722,8 @@ jobs:
                 \"commit_hash\": \"$COMMIT_SHA\",
                 \"remote_repo\": \"${{ github.repository }}\",
                 \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
-                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"concurrency_group\": \"${E2E_CONCURRENCY_GROUP}\"
               }
             }"
 
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index c6c2b7386a..f8fb62d3f8 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -124,12 +124,12 @@ jobs:
       # Hence keeping target/ (and general cache size) smaller
       BUILD_TYPE: release
       CARGO_FEATURES: --features testing
-      CARGO_FLAGS: --locked --release
+      CARGO_FLAGS: --release
       AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
     container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
       options: --init
 
     steps:
@@ -210,18 +210,20 @@ jobs:
 
       - name: Run cargo build
         run: |
-          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
 
       - name: Run cargo test
+        env:
+          NEXTEST_RETRIES: 3
         run: |
-          cargo test $CARGO_FLAGS $CARGO_FEATURES
+          cargo nextest run $CARGO_FLAGS $CARGO_FEATURES # $CARGO_FLAGS keeps the tests on the same profile as the build step
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
           export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
           export REMOTE_STORAGE_S3_REGION=eu-central-1
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+          cargo nextest run $CARGO_FLAGS --package remote_storage --test test_real_s3
 
           # Run separate tests for real Azure Blob Storage
           # XXX: replace region with `eu-central-1`-like region
@@ -231,7 +233,7 @@ jobs:
           export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
           export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+          cargo nextest run $CARGO_FLAGS --package remote_storage --test test_real_azure
 
   check-codestyle-rust-arm:
     timeout-minutes: 90
diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml
index 88bab797b7..900724fc60 100644
--- a/.github/workflows/update_build_tools_image.yml
+++ b/.github/workflows/update_build_tools_image.yml
@@ -20,111 +20,51 @@ defaults:
   run:
     shell: bash -euo pipefail {0}
 
-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
 permissions: {}
 
 jobs:
   tag-image:
     runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
 
     env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-    outputs:
-      next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
-      prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Get source image digest
-        id: next-digest
-        run: |
-          NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
-            exit 1
-          fi
-
-          echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
-          echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
-
-      - name: Get destination image digest (if already exists)
-        id: prev-digest
-        run: |
-          PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
-          if [ -z "${PREV_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
-          else
-            echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
-
-            echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Tag image
-        run: |
-          crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
-
-  rollback-tag-image:
-    needs:  tag-image
-    if: ${{ !success() }}
-
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      ECR_IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
+      DOCKER_HUB_IMAGE: docker.io/neondatabase/build-tools
       FROM_TAG: ${{ inputs.from-tag }}
       TO_TAG: ${{ inputs.to-tag }}
 
     steps:
-      - name: Install Crane & ECR helper
+      # Use a custom DOCKER_CONFIG directory to avoid conflicts with the default settings
+      # (by default, DOCKER_CONFIG is ~/.docker)
+      - name: Set custom docker config directory
         run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
+          mkdir -p .docker-custom
+          echo "DOCKER_CONFIG=$(pwd)/.docker-custom" >> "${GITHUB_ENV}"
 
-      - name: Configure ECR login
+      - uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/login-action@v2
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install crane
         run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+          go install github.com/google/go-containerregistry/cmd/crane@a0658aa1d0cc7a7f1bcc4a3af9155335b6943f40 # v0.18.0
 
-      - name: Restore previous tag if needed
+      - name: Copy images
         run: |
-          NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
-          PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
+          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${ECR_IMAGE}:${TO_TAG}"
+          crane copy "${ECR_IMAGE}:${FROM_TAG}" "${DOCKER_HUB_IMAGE}:${TO_TAG}"
 
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
-            exit 0
-          fi
-
-          if [ -z "${PREV_DIGEST}" ]; then
-            # I guess we should delete the tag here/untag the image, but crane does not support it
-            # - https://github.com/google/go-containerregistry/issues/999
-
-            echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
-
-            exit 0
-          fi
-
-          CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
-          if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
-            crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
-
-            echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
-          else
-            echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
-          fi
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
diff --git a/Cargo.lock b/Cargo.lock
index 952034a16b..f0bcfb762a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10,9 +10,9 @@ checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5"
 
 [[package]]
 name = "addr2line"
-version = "0.19.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97"
+checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb"
 dependencies = [
  "gimli",
 ]
@@ -278,6 +278,7 @@ dependencies = [
  "camino",
  "clap",
  "control_plane",
+ "diesel",
  "futures",
  "git-version",
  "hyper",
@@ -840,15 +841,15 @@ dependencies = [
 
 [[package]]
 name = "backtrace"
-version = "0.3.67"
+version = "0.3.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca"
+checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837"
 dependencies = [
  "addr2line",
  "cc",
  "cfg-if",
  "libc",
- "miniz_oxide 0.6.2",
+ "miniz_oxide",
  "object",
  "rustc-demangle",
 ]
@@ -1215,7 +1216,7 @@ dependencies = [
  "flate2",
  "futures",
  "hyper",
- "nix 0.26.2",
+ "nix 0.27.1",
  "notify",
  "num_cpus",
  "opentelemetry",
@@ -1327,11 +1328,13 @@ dependencies = [
  "clap",
  "comfy-table",
  "compute_api",
+ "diesel",
+ "diesel_migrations",
  "futures",
  "git-version",
  "hex",
  "hyper",
- "nix 0.26.2",
+ "nix 0.27.1",
  "once_cell",
  "pageserver_api",
  "pageserver_client",
@@ -1341,6 +1344,7 @@ dependencies = [
  "regex",
  "reqwest",
  "safekeeper_api",
+ "scopeguard",
  "serde",
  "serde_json",
  "serde_with",
@@ -1636,6 +1640,52 @@ dependencies = [
  "rusticata-macros",
 ]
 
+[[package]]
+name = "diesel"
+version = "2.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8"
+dependencies = [
+ "bitflags 2.4.1",
+ "byteorder",
+ "diesel_derives",
+ "itoa",
+ "pq-sys",
+ "serde_json",
+]
+
+[[package]]
+name = "diesel_derives"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44"
+dependencies = [
+ "diesel_table_macro_syntax",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.32",
+]
+
+[[package]]
+name = "diesel_migrations"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac"
+dependencies = [
+ "diesel",
+ "migrations_internals",
+ "migrations_macros",
+]
+
+[[package]]
+name = "diesel_table_macro_syntax"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5"
+dependencies = [
+ "syn 2.0.32",
+]
+
 [[package]]
 name = "digest"
 version = "0.10.7"
@@ -1872,13 +1922,13 @@ dependencies = [
 
 [[package]]
 name = "filetime"
-version = "0.2.21"
+version = "0.2.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153"
+checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0"
 dependencies = [
  "cfg-if",
  "libc",
- "redox_syscall 0.2.16",
+ "redox_syscall 0.3.5",
  "windows-sys 0.48.0",
 ]
 
@@ -1895,7 +1945,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
 dependencies = [
  "crc32fast",
- "miniz_oxide 0.7.1",
+ "miniz_oxide",
 ]
 
 [[package]]
@@ -2093,9 +2143,9 @@ dependencies = [
 
 [[package]]
 name = "gimli"
-version = "0.27.2"
+version = "0.28.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
+checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
 
 [[package]]
 name = "git-version"
@@ -2562,6 +2612,16 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "io-uring"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460648e47a07a43110fbfa2e0b14afb2be920093c31e5dccc50e49568e099762"
+dependencies = [
+ "bitflags 1.3.2",
+ "libc",
+]
+
 [[package]]
 name = "ipnet"
 version = "2.9.0"
@@ -2748,18 +2808,18 @@ checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
 
 [[package]]
 name = "memoffset"
-version = "0.7.1"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
+checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
 dependencies = [
  "autocfg",
 ]
 
 [[package]]
 name = "memoffset"
-version = "0.8.0"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
+checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
 dependencies = [
  "autocfg",
 ]
@@ -2775,6 +2835,27 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "migrations_internals"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada"
+dependencies = [
+ "serde",
+ "toml",
+]
+
+[[package]]
+name = "migrations_macros"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08"
+dependencies = [
+ "migrations_internals",
+ "proc-macro2",
+ "quote",
+]
+
 [[package]]
 name = "mime"
 version = "0.3.17"
@@ -2797,15 +2878,6 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
 
-[[package]]
-name = "miniz_oxide"
-version = "0.6.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa"
-dependencies = [
- "adler",
-]
-
 [[package]]
 name = "miniz_oxide"
 version = "0.7.1"
@@ -2865,16 +2937,14 @@ dependencies = [
 
 [[package]]
 name = "nix"
-version = "0.26.2"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a"
+checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.1",
  "cfg-if",
  "libc",
- "memoffset 0.7.1",
- "pin-utils",
- "static_assertions",
+ "memoffset 0.9.0",
 ]
 
 [[package]]
@@ -2889,20 +2959,21 @@ dependencies = [
 
 [[package]]
 name = "notify"
-version = "5.2.0"
+version = "6.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "729f63e1ca555a43fe3efa4f3efdf4801c479da85b432242a7b726f353c88486"
+checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.1",
  "crossbeam-channel",
  "filetime",
  "fsevent-sys",
  "inotify 0.9.6",
  "kqueue",
  "libc",
+ "log",
  "mio",
  "walkdir",
- "windows-sys 0.45.0",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
@@ -3028,9 +3099,9 @@ dependencies = [
 
 [[package]]
 name = "object"
-version = "0.30.3"
+version = "0.32.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea86265d3d3dcb6a27fc51bd29a4bf387fae9d2986b823079d4986af253eb439"
+checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
 dependencies = [
  "memchr",
 ]
@@ -3102,9 +3173,9 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
+checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54"
 dependencies = [
  "opentelemetry_api",
  "opentelemetry_sdk",
@@ -3112,9 +3183,9 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry-http"
-version = "0.8.0"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
+checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b"
 dependencies = [
  "async-trait",
  "bytes",
@@ -3125,54 +3196,56 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry-otlp"
-version = "0.12.0"
+version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
+checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275"
 dependencies = [
  "async-trait",
- "futures",
- "futures-util",
+ "futures-core",
  "http",
- "opentelemetry",
  "opentelemetry-http",
  "opentelemetry-proto",
+ "opentelemetry-semantic-conventions",
+ "opentelemetry_api",
+ "opentelemetry_sdk",
  "prost",
  "reqwest",
  "thiserror",
+ "tokio",
+ "tonic",
 ]
 
 [[package]]
 name = "opentelemetry-proto"
-version = "0.2.0"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
+checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb"
 dependencies = [
- "futures",
- "futures-util",
- "opentelemetry",
+ "opentelemetry_api",
+ "opentelemetry_sdk",
  "prost",
- "tonic 0.8.3",
+ "tonic",
 ]
 
 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.11.0"
+version = "0.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
+checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269"
 dependencies = [
  "opentelemetry",
 ]
 
 [[package]]
 name = "opentelemetry_api"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
+checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b"
 dependencies = [
- "fnv",
  "futures-channel",
  "futures-util",
  "indexmap 1.9.3",
+ "js-sys",
  "once_cell",
  "pin-project-lite",
  "thiserror",
@@ -3181,21 +3254,22 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry_sdk"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
+checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026"
 dependencies = [
  "async-trait",
  "crossbeam-channel",
- "dashmap",
- "fnv",
  "futures-channel",
  "futures-executor",
  "futures-util",
  "once_cell",
  "opentelemetry_api",
+ "ordered-float 3.9.2",
  "percent-encoding",
  "rand 0.8.5",
+ "regex",
+ "serde_json",
  "thiserror",
  "tokio",
  "tokio-stream",
@@ -3210,6 +3284,15 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "ordered-float"
+version = "3.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "ordered-multimap"
 version = "0.7.1"
@@ -3325,7 +3408,7 @@ dependencies = [
  "itertools",
  "md5",
  "metrics",
- "nix 0.26.2",
+ "nix 0.27.1",
  "num-traits",
  "num_cpus",
  "once_cell",
@@ -3358,6 +3441,7 @@ dependencies = [
  "tenant_size_model",
  "thiserror",
  "tokio",
+ "tokio-epoll-uring",
  "tokio-io-timeout",
  "tokio-postgres",
  "tokio-stream",
@@ -3780,6 +3864,15 @@ version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 
+[[package]]
+name = "pq-sys"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd"
+dependencies = [
+ "vcpkg",
+]
+
 [[package]]
 name = "pq_proto"
 version = "0.1.0"
@@ -4339,9 +4432,9 @@ dependencies = [
 
 [[package]]
 name = "reqwest-tracing"
-version = "0.4.5"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
+checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -5110,9 +5203,9 @@ checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
 
 [[package]]
 name = "smol_str"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c"
+checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49"
 dependencies = [
  "serde",
 ]
@@ -5195,7 +5288,7 @@ dependencies = [
  "prost",
  "tokio",
  "tokio-stream",
- "tonic 0.9.2",
+ "tonic",
  "tonic-build",
  "tracing",
  "utils",
@@ -5379,18 +5472,18 @@ dependencies = [
 
 [[package]]
 name = "thiserror"
-version = "1.0.40"
+version = "1.0.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
+checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.40"
+version = "1.0.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
+checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -5415,7 +5508,7 @@ checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
 dependencies = [
  "byteorder",
  "integer-encoding",
- "ordered-float",
+ "ordered-float 2.10.1",
 ]
 
 [[package]]
@@ -5514,6 +5607,21 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "tokio-epoll-uring"
+version = "0.1.0"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
+dependencies = [
+ "futures",
+ "once_cell",
+ "scopeguard",
+ "thiserror",
+ "tokio",
+ "tokio-util",
+ "tracing",
+ "uring-common",
+]
+
 [[package]]
 name = "tokio-io-timeout"
 version = "1.2.0"
@@ -5681,38 +5789,6 @@ dependencies = [
  "winnow",
 ]
 
-[[package]]
-name = "tonic"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb"
-dependencies = [
- "async-stream",
- "async-trait",
- "axum",
- "base64 0.13.1",
- "bytes",
- "futures-core",
- "futures-util",
- "h2",
- "http",
- "http-body",
- "hyper",
- "hyper-timeout",
- "percent-encoding",
- "pin-project",
- "prost",
- "prost-derive",
- "tokio",
- "tokio-stream",
- "tokio-util",
- "tower",
- "tower-layer",
- "tower-service",
- "tracing",
- "tracing-futures",
-]
-
 [[package]]
 name = "tonic"
 version = "0.9.2"
@@ -5856,16 +5932,6 @@ dependencies = [
  "tracing-subscriber",
 ]
 
-[[package]]
-name = "tracing-futures"
-version = "0.2.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2"
-dependencies = [
- "pin-project",
- "tracing",
-]
-
 [[package]]
 name = "tracing-log"
 version = "0.1.3"
@@ -5879,9 +5945,9 @@ dependencies = [
 
 [[package]]
 name = "tracing-opentelemetry"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
+checksum = "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19"
 dependencies = [
  "once_cell",
  "opentelemetry",
@@ -6065,6 +6131,15 @@ dependencies = [
  "webpki-roots 0.23.1",
 ]
 
+[[package]]
+name = "uring-common"
+version = "0.1.0"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
+dependencies = [
+ "io-uring",
+ "libc",
+]
+
 [[package]]
 name = "url"
 version = "2.3.1"
@@ -6118,7 +6193,7 @@ dependencies = [
  "hyper",
  "jsonwebtoken",
  "metrics",
- "nix 0.26.2",
+ "nix 0.27.1",
  "once_cell",
  "pin-project-lite",
  "postgres_connection",
@@ -6626,10 +6701,9 @@ dependencies = [
  "clap",
  "clap_builder",
  "crossbeam-utils",
- "dashmap",
+ "diesel",
  "either",
  "fail",
- "futures",
  "futures-channel",
  "futures-core",
  "futures-executor",
@@ -6674,6 +6748,7 @@ dependencies = [
  "tokio-util",
  "toml_datetime",
  "toml_edit",
+ "tonic",
  "tower",
  "tracing",
  "tracing-core",
diff --git a/Cargo.toml b/Cargo.toml
index 5d5d2f4a55..8afab02b15 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -99,14 +99,14 @@ libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
 native-tls = "0.2"
-nix = "0.26"
-notify = "5.0.0"
+nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
+notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.19.0"
-opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.11.0"
+opentelemetry = "0.20.0"
+opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
 parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
 parquet_derive = "49.0.0"
@@ -118,7 +118,7 @@ rand = "0.8"
 redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
+reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
 reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
@@ -151,6 +151,7 @@ test-context = "0.1"
 thiserror = "1.0"
 tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
 tokio = { version = "1.17", features = ["macros"] }
+tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.10.0"
 tokio-rustls = "0.24"
@@ -162,7 +163,7 @@ toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
-tracing-opentelemetry = "0.19.0"
+tracing-opentelemetry = "0.20.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 url = "2.2"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 908460018f..299c4097e8 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -52,7 +52,7 @@ RUN cd postgres && \
     # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
     # In vanilla postgres this function is limited to Postgres role superuser.
     # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
-    # We could add the additional grant statements to the postgres repository but it would be hard to maintain, 
+    # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
     # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
     # so we do it here.
     old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
@@ -63,14 +63,14 @@ RUN cd postgres && \
             echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
         fi; \
     done; \
-    # the second loop is for pg_stat_statement extension versions >= 1.7, 
+    # the second loop is for pg_stat_statement extension versions >= 1.7,
     # where pg_stat_statement_reset() got 3 additional arguments
     for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
         filename=$(basename "$file"); \
         if ! echo "$old_list" | grep -q -F "$filename"; then \
             echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
         fi; \
-    done      
+    done
 
 #########################################################################################
 #
@@ -143,29 +143,24 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
 #########################################################################################
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
 RUN apt update && \
     apt install -y ninja-build python3-dev libncurses5 binutils clang
 
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        export PLV8_VERSION=3.1.5 \
-        export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
-        ;; \
-      "v16") \
-        export PLV8_VERSION=3.1.8 \
-        export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
-        ;; \
-      *) \
-        echo "Export the valid PG_VERSION variable" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
-    echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \
+    echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \
     mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
+    # generate and copy upgrade scripts
+    mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
+    cp upgrade/* /usr/local/pgsql/share/extension/ && \
     export PATH="/usr/local/pgsql/bin:$PATH" && \
     make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
     rm -rf /plv8-* && \
     find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
+    # don't break computes with installed old version of plv8
+    cd /usr/local/pgsql/lib/ && \
+    ln -s plv8-3.1.10.so plv8-3.1.5.so && \
+    ln -s plv8-3.1.10.so plv8-3.1.8.so && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
@@ -551,6 +546,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
         -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
         -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
         -D RDK_INSTALL_INTREE=OFF \
+        -D RDK_INSTALL_COMIC_FONTS=OFF \
         -D CMAKE_BUILD_TYPE=Release \
         . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -617,6 +613,7 @@ RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O
 FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
+ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in \
       "v14" | "v15") \
@@ -779,6 +776,8 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
 #
 #########################################################################################
 FROM build-deps AS neon-pg-ext-build
+ARG PG_VERSION
+
 # Public extensions
 COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=postgis-build /sfcgal/* /
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 5f5363105c..07e0abe6ff 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -700,13 +700,14 @@ impl ComputeNode {
         // In this case we need to connect with old `zenith_admin` name
         // and create new user. We cannot simply rename connected user,
         // but we can create a new one and grant it all privileges.
-        let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
+        let connstr = self.connstr.clone();
+        let mut client = match Client::connect(connstr.as_str(), NoTls) {
             Err(e) => {
                 info!(
                     "cannot connect to postgres: {}, retrying with `zenith_admin` username",
                     e
                 );
-                let mut zenith_admin_connstr = self.connstr.clone();
+                let mut zenith_admin_connstr = connstr.clone();
 
                 zenith_admin_connstr
                     .set_username("zenith_admin")
@@ -719,8 +720,8 @@ impl ComputeNode {
                 client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                 drop(client);
 
-                // reconnect with connsting with expected name
-                Client::connect(self.connstr.as_str(), NoTls)?
+                // reconnect with connstring with expected name
+                Client::connect(connstr.as_str(), NoTls)?
             }
             Ok(client) => client,
         };
@@ -734,8 +735,8 @@ impl ComputeNode {
         cleanup_instance(&mut client)?;
         handle_roles(spec, &mut client)?;
         handle_databases(spec, &mut client)?;
-        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, &mut client, self.connstr.as_str())?;
+        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
+        handle_grants(spec, &mut client, connstr.as_str())?;
         handle_extensions(spec, &mut client)?;
         handle_extension_neon(&mut client)?;
         create_availability_check_data(&mut client)?;
@@ -743,6 +744,12 @@ impl ComputeNode {
         // 'Close' connection
         drop(client);
 
+        if self.has_feature(ComputeFeature::Migrations) {
+            thread::spawn(move || {
+                let mut client = Client::connect(connstr.as_str(), NoTls)?;
+                handle_migrations(&mut client)
+            });
+        }
         Ok(())
     }
 
@@ -807,6 +814,10 @@ impl ComputeNode {
             handle_grants(&spec, &mut client, self.connstr.as_str())?;
             handle_extensions(&spec, &mut client)?;
             handle_extension_neon(&mut client)?;
+            // We can skip handle_migrations here because a new migration can only appear
+            // if we have a new version of the compute_ctl binary, which can only happen
+            // if compute got restarted, in which case we'll end up inside of apply_config
+            // instead of reconfigure.
         }
 
         // 'Close' connection
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index ef5f55622d..e87dc0b732 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -727,3 +727,88 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
 
     Ok(())
 }
+
+/// Applies any not-yet-applied entries of the built-in migration list using `client`.
+///
+/// The number of migrations already applied is read from `neon_migration.migration_id`;
+/// the schema, the table and its initial row are created here if they do not exist yet.
+/// Pending migrations and the counter update are sent between one `BEGIN`/`COMMIT` pair.
+///
+/// Returns the first error reported by any of the queries.
+#[instrument(skip_all)]
+pub fn handle_migrations(client: &mut Client) -> Result<()> {
+    info!("handle migrations");
+
+    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
+    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+    let migrations = [
+        "ALTER ROLE neon_superuser BYPASSRLS",
+        r#"
+DO $$
+DECLARE
+    role_name text;
+BEGIN
+    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
+    END LOOP;
+
+    FOR role_name IN SELECT rolname FROM pg_roles
+        WHERE
+            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
+    END LOOP;
+END $$;
+"#,
+    ];
+
+    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+    client.simple_query(query)?;
+
+    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+    client.simple_query(query)?;
+
+    query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+    client.simple_query(query)?;
+
+    query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+    client.simple_query(query)?;
+
+    query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+    client.simple_query(query)?;
+
+    query = "SELECT id FROM neon_migration.migration_id";
+    let row = client.query_one(query, &[])?;
+    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
+    let starting_migration_id = current_migration;
+
+    query = "BEGIN";
+    client.simple_query(query)?;
+
+    while current_migration < migrations.len() {
+        info!("Running migration:\n{}\n", migrations[current_migration]);
+        client.simple_query(migrations[current_migration])?;
+        current_migration += 1;
+    }
+    let setval = format!(
+        "UPDATE neon_migration.migration_id SET id={}",
+        migrations.len()
+    );
+    client.simple_query(&setval)?;
+
+    query = "COMMIT";
+    client.simple_query(query)?;
+
+    // Nothing above guarantees that the stored id is <= migrations.len(), so saturate
+    // rather than underflow (which would panic in debug builds) when computing the count.
+    info!(
+        "Ran {} migrations",
+        migrations.len().saturating_sub(starting_migration_id)
+    );
+    Ok(())
+}
diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml
index 898ad05add..09c171f1d3 100644
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -10,6 +10,8 @@ async-trait.workspace = true
 camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+diesel = { version = "2.1.4", features = ["postgres"]}
+diesel_migrations = { version = "2.1.0", features = ["postgres"]}
 futures.workspace = true
 git-version.workspace = true
 nix.workspace = true
@@ -19,6 +21,7 @@ hex.workspace = true
 hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
+scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 serde_with.workspace = true
diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml
index 2e2286dbab..6fc21810bc 100644
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -25,6 +25,8 @@ tracing.workspace = true
 # a parsing function when loading pageservers from neon_local LocalEnv
 postgres_backend.workspace = true
 
+diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
+
 utils = { path = "../../libs/utils/" }
 metrics = { path = "../../libs/metrics/" }
 control_plane = { path = ".." }
diff --git a/control_plane/attachment_service/migrations/.keep b/control_plane/attachment_service/migrations/.keep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
new file mode 100644
index 0000000000..a9f5260911
--- /dev/null
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
@@ -0,0 +1,6 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
+DROP FUNCTION IF EXISTS diesel_set_updated_at();
diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
new file mode 100644
index 0000000000..d68895b1a7
--- /dev/null
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
@@ -0,0 +1,36 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+
+
+
+-- Sets up a trigger for the given table to automatically set a column called
+-- `updated_at` whenever the row is modified (unless `updated_at` was included
+-- in the modified columns)
+--
+-- # Example
+--
+-- ```sql
+-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
+--
+-- SELECT diesel_manage_updated_at('users');
+-- ```
+CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
+BEGIN
+    EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
+                    FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
+BEGIN
+    IF (
+        NEW IS DISTINCT FROM OLD AND
+        NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
+    ) THEN
+        NEW.updated_at := current_timestamp;
+    END IF;
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
new file mode 100644
index 0000000000..b875b91c00
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
@@ -0,0 +1 @@
+DROP TABLE tenant_shards;
diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
new file mode 100644
index 0000000000..585dbc79a0
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
@@ -0,0 +1,12 @@
+CREATE TABLE tenant_shards (
+  tenant_id VARCHAR NOT NULL,
+  shard_number INTEGER NOT NULL,
+  shard_count INTEGER NOT NULL,
+  PRIMARY KEY(tenant_id, shard_number, shard_count),
+  shard_stripe_size INTEGER NOT NULL,
+  generation INTEGER NOT NULL,
+  generation_pageserver BIGINT NOT NULL,
+  placement_policy VARCHAR NOT NULL,
+  -- config is JSON encoded, opaque to the database.
+  config TEXT NOT NULL
+);
\ No newline at end of file
diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
new file mode 100644
index 0000000000..ec303bc8cf
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
@@ -0,0 +1 @@
+DROP TABLE nodes;
diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
new file mode 100644
index 0000000000..9be0880fa4
--- /dev/null
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
@@ -0,0 +1,10 @@
+CREATE TABLE nodes (
+  node_id BIGINT PRIMARY KEY NOT NULL,
+
+  scheduling_policy VARCHAR NOT NULL,
+
+  listen_http_addr VARCHAR NOT NULL,
+  listen_http_port INTEGER NOT NULL,
+  listen_pg_addr VARCHAR NOT NULL,
+  listen_pg_port INTEGER NOT NULL
+);
\ No newline at end of file
diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs
index 30f6dd66ee..81f21a8e7a 100644
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,5 +1,5 @@
 use crate::reconciler::ReconcileError;
-use crate::service::Service;
+use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
@@ -104,34 +104,34 @@ async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiErr
     json_response(StatusCode::OK, state.service.inspect(inspect_req))
 }
 
-async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_create(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
-    let state = get_state(&req);
-    json_response(
-        StatusCode::OK,
-        state.service.tenant_create(create_req).await?,
-    )
+    json_response(StatusCode::OK, service.tenant_create(create_req).await?)
 }
 
-async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_timeline_create(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
-
-    let state = get_state(&req);
     json_response(
         StatusCode::OK,
-        state
-            .service
+        service
             .tenant_timeline_create(tenant_id, create_req)
             .await?,
     )
 }
 
-async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_locate(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let state = get_state(&req);
-
-    json_response(StatusCode::OK, state.service.tenant_locate(tenant_id)?)
+    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }
 
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -154,14 +154,15 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
     json_response(StatusCode::OK, state.service.node_configure(config_req)?)
 }
 
-async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_shard_migrate(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
     let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
     let migrate_req = json_request::<TenantShardMigrateRequest>(&mut req).await?;
-    let state = get_state(&req);
     json_response(
         StatusCode::OK,
-        state
-            .service
+        service
             .tenant_shard_migrate(tenant_shard_id, migrate_req)
             .await?,
     )
@@ -178,6 +179,35 @@ impl From<ReconcileError> for ApiError {
     }
 }
 
+/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
+/// be allowed to run if Service has finished its initial reconciliation.
+async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
+where
+    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
+{
+    let state = get_state(&request);
+    let service = state.service.clone();
+
+    let startup_complete = service.startup_complete.clone();
+    if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait())
+        .await
+        .is_err()
+    {
+        // This shouldn't happen: it is the responsibility of [`Service::startup_reconcile`] to use appropriate
+        // timeouts around its remote calls, to bound its runtime.
+        return Err(ApiError::Timeout(
+            "Timed out waiting for service readiness".into(),
+        ));
+    }
+
+    request_span(
+        request,
+        |request| async move { handler(service, request).await },
+    )
+    .await
+}
+
 pub fn make_router(
     service: Arc<Service>,
     auth: Option<Arc<SwappableJwtAuth>>,
@@ -205,14 +235,20 @@ pub fn make_router(
         .put("/node/:node_id/config", |r| {
             request_span(r, handle_node_configure)
         })
-        .post("/tenant", |r| request_span(r, handle_tenant_create))
-        .post("/tenant/:tenant_id/timeline", |r| {
-            request_span(r, handle_tenant_timeline_create)
+        .post("/v1/tenant", |r| {
+            tenant_service_handler(r, handle_tenant_create)
+        })
+        .post("/v1/tenant/:tenant_id/timeline", |r| {
+            tenant_service_handler(r, handle_tenant_timeline_create)
         })
         .get("/tenant/:tenant_id/locate", |r| {
-            request_span(r, handle_tenant_locate)
+            tenant_service_handler(r, handle_tenant_locate)
         })
         .put("/tenant/:tenant_shard_id/migrate", |r| {
-            request_span(r, handle_tenant_shard_migrate)
+            tenant_service_handler(r, handle_tenant_shard_migrate)
         })
+        // Path aliases for tests_forward_compatibility
+        // TODO: remove these in future PR
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
 }
diff --git a/control_plane/attachment_service/src/lib.rs b/control_plane/attachment_service/src/lib.rs
index d8f996952a..082afb4157 100644
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -7,6 +7,7 @@ mod node;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
+mod schema;
 pub mod service;
 mod tenant_state;
 
@@ -17,6 +18,8 @@ enum PlacementPolicy {
     /// Production-ready way to attach a tenant: one attached pageserver and
     /// some number of secondaries.
     Double(usize),
+    /// Do not attach to any pageservers
+    Detached,
 }
 
 #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs
index ee2a22ee53..05a3895dfa 100644
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -12,9 +12,9 @@ use camino::Utf8PathBuf;
 use clap::Parser;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
+use tokio::signal::unix::SignalKind;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};
-use utils::signals::{ShutdownSignals, Signal};
 
 use utils::{project_build_tag, project_git_version, tcp_listener};
 
@@ -40,6 +40,10 @@ struct Cli {
     /// Path to the .json file to store state (will be created if it doesn't exist)
     #[arg(short, long)]
     path: Utf8PathBuf,
+
+    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
+    #[arg(long)]
+    database_url: String,
 }
 
 #[tokio::main]
@@ -66,9 +70,14 @@ async fn main() -> anyhow::Result<()> {
         jwt_token: args.jwt_token,
     };
 
-    let persistence = Arc::new(Persistence::new(&args.path).await);
+    let json_path = if args.path.as_os_str().is_empty() {
+        None
+    } else {
+        Some(args.path)
+    };
+    let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
 
-    let service = Service::spawn(config, persistence).await?;
+    let service = Service::spawn(config, persistence.clone()).await?;
 
     let http_listener = tcp_listener::bind(args.listen)?;
 
@@ -81,20 +90,31 @@ async fn main() -> anyhow::Result<()> {
     let router = make_router(service, auth)
         .build()
         .map_err(|err| anyhow!(err))?;
-    let service = utils::http::RouterService::new(router).unwrap();
-    let server = hyper::Server::from_tcp(http_listener)?.serve(service);
+    let router_service = utils::http::RouterService::new(router).unwrap();
+    let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);
 
     tracing::info!("Serving on {0}", args.listen);
 
     tokio::task::spawn(server);
 
-    ShutdownSignals::handle(|signal| match signal {
-        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
-            tracing::info!("Got {}. Terminating", signal.name());
-            // We're just a test helper: no graceful shutdown.
-            std::process::exit(0);
-        }
-    })?;
+    // Wait until we receive a signal
+    let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
+    let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?;
+    let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?;
+    tokio::select! {
+        _ = sigint.recv() => {},
+        _ = sigterm.recv() => {},
+        _ = sigquit.recv() => {},
+    }
+    tracing::info!("Terminating on signal");
 
-    Ok(())
+    if json_path.is_some() {
+        // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing
+        // full postgres dumps around.
+        if let Err(e) = persistence.write_tenants_json().await {
+            tracing::error!("Failed to write JSON on shutdown: {e}")
+        }
+    }
+
+    std::process::exit(0);
 }
diff --git a/control_plane/attachment_service/src/node.rs b/control_plane/attachment_service/src/node.rs
index efd3f8f49b..47f61702d8 100644
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,6 +1,8 @@
 use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
 use utils::id::NodeId;
 
+use crate::persistence::NodePersistence;
+
 #[derive(Clone)]
 pub(crate) struct Node {
     pub(crate) id: NodeId,
@@ -34,4 +36,15 @@ impl Node {
             NodeSchedulingPolicy::Pause => false,
         }
     }
+
+    pub(crate) fn to_persistent(&self) -> NodePersistence {
+        NodePersistence {
+            node_id: self.id.0 as i64,
+            scheduling_policy: self.scheduling.into(),
+            listen_http_addr: self.listen_http_addr.clone(),
+            listen_http_port: self.listen_http_port as i32,
+            listen_pg_addr: self.listen_pg_addr.clone(),
+            listen_pg_port: self.listen_pg_port as i32,
+        }
+    }
 }
diff --git a/control_plane/attachment_service/src/persistence.rs b/control_plane/attachment_service/src/persistence.rs
index 58708be140..b27bd2bf2e 100644
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -1,139 +1,161 @@
-use std::{collections::HashMap, str::FromStr};
+use std::collections::HashMap;
+use std::str::FromStr;
 
-use camino::{Utf8Path, Utf8PathBuf};
-use control_plane::{
-    attachment_service::{NodeAvailability, NodeSchedulingPolicy},
-    local_env::LocalEnv,
-};
-use pageserver_api::{
-    models::TenantConfig,
-    shard::{ShardCount, ShardNumber, TenantShardId},
-};
+use camino::Utf8Path;
+use camino::Utf8PathBuf;
+use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+use diesel::pg::PgConnection;
+use diesel::prelude::*;
+use diesel::Connection;
+use pageserver_api::models::TenantConfig;
+use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
-use utils::{
-    generation::Generation,
-    id::{NodeId, TenantId},
-};
+use utils::generation::Generation;
+use utils::id::{NodeId, TenantId};
 
-use crate::{node::Node, PlacementPolicy};
+use crate::node::Node;
+use crate::PlacementPolicy;
 
-/// Placeholder for storage.  This will be replaced with a database client.
+/// ## What do we store?
+///
+/// The attachment service does not store most of its state durably.
+///
+/// The essential things to store durably are:
+/// - generation numbers, as these must always advance monotonically to ensure data safety.
+/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external.
+/// - Node's scheduling policies, as the source of truth for these is something external.
+///
+/// Other things we store durably as an implementation detail:
+/// - Node's host/port: this could be avoided if we made nodes emit a self-registering heartbeat,
+///   but it is operationally simpler to make this service the authority for which nodes
+///   it talks to.
+///
+/// ## Performance/efficiency
+///
+/// The attachment service does not go via the database for most things: there are
+/// a couple of places where we must, and where efficiency matters:
+/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
+///   before it can attach a tenant, so this acts as a bound on how fast things like
+///   failover can happen.
+/// - Pageserver re-attach: we will increment many shards' generations when this happens,
+///   so it is important to avoid e.g. issuing O(N) queries.
+///
+/// Database calls relating to nodes have low performance requirements, as they are very rarely
+/// updated, and reads of nodes are always from memory, not the database.  We only require that
+/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
 pub struct Persistence {
-    state: std::sync::Mutex<PersistentState>,
+    database_url: String,
+
+    // In test environments, we support loading+saving a JSON file.  This is temporary, for the benefit of
+    // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward
+    // compatible just yet.
+    json_path: Option<Utf8PathBuf>,
 }
 
-// Top level state available to all HTTP handlers
+/// Legacy format, for use in JSON compat objects in test environment
 #[derive(Serialize, Deserialize)]
-struct PersistentState {
+struct JsonPersistence {
     tenants: HashMap<TenantShardId, TenantShardPersistence>,
-
-    #[serde(skip)]
-    path: Utf8PathBuf,
 }
 
-/// A convenience for serializing the state inside a sync lock, and then
-/// writing it to disk outside of the lock.  This will go away when switching
-/// to a database backend.
-struct PendingWrite {
-    bytes: Vec<u8>,
-    path: Utf8PathBuf,
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum DatabaseError {
+    #[error(transparent)]
+    Query(#[from] diesel::result::Error),
+    #[error(transparent)]
+    Connection(#[from] diesel::result::ConnectionError),
+    #[error("Logical error: {0}")]
+    Logical(String),
 }
 
-impl PendingWrite {
-    async fn commit(&self) -> anyhow::Result<()> {
-        tokio::fs::write(&self.path, &self.bytes).await?;
-
-        Ok(())
-    }
-}
-
-impl PersistentState {
-    fn save(&self) -> PendingWrite {
-        PendingWrite {
-            bytes: serde_json::to_vec(self).expect("Serialization error"),
-            path: self.path.clone(),
-        }
-    }
-
-    async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
-        let bytes = tokio::fs::read(path).await?;
-        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
-        decoded.path = path.to_owned();
-
-        for (tenant_id, tenant) in &mut decoded.tenants {
-            // Backward compat: an old attachments.json from before PR #6251, replace
-            // empty strings with proper defaults.
-            if tenant.tenant_id.is_empty() {
-                tenant.tenant_id = format!("{}", tenant_id);
-                tenant.config = serde_json::to_string(&TenantConfig::default())?;
-                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
-            }
-        }
-
-        Ok(decoded)
-    }
-
-    async fn load_or_new(path: &Utf8Path) -> Self {
-        match Self::load(path).await {
-            Ok(s) => {
-                tracing::info!("Loaded state file at {}", path);
-                s
-            }
-            Err(e)
-                if e.downcast_ref::<std::io::Error>()
-                    .map(|e| e.kind() == std::io::ErrorKind::NotFound)
-                    .unwrap_or(false) =>
-            {
-                tracing::info!("Will create state file at {}", path);
-                Self {
-                    tenants: HashMap::new(),
-                    path: path.to_owned(),
-                }
-            }
-            Err(e) => {
-                panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
-            }
-        }
-    }
-}
+pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
 
 impl Persistence {
-    pub async fn new(path: &Utf8Path) -> Self {
-        let state = PersistentState::load_or_new(path).await;
+    pub fn new(database_url: String, json_path: Option<Utf8PathBuf>) -> Self {
         Self {
-            state: std::sync::Mutex::new(state),
+            database_url,
+            json_path,
         }
     }
 
-    /// When registering a node, persist it so that on next start we will be able to
-    /// iterate over known nodes to synchronize their tenant shard states with our observed state.
-    pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
-        // TODO: node persitence will come with database backend
-        Ok(())
+    /// Run `func` once on a tokio blocking thread with a freshly established Diesel connection; panics if that task panics.
+    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
+    where
+        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static, // invoked exactly once below
+        R: Send + 'static,
+    {
+        let database_url = self.database_url.clone();
+        tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
+            // TODO: connection pooling, such as via diesel::r2d2
+            let mut conn = PgConnection::establish(&database_url)?;
+            func(&mut conn)
+        })
+        .await
+        .expect("Task panic")
+    }
 
-    /// At startup, we populate the service's list of nodes, and use this list to call into
-    /// each node to do an initial reconciliation of the state of the world with our in-memory
-    /// observed state.
-    pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
-        let env = LocalEnv::load_config()?;
-        // TODO: node persitence will come with database backend
+    /// When a node is first registered, persist it before using it for anything
+    pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
+        let np = node.to_persistent();
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::insert_into(crate::schema::nodes::table)
+                .values(&np)
+                .execute(conn)?;
+            Ok(())
+        })
+        .await
+    }
 
-        // XXX hack: enable test_backward_compatibility to work by populating our list of
+    /// At startup, populate the list of nodes which our shards may be placed on
+    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
+        let nodes: Vec<Node> = self
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::nodes::table
+                    .load::<NodePersistence>(conn)?
+                    .into_iter()
+                    .map(|n| Node {
+                        id: NodeId(n.node_id as u64),
+                        // At startup we consider a node offline until proven otherwise.
+                        availability: NodeAvailability::Offline,
+                        scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
+                            .expect("Bad scheduling policy in DB"),
+                        listen_http_addr: n.listen_http_addr,
+                        listen_http_port: n.listen_http_port as u16,
+                        listen_pg_addr: n.listen_pg_addr,
+                        listen_pg_port: n.listen_pg_port as u16,
+                    })
+                    .collect::<Vec<Node>>())
+            })
+            .await?;
+
+        if nodes.is_empty() {
+            return self.list_nodes_local_env().await;
+        }
+
+        tracing::info!("list_nodes: loaded {} nodes", nodes.len());
+
+        Ok(nodes)
+    }
+
+    /// Shim for automated compatibility tests: load nodes from LocalEnv instead of database
+    pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult<Vec<Node>> {
+        // Enable test_backward_compatibility to work by populating our list of
         // nodes from LocalEnv when it is not present in persistent storage.  Otherwise at
         // first startup in the compat test, we may have shards but no nodes.
-        let mut result = Vec::new();
+        use control_plane::local_env::LocalEnv;
+        let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?;
         tracing::info!(
-            "Loaded {} pageserver nodes from LocalEnv",
+            "Loading {} pageserver nodes from LocalEnv",
             env.pageservers.len()
         );
+        let mut nodes = Vec::new();
         for ps_conf in env.pageservers {
             let (pg_host, pg_port) =
                 parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
             let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
                 .expect("Unable to parse listen_http_addr");
-            result.push(Node {
+            let node = Node {
                 id: ps_conf.id,
                 listen_pg_addr: pg_host.to_string(),
                 listen_pg_port: pg_port.unwrap_or(5432),
@@ -141,16 +163,96 @@ impl Persistence {
                 listen_http_port: http_port.unwrap_or(80),
                 availability: NodeAvailability::Active,
                 scheduling: NodeSchedulingPolicy::Active,
-            });
+            };
+
+            // Synchronize database with what we learn from LocalEnv
+            self.insert_node(&node).await?;
+
+            nodes.push(node);
         }
 
-        Ok(result)
+        Ok(nodes)
     }
 
-    /// At startup, we populate our map of tenant shards from persistent storage.
-    pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
-        let locked = self.state.lock().unwrap();
-        Ok(locked.tenants.values().cloned().collect())
+    /// At startup, load the high level state for shards, such as their config + policy.  This will
+    /// be enriched at runtime with state discovered on pageservers.
+    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
+        let loaded = self
+            .with_conn(move |conn| -> DatabaseResult<_> {
+                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
+            })
+            .await?;
+
+        if loaded.is_empty() {
+            if let Some(path) = &self.json_path {
+                if tokio::fs::try_exists(path)
+                    .await
+                    .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))?
+                {
+                    tracing::info!("Importing from legacy JSON format at {path}");
+                    return self.list_tenant_shards_json(path).await;
+                }
+            }
+        }
+        Ok(loaded)
+    }
+
+    /// Shim for automated compatibility tests: load tenants from a JSON file instead of database
+    pub(crate) async fn list_tenant_shards_json(
+        &self,
+        path: &Utf8Path,
+    ) -> DatabaseResult<Vec<TenantShardPersistence>> {
+        let bytes = tokio::fs::read(path)
+            .await
+            .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?;
+
+        let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
+            .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
+        for (tenant_id, tenant) in &mut decoded.tenants {
+            // Backward compat: an old attachments.json from before PR #6251, replace
+            // empty strings with proper defaults.
+            if tenant.tenant_id.is_empty() {
+                tenant.tenant_id = tenant_id.to_string();
+                tenant.config = serde_json::to_string(&TenantConfig::default())
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
+            }
+        }
+
+        let tenants: Vec<TenantShardPersistence> = decoded.tenants.into_values().collect();
+
+        // Synchronize database with what is in the JSON file
+        self.insert_tenant_shards(tenants.clone()).await?;
+
+        Ok(tenants)
+    }
+
+    /// For use in testing environments, where we dump out JSON on shutdown.
+    pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
+        let Some(path) = &self.json_path else {
+            anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
+        };
+        tracing::info!("Writing state to {path}...");
+        let tenants = self.list_tenant_shards().await?;
+        let mut tenants_map = HashMap::new();
+        for tsp in tenants {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+
+            tenants_map.insert(tenant_shard_id, tsp);
+        }
+        let json = serde_json::to_string(&JsonPersistence {
+            tenants: tenants_map,
+        })?;
+
+        tokio::fs::write(path, &json).await?;
+        tracing::info!("Wrote {} bytes to {path}...", json.len());
+
+        Ok(())
     }
 
     /// Tenants must be persisted before we schedule them for the first time.  This enables us
@@ -158,24 +260,77 @@ impl Persistence {
     pub(crate) async fn insert_tenant_shards(
         &self,
         shards: Vec<TenantShardPersistence>,
-    ) -> anyhow::Result<()> {
-        let write = {
-            let mut locked = self.state.lock().unwrap();
-            for shard in shards {
-                let tenant_shard_id = TenantShardId {
-                    tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
-                    shard_number: ShardNumber(shard.shard_number as u8),
-                    shard_count: ShardCount(shard.shard_count as u8),
-                };
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            conn.transaction(|conn| -> QueryResult<()> {
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
+                Ok(())
+            })?;
+            Ok(())
+        })
+        .await
+    }
 
-                locked.tenants.insert(tenant_shard_id, shard);
-            }
-            locked.save()
-        };
+    /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
+    /// the tenant from memory on this server.
+    #[allow(unused)]
+    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| -> DatabaseResult<()> {
+            diesel::delete(tenant_shards)
+                .filter(tenant_id.eq(del_tenant_id.to_string()))
+                .execute(conn)?;
 
-        write.commit().await?;
+            Ok(())
+        })
+        .await
+    }
 
-        Ok(())
+    /// When a pageserver invokes the /re-attach API, this function is responsible for doing an efficient
+    /// batched increment of the generations of all tenants whose generation_pageserver is equal to
+    /// the node that called /re-attach.
+    #[tracing::instrument(skip_all, fields(node_id = node_id.0))]
+    pub(crate) async fn re_attach(
+        &self,
+        node_id: NodeId,
+    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
+        use crate::schema::tenant_shards::dsl::*;
+        let updated = self
+            .with_conn(move |conn| {
+                let rows_updated = diesel::update(tenant_shards)
+                    .filter(generation_pageserver.eq(node_id.0 as i64))
+                    .set(generation.eq(generation + 1))
+                    .execute(conn)?;
+
+                tracing::info!("Incremented {} tenants' generations", rows_updated);
+
+                // TODO: UPDATE+SELECT in one query
+
+                let updated = tenant_shards
+                    .filter(generation_pageserver.eq(node_id.0 as i64))
+                    .select(TenantShardPersistence::as_select())
+                    .load(conn)?;
+                Ok(updated)
+            })
+            .await?;
+
+        let mut result = HashMap::new();
+        for tsp in updated {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
+                    .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+            result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
+        }
+
+        Ok(result)
     }
 
     /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
@@ -184,49 +339,48 @@ impl Persistence {
     pub(crate) async fn increment_generation(
         &self,
         tenant_shard_id: TenantShardId,
-        node_id: Option<NodeId>,
+        node_id: NodeId,
     ) -> anyhow::Result<Generation> {
-        let (write, gen) = {
-            let mut locked = self.state.lock().unwrap();
-            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
-                anyhow::bail!("Tried to increment generation of unknown shard");
-            };
+        use crate::schema::tenant_shards::dsl::*;
+        let updated = self
+            .with_conn(move |conn| {
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                    .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                    .set((
+                        generation.eq(generation + 1),
+                        generation_pageserver.eq(node_id.0 as i64),
+                    ))
+                    // TODO: only returning() the generation column
+                    .returning(TenantShardPersistence::as_returning())
+                    .get_result(conn)?;
 
-            // If we're called with a None pageserver, we need only update the generation
-            // record to disassociate it with this pageserver, not actually increment the number, as
-            // the increment is guaranteed to happen the next time this tenant is attached.
-            if node_id.is_some() {
-                shard.generation += 1;
-            }
+                Ok(updated)
+            })
+            .await?;
 
-            shard.generation_pageserver = node_id;
-            let gen = Generation::new(shard.generation);
-            (locked.save(), gen)
-        };
-
-        write.commit().await?;
-        Ok(gen)
+        Ok(Generation::new(updated.generation as u32))
     }
 
-    pub(crate) async fn re_attach(
-        &self,
-        node_id: NodeId,
-    ) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
-        let (write, result) = {
-            let mut result = HashMap::new();
-            let mut locked = self.state.lock().unwrap();
-            for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
-                if shard.generation_pageserver == Some(node_id) {
-                    shard.generation += 1;
-                    result.insert(*tenant_shard_id, Generation::new(shard.generation));
-                }
-            }
+    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_conn(move |conn| {
+            let updated = diesel::update(tenant_shards)
+                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                .set((
+                    generation_pageserver.eq(i64::MAX),
+                    placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
+                ))
+                .execute(conn)?;
 
-            (locked.save(), result)
-        };
+            Ok(updated)
+        })
+        .await?;
 
-        write.commit().await?;
-        Ok(result)
+        Ok(())
     }
 
     // TODO: when we start shard splitting, we must durably mark the tenant so that
@@ -246,7 +400,8 @@ impl Persistence {
 }
 
 /// Parts of [`crate::tenant_state::TenantState`] that are stored durably
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone)]
+#[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
     #[serde(default)]
     pub(crate) tenant_id: String,
@@ -257,16 +412,28 @@ pub(crate) struct TenantShardPersistence {
     #[serde(default)]
     pub(crate) shard_stripe_size: i32,
 
-    // Currently attached pageserver
-    #[serde(rename = "pageserver")]
-    pub(crate) generation_pageserver: Option<NodeId>,
-
     // Latest generation number: next time we attach, increment this
     // and use the incremented number when attaching
-    pub(crate) generation: u32,
+    pub(crate) generation: i32,
+
+    // Currently attached pageserver
+    #[serde(rename = "pageserver")]
+    pub(crate) generation_pageserver: i64,
 
     #[serde(default)]
     pub(crate) placement_policy: String,
     #[serde(default)]
     pub(crate) config: String,
 }
+
+/// Parts of [`crate::node::Node`] that are stored durably
+#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
+#[diesel(table_name = crate::schema::nodes)]
+pub(crate) struct NodePersistence {
+    pub(crate) node_id: i64,
+    pub(crate) scheduling_policy: String,
+    pub(crate) listen_http_addr: String,
+    pub(crate) listen_http_port: i32,
+    pub(crate) listen_pg_addr: String,
+    pub(crate) listen_pg_port: i32,
+}
diff --git a/control_plane/attachment_service/src/reconciler.rs b/control_plane/attachment_service/src/reconciler.rs
index b08339b3b4..d7f4c0406a 100644
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -296,7 +296,7 @@ impl Reconciler {
         // Increment generation before attaching to new pageserver
         self.generation = self
             .persistence
-            .increment_generation(self.tenant_shard_id, Some(dest_ps_id))
+            .increment_generation(self.tenant_shard_id, dest_ps_id)
             .await?;
 
         let dest_conf = build_location_config(
@@ -395,7 +395,7 @@ impl Reconciler {
                     // as locations with unknown (None) observed state.
                     self.generation = self
                         .persistence
-                        .increment_generation(self.tenant_shard_id, Some(node_id))
+                        .increment_generation(self.tenant_shard_id, node_id)
                         .await?;
                     wanted_conf.generation = self.generation.into();
                     tracing::info!("Observed configuration requires update.");
diff --git a/control_plane/attachment_service/src/schema.rs b/control_plane/attachment_service/src/schema.rs
new file mode 100644
index 0000000000..de80fc8f64
--- /dev/null
+++ b/control_plane/attachment_service/src/schema.rs
@@ -0,0 +1,27 @@
+// @generated automatically by Diesel CLI.
+
+diesel::table! {
+    nodes (node_id) {
+        node_id -> Int8,
+        scheduling_policy -> Varchar,
+        listen_http_addr -> Varchar,
+        listen_http_port -> Int4,
+        listen_pg_addr -> Varchar,
+        listen_pg_port -> Int4,
+    }
+}
+
+diesel::table! {
+    tenant_shards (tenant_id, shard_number, shard_count) {
+        tenant_id -> Varchar,
+        shard_number -> Int4,
+        shard_count -> Int4,
+        shard_stripe_size -> Int4,
+        generation -> Int4,
+        generation_pageserver -> Int8,
+        placement_policy -> Varchar,
+        config -> Text,
+    }
+}
+
+diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);
diff --git a/control_plane/attachment_service/src/service.rs b/control_plane/attachment_service/src/service.rs
index 5999d48fd9..ec56dc8ad4 100644
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -11,6 +11,7 @@ use control_plane::attachment_service::{
     TenantCreateResponseShard, TenantLocateResponse, TenantLocateResponseShard,
     TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
+use diesel::result::DatabaseErrorKind;
 use hyper::StatusCode;
 use pageserver_api::{
     control_api::{
@@ -26,6 +27,7 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api;
 use utils::{
+    completion::Barrier,
     generation::Generation,
     http::error::ApiError,
     id::{NodeId, TenantId},
@@ -35,7 +37,7 @@ use utils::{
 use crate::{
     compute_hook::ComputeHook,
     node::Node,
-    persistence::{Persistence, TenantShardPersistence},
+    persistence::{DatabaseError, Persistence, TenantShardPersistence},
     scheduler::Scheduler,
     tenant_state::{
         IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -46,6 +48,10 @@ use crate::{
 
 const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
 
+/// How long [`Service::startup_reconcile`] is allowed to take before it should give
+/// up on unresponsive pageservers and proceed.
+pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
+
 // Top level state available to all HTTP handlers
 struct ServiceState {
     tenants: BTreeMap<TenantShardId, TenantState>,
@@ -79,10 +85,27 @@ pub struct Config {
     pub jwt_token: Option<String>,
 }
 
+impl From<DatabaseError> for ApiError {
+    fn from(err: DatabaseError) -> ApiError {
+        match err {
+            DatabaseError::Query(e) => ApiError::InternalServerError(e.into()),
+            // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503.
+            DatabaseError::Connection(_e) => ApiError::ShuttingDown,
+            DatabaseError::Logical(reason) => {
+                ApiError::InternalServerError(anyhow::anyhow!(reason))
+            }
+        }
+    }
+}
+
 pub struct Service {
     inner: Arc<std::sync::RwLock<ServiceState>>,
     config: Config,
     persistence: Arc<Persistence>,
+
+    /// This waits for initial reconciliation with pageservers to complete.  Until this barrier
+    /// passes, it isn't safe to do any actions that mutate tenants.
+    pub(crate) startup_complete: Barrier,
 }
 
 impl From<ReconcileWaitError> for ApiError {
@@ -96,77 +119,32 @@ impl From<ReconcileWaitError> for ApiError {
 }
 
 impl Service {
-    pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
-        let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
-
-        tracing::info!("Loading nodes from database...");
-        let mut nodes = persistence.list_nodes().await?;
-        tracing::info!("Loaded {} nodes from database.", nodes.len());
-
-        tracing::info!("Loading shards from database...");
-        let tenant_shard_persistence = persistence.list_tenant_shards().await?;
-        tracing::info!(
-            "Loaded {} shards from database.",
-            tenant_shard_persistence.len()
-        );
-
-        let mut tenants = BTreeMap::new();
-
-        for tsp in tenant_shard_persistence {
-            let tenant_shard_id = TenantShardId {
-                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
-                shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
-            };
-            let shard_identity = if tsp.shard_count == 0 {
-                ShardIdentity::unsharded()
-            } else {
-                ShardIdentity::new(
-                    ShardNumber(tsp.shard_number as u8),
-                    ShardCount(tsp.shard_count as u8),
-                    ShardStripeSize(tsp.shard_stripe_size as u32),
-                )?
-            };
-            let new_tenant = TenantState {
-                tenant_shard_id,
-                shard: shard_identity,
-                sequence: Sequence::initial(),
-                // Note that we load generation, but don't care about generation_pageserver.  We will either end up finding
-                // our existing attached location and it will match generation_pageserver, or we will attach somewhere new
-                // and update generation_pageserver in the process.
-                generation: Generation::new(tsp.generation),
-                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
-                intent: IntentState::new(),
-                observed: ObservedState::new(),
-                config: serde_json::from_str(&tsp.config).unwrap(),
-                reconciler: None,
-                waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
-                last_error: Arc::default(),
-            };
-
-            tenants.insert(tenant_shard_id, new_tenant);
-        }
+    pub fn get_config(&self) -> &Config {
+        &self.config
+    }
 
+    /// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
+    /// until this is done.
+    async fn startup_reconcile(&self) {
         // For all tenant shards, a vector of observed states on nodes (where None means
         // indeterminate, same as in [`ObservedStateLocation`])
         let mut observed = HashMap::new();
 
+        let nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };
+
         // TODO: issue these requests concurrently
-        for node in &mut nodes {
-            let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
+        for node in nodes.values() {
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
 
             tracing::info!("Scanning shards on node {}...", node.id);
             match client.list_location_config().await {
                 Err(e) => {
                     tracing::warn!("Could not contact pageserver {} ({e})", node.id);
-                    // TODO: be more tolerant, apply a generous 5-10 second timeout
-                    // TODO: setting a node to Offline is a dramatic thing to do, and can
-                    // prevent neon_local from starting up (it starts this service before
-                    // any pageservers are  running).  It may make sense to give nodes
-                    // a Pending state to accomodate this situation, and allow (but deprioritize)
-                    // scheduling on Pending nodes.
-                    //node.availability = NodeAvailability::Offline;
+                    // TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
+                    // pageserver is being restarted at the same time as we are
                 }
                 Ok(listing) => {
                     tracing::info!(
@@ -174,7 +152,6 @@ impl Service {
                         listing.tenant_shards.len(),
                         node.id
                     );
-                    node.availability = NodeAvailability::Active;
 
                     for (tenant_shard_id, conf_opt) in listing.tenant_shards {
                         observed.insert(tenant_shard_id, (node.id, conf_opt));
@@ -186,41 +163,46 @@ impl Service {
         let mut cleanup = Vec::new();
 
         // Populate intent and observed states for all tenants, based on reported state on pageservers
-        for (tenant_shard_id, (node_id, observed_loc)) in observed {
-            let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else {
-                cleanup.push((tenant_shard_id, node_id));
-                continue;
-            };
+        let shard_count = {
+            let mut locked = self.inner.write().unwrap();
+            for (tenant_shard_id, (node_id, observed_loc)) in observed {
+                let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
+                    cleanup.push((tenant_shard_id, node_id));
+                    continue;
+                };
 
-            tenant_state
-                .observed
-                .locations
-                .insert(node_id, ObservedStateLocation { conf: observed_loc });
-        }
-
-        // State of nodes is now frozen, transform to a HashMap.
-        let mut nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
-
-        // Populate each tenant's intent state
-        let mut scheduler = Scheduler::new(&tenants, &nodes);
-        for (tenant_shard_id, tenant_state) in tenants.iter_mut() {
-            tenant_state.intent_from_observed();
-            if let Err(e) = tenant_state.schedule(&mut scheduler) {
-                // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
-                // not enough pageservers are available.  The tenant may well still be available
-                // to clients.
-                tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
+                tenant_state
+                    .observed
+                    .locations
+                    .insert(node_id, ObservedStateLocation { conf: observed_loc });
             }
-        }
+
+            // Populate each tenant's intent state
+            let mut scheduler = Scheduler::new(&locked.tenants, &nodes);
+            for (tenant_shard_id, tenant_state) in locked.tenants.iter_mut() {
+                tenant_state.intent_from_observed();
+                if let Err(e) = tenant_state.schedule(&mut scheduler) {
+                    // Non-fatal error: we are unable to properly schedule the tenant, perhaps because
+                    // not enough pageservers are available.  The tenant may well still be available
+                    // to clients.
+                    tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}");
+                }
+            }
+
+            locked.tenants.len()
+        };
+
+        // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
+        // generation_pageserver in the database.
 
         // Clean up any tenants that were found on pageservers but are not known to us.
         for (tenant_shard_id, node_id) in cleanup {
             // A node reported a tenant_shard_id which is unknown to us: detach it.
             let node = nodes
-                .get_mut(&node_id)
+                .get(&node_id)
                 .expect("Always exists: only known nodes are scanned");
 
-            let client = mgmt_api::Client::new(node.base_url(), config.jwt_token.as_deref());
+            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
             match client
                 .location_config(
                     tenant_shard_id,
@@ -252,13 +234,80 @@ impl Service {
             }
         }
 
-        let shard_count = tenants.len();
+        // Finally, now that the service is up and running, launch reconcile operations for any tenants
+        // which require it: under normal circumstances this should only include tenants that were in some
+        // transient state before we restarted.
+        let reconcile_tasks = self.reconcile_all();
+        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+    }
+
+    pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
+        let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
+
+        tracing::info!("Loading nodes from database...");
+        let nodes = persistence.list_nodes().await?;
+        let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
+        tracing::info!("Loaded {} nodes from database.", nodes.len());
+
+        tracing::info!("Loading shards from database...");
+        let tenant_shard_persistence = persistence.list_tenant_shards().await?;
+        tracing::info!(
+            "Loaded {} shards from database.",
+            tenant_shard_persistence.len()
+        );
+
+        let mut tenants = BTreeMap::new();
+
+        for tsp in tenant_shard_persistence {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount(tsp.shard_count as u8),
+            };
+            let shard_identity = if tsp.shard_count == 0 {
+                ShardIdentity::unsharded()
+            } else {
+                ShardIdentity::new(
+                    ShardNumber(tsp.shard_number as u8),
+                    ShardCount(tsp.shard_count as u8),
+                    ShardStripeSize(tsp.shard_stripe_size as u32),
+                )?
+            };
+
+            // We will populate intent properly later in [`Self::startup_reconcile`]; for now, populate
+            // it with what we can infer: the node for which a generation was most recently issued.
+            let mut intent = IntentState::new();
+            if tsp.generation_pageserver != i64::MAX {
+                intent.attached = Some(NodeId(tsp.generation_pageserver as u64))
+            }
+
+            let new_tenant = TenantState {
+                tenant_shard_id,
+                shard: shard_identity,
+                sequence: Sequence::initial(),
+                generation: Generation::new(tsp.generation as u32),
+                policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
+                intent,
+                observed: ObservedState::new(),
+                config: serde_json::from_str(&tsp.config).unwrap(),
+                reconciler: None,
+                waiter: Arc::new(SeqWait::new(Sequence::initial())),
+                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
+                last_error: Arc::default(),
+            };
+
+            tenants.insert(tenant_shard_id, new_tenant);
+        }
+
+        let (startup_completion, startup_complete) = utils::completion::channel();
+
         let this = Arc::new(Self {
             inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
                 result_tx, nodes, tenants,
             ))),
             config,
             persistence,
+            startup_complete,
         });
 
         let result_task_this = this.clone();
@@ -316,11 +365,13 @@ impl Service {
             }
         });
 
-        // Finally, now that the service is up and running, launch reconcile operations for any tenants
-        // which require it: under normal circumstances this should only include tenants that were in some
-        // transient state before we restarted.
-        let reconcile_tasks = this.reconcile_all();
-        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+        let startup_reconcile_this = this.clone();
+        tokio::task::spawn(async move {
+            // Block the [`Service::startup_complete`] barrier until we're done
+            let _completion = startup_completion;
+
+            startup_reconcile_this.startup_reconcile().await
+        });
 
         Ok(this)
     }
@@ -336,7 +387,6 @@ impl Service {
             let locked = self.inner.write().unwrap();
             !locked.tenants.contains_key(&attach_req.tenant_shard_id)
         };
-
         if insert {
             let tsp = TenantShardPersistence {
                 tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
@@ -344,31 +394,49 @@ impl Service {
                 shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
                 shard_stripe_size: 0,
                 generation: 0,
-                generation_pageserver: None,
+                generation_pageserver: i64::MAX,
                 placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
                 config: serde_json::to_string(&TenantConfig::default()).unwrap(),
             };
 
-            self.persistence.insert_tenant_shards(vec![tsp]).await?;
+            match self.persistence.insert_tenant_shards(vec![tsp]).await {
+                Err(e) => match e {
+                    DatabaseError::Query(diesel::result::Error::DatabaseError(
+                        DatabaseErrorKind::UniqueViolation,
+                        _,
+                    )) => {
+                        tracing::info!(
+                            "Raced with another request to insert tenant {}",
+                            attach_req.tenant_shard_id
+                        )
+                    }
+                    _ => return Err(e.into()),
+                },
+                Ok(()) => {
+                    tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
 
-            let mut locked = self.inner.write().unwrap();
-            locked.tenants.insert(
-                attach_req.tenant_shard_id,
-                TenantState::new(
-                    attach_req.tenant_shard_id,
-                    ShardIdentity::unsharded(),
-                    PlacementPolicy::Single,
-                ),
-            );
+                    let mut locked = self.inner.write().unwrap();
+                    locked.tenants.insert(
+                        attach_req.tenant_shard_id,
+                        TenantState::new(
+                            attach_req.tenant_shard_id,
+                            ShardIdentity::unsharded(),
+                            PlacementPolicy::Single,
+                        ),
+                    );
+                    tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
+                }
+            }
         }
 
-        let new_generation = if attach_req.node_id.is_some() {
+        let new_generation = if let Some(req_node_id) = attach_req.node_id {
             Some(
                 self.persistence
-                    .increment_generation(attach_req.tenant_shard_id, attach_req.node_id)
+                    .increment_generation(attach_req.tenant_shard_id, req_node_id)
                     .await?,
             )
         } else {
+            self.persistence.detach(attach_req.tenant_shard_id).await?;
             None
         };
 
@@ -380,6 +448,11 @@ impl Service {
 
         if let Some(new_generation) = new_generation {
             tenant_state.generation = new_generation;
+        } else {
+            // This is a detach notification.  We must update placement policy to avoid re-attaching
+            // during background scheduling/reconciliation, or during attachment service restart.
+            assert!(attach_req.node_id.is_none());
+            tenant_state.policy = PlacementPolicy::Detached;
         }
 
         if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
@@ -407,6 +480,7 @@ impl Service {
             "attach_hook: tenant {} set generation {:?}, pageserver {}",
             attach_req.tenant_shard_id,
             tenant_state.generation,
+            // TODO: this is an odd number of 0xf's
             attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
         );
 
@@ -499,6 +573,14 @@ impl Service {
                     id: req_tenant.id,
                     valid,
                 });
+            } else {
+                // After tenant deletion, we may approve any validation.  This avoids
+                // spurious warnings on the pageserver if it has pending LSN updates
+                // at the point a deletion happens.
+                response.tenants.push(ValidateResponseTenant {
+                    id: req_tenant.id,
+                    valid: true,
+                });
             }
         }
         response
@@ -554,7 +636,7 @@ impl Service {
                 shard_count: tenant_shard_id.shard_count.0 as i32,
                 shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
                 generation: 0,
-                generation_pageserver: None,
+                generation_pageserver: i64::MAX,
                 placement_policy: serde_json::to_string(&placement_policy).unwrap(),
                 config: serde_json::to_string(&create_req.config).unwrap(),
             })
@@ -868,7 +950,6 @@ impl Service {
             } else {
                 let old_attached = shard.intent.attached;
 
-                shard.intent.attached = Some(migrate_req.node_id);
                 match shard.policy {
                     PlacementPolicy::Single => {
                         shard.intent.secondary.clear();
@@ -882,7 +963,13 @@ impl Service {
                             shard.intent.secondary.push(old_attached);
                         }
                     }
+                    PlacementPolicy::Detached => {
+                        return Err(ApiError::BadRequest(anyhow::anyhow!(
+                            "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
+                        )))
+                    }
                 }
+                shard.intent.attached = Some(migrate_req.node_id);
 
                 tracing::info!("Migrating: new intent {:?}", shard.intent);
                 shard.sequence = shard.sequence.next();
@@ -955,10 +1042,7 @@ impl Service {
             availability: NodeAvailability::Active,
         };
         // TODO: idempotency if the node already exists in the database
-        self.persistence
-            .insert_node(&new_node)
-            .await
-            .map_err(ApiError::InternalServerError)?;
+        self.persistence.insert_node(&new_node).await?;
 
         let mut locked = self.inner.write().unwrap();
         let mut new_nodes = (*locked.nodes).clone();
diff --git a/control_plane/attachment_service/src/tenant_state.rs b/control_plane/attachment_service/src/tenant_state.rs
index a907628eff..5290197d84 100644
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -312,6 +312,18 @@ impl TenantState {
                     modified = true;
                 }
             }
+            Detached => {
+                // Should have no attached or secondary pageservers
+                if self.intent.attached.is_some() {
+                    self.intent.attached = None;
+                    modified = true;
+                }
+
+                if !self.intent.secondary.is_empty() {
+                    self.intent.secondary.clear();
+                    modified = true;
+                }
+            }
         }
 
         if modified {
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs
index 0a353d8b12..6602aa9a73 100644
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,5 +1,11 @@
 use crate::{background_process, local_env::LocalEnv};
-use camino::Utf8PathBuf;
+use camino::{Utf8Path, Utf8PathBuf};
+use diesel::{
+    backend::Backend,
+    query_builder::{AstPass, QueryFragment, QueryId},
+    Connection, PgConnection, QueryResult, RunQueryDsl,
+};
+use diesel_migrations::{HarnessWithOutput, MigrationHarness};
 use hyper::Method;
 use pageserver_api::{
     models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
@@ -7,9 +13,9 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
-use postgres_connection::parse_host_port;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{path::PathBuf, process::Child, str::FromStr};
+use std::{env, str::FromStr};
+use tokio::process::Command;
 use tracing::instrument;
 use utils::{
     auth::{Claims, Scope},
@@ -19,14 +25,17 @@ use utils::{
 pub struct AttachmentService {
     env: LocalEnv,
     listen: String,
-    path: PathBuf,
+    path: Utf8PathBuf,
     jwt_token: Option<String>,
     public_key_path: Option<Utf8PathBuf>,
+    postgres_port: u16,
     client: reqwest::Client,
 }
 
 const COMMAND: &str = "attachment_service";
 
+const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
     pub tenant_shard_id: TenantShardId,
@@ -169,7 +178,9 @@ pub struct TenantShardMigrateResponse {}
 
 impl AttachmentService {
     pub fn from_env(env: &LocalEnv) -> Self {
-        let path = env.base_data_dir.join("attachments.json");
+        let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
+            .unwrap()
+            .join("attachments.json");
 
         // Makes no sense to construct this if pageservers aren't going to use it: assume
         // pageservers have control plane API set
@@ -181,6 +192,13 @@ impl AttachmentService {
             listen_url.port().unwrap()
         );
 
+        // Convention: NeonEnv in python tests reserves the next port after the control_plane_api
+        // port, for use by our captive postgres.
+        let postgres_port = listen_url
+            .port()
+            .expect("Control plane API setting should always have a port")
+            + 1;
+
         // Assume all pageservers have symmetric auth configuration: this service
         // expects to use one JWT token to talk to all of them.
         let ps_conf = env
@@ -209,6 +227,7 @@ impl AttachmentService {
             listen,
             jwt_token,
             public_key_path,
+            postgres_port,
             client: reqwest::ClientBuilder::new()
                 .build()
                 .expect("Failed to construct http client"),
@@ -220,13 +239,214 @@ impl AttachmentService {
             .expect("non-Unicode path")
     }
 
-    pub async fn start(&self) -> anyhow::Result<Child> {
-        let path_str = self.path.to_string_lossy();
+    /// Location of the PID file for the postgres instance that stores attachment
+    /// service state: `attachment_service_postgres.pid` inside the base data dir.
+    fn postgres_pid_file(&self) -> Utf8PathBuf {
+        let pid_path = self
+            .env
+            .base_data_dir
+            .join("attachment_service_postgres.pid");
+        Utf8PathBuf::from_path_buf(pid_path).expect("non-Unicode path")
+    }
 
-        let mut args = vec!["-l", &self.listen, "-p", &path_str]
-            .into_iter()
-            .map(|s| s.to_string())
-            .collect::<Vec<_>>();
+    /// Locate the root of the Neon source tree, which is where the database migrations live.
+    async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
+        // We assume that either the current working directory or our binary is inside the source
+        // tree. The former is usually true for automated test runners, the latter for developer
+        // workstations. Often both are true, which is fine.
+        let candidate_start_points = [
+            // Current working directory (the unwrap panics on a non-UTF-8 path)
+            Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
+            // Directory containing the binary we're running inside
+            Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
+        ];
+
+        // Candidates are tried in order: the first one with a matching ancestor wins.
+        for start_point in &candidate_start_points {
+            // Walk upwards from the start point, one ancestor directory at a time
+            for path in start_point.ancestors() {
+                // A crude approximation: the root of the source tree is whatever contains a "control_plane"
+                // subdirectory.
+                let control_plane = path.join("control_plane");
+                if tokio::fs::try_exists(&control_plane).await? {
+                    return Ok(path.to_owned());
+                }
+            }
+        }
+
+        // No ancestor of any candidate contained a "control_plane" entry
+        Err(anyhow::anyhow!(
+            "Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
+        ))
+    }
+
+    /// Locate the directory holding postgres binaries such as `initdb` and `pg_ctl`.
+    ///
+    /// ATTACHMENT_SERVICE_POSTGRES_VERSION is tried first, then 15, then 14; the first bin
+    /// directory that exists wins.  The fallbacks matter for automated tests such as
+    /// `test_remote_extensions`, where only one version is available in pg_distrib_dir.
+    pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
+        let versions_by_preference = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
+
+        for pg_version in versions_by_preference {
+            let bin_dir = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(pg_version)?).unwrap();
+            if tokio::fs::try_exists(&bin_dir).await? {
+                return Ok(bin_dir);
+            }
+        }
+
+        // None of the candidate versions had a bin directory on disk.
+        Err(anyhow::anyhow!(
+            "Postgres binaries not found in {}",
+            self.env.pg_distrib_dir.display()
+        ))
+    }
+
+    /// Run `pg_isready` against our postgres on localhost; `Ok(true)` iff it exits successfully.
+    async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
+        let port = self.postgres_port.to_string();
+        let mut probe = Command::new(pg_bin_dir.join("pg_isready"));
+        probe.args(["-h", "localhost", "-p", port.as_str()]);
+        let status = probe.spawn()?.wait().await?;
+        Ok(status.success())
+    }
+
+    /// Create our database if it doesn't exist, and run any pending migrations.
+    ///
+    /// Equivalent to `diesel setup` in the diesel CLI, implemented by hand so that developers
+    /// who just want to run `cargo neon_local` don't need diesel-cli installed.  If connecting
+    /// to the `attachment_service` database fails, it is created via the `postgres` database;
+    /// migrations are then loaded from `control_plane/attachment_service/migrations` in the
+    /// source tree.  Returns the database url.
+    pub async fn setup_database(&self) -> anyhow::Result<String> {
+        let database_url = format!(
+            "postgresql://localhost:{}/attachment_service",
+            self.postgres_port
+        );
+        println!("Running attachment service database setup...");
+        fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
+            let base = ::url::Url::parse(database_url).unwrap();
+            let database = base.path_segments().unwrap().last().unwrap().to_owned();
+            let mut new_url = base.join(default_database).unwrap();
+            new_url.set_query(base.query());
+            (database, new_url.into())
+        }
+
+        #[derive(Debug, Clone)]
+        pub struct CreateDatabaseStatement {
+            db_name: String,
+        }
+
+        impl CreateDatabaseStatement {
+            pub fn new(db_name: &str) -> Self {
+                CreateDatabaseStatement {
+                    db_name: db_name.to_owned(),
+                }
+            }
+        }
+
+        impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
+            fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
+                out.push_sql("CREATE DATABASE ");
+                out.push_identifier(&self.db_name)?;
+                Ok(())
+            }
+        }
+
+        impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
+
+        impl QueryId for CreateDatabaseStatement {
+            type QueryId = ();
+
+            const HAS_STATIC_QUERY_ID: bool = false;
+        }
+        if PgConnection::establish(&database_url).is_err() {
+            let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
+            println!("Creating database: {database}");
+            let mut conn = PgConnection::establish(&postgres_url)?;
+            CreateDatabaseStatement::new(&database).execute(&mut conn)?;
+        }
+        let mut conn = PgConnection::establish(&database_url)?;
+
+        let migrations_dir = self
+            .find_source_root()
+            .await?
+            .join("control_plane/attachment_service/migrations");
+
+        let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
+        println!("Running migrations in {}", migrations.path().display());
+        HarnessWithOutput::write_to_stdout(&mut conn)
+            .run_pending_migrations(migrations)
+            .map(|_| ())
+            .map_err(|e| anyhow::anyhow!(e))?;
+
+        println!("Migrations complete");
+
+        Ok(database_url)
+    }
+
+    pub async fn start(&self) -> anyhow::Result<()> {
+        // Start a vanilla Postgres process used by the attachment service for persistence.
+        let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
+            .unwrap()
+            .join("attachment_service_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+        let pg_log_path = pg_data_path.join("postgres.log");
+
+        if !tokio::fs::try_exists(&pg_data_path).await? {
+            // Initialize empty database
+            let initdb_path = pg_bin_dir.join("initdb");
+            let mut child = Command::new(&initdb_path)
+                .args(["-D", pg_data_path.as_ref()])
+                .spawn()
+                .expect("Failed to spawn initdb");
+            let status = child.wait().await?;
+            if !status.success() {
+                anyhow::bail!("initdb failed with status {status}");
+            }
+
+            tokio::fs::write(
+                &pg_data_path.join("postgresql.conf"),
+                format!("port = {}", self.postgres_port),
+            )
+            .await?;
+        };
+
+        println!("Starting attachment service database...");
+        let db_start_args = [
+            "-w",
+            "-D",
+            pg_data_path.as_ref(),
+            "-l",
+            pg_log_path.as_ref(),
+            "start",
+        ];
+
+        background_process::start_process(
+            "attachment_service_db",
+            &self.env.base_data_dir,
+            pg_bin_dir.join("pg_ctl").as_std_path(),
+            db_start_args,
+            [],
+            background_process::InitialPidFile::Create(self.postgres_pid_file()),
+            || self.pg_isready(&pg_bin_dir),
+        )
+        .await?;
+
+        // Run migrations on every startup, in case something changed.
+        let database_url = self.setup_database().await?;
+
+        let mut args = vec![
+            "-l",
+            &self.listen,
+            "-p",
+            self.path.as_ref(),
+            "--database-url",
+            &database_url,
+        ]
+        .into_iter()
+        .map(|s| s.to_string())
+        .collect::<Vec<_>>();
         if let Some(jwt_token) = &self.jwt_token {
             args.push(format!("--jwt-token={jwt_token}"));
         }
@@ -235,7 +455,7 @@ impl AttachmentService {
             args.push(format!("--public-key={public_key_path}"));
         }
 
-        let result = background_process::start_process(
+        background_process::start_process(
             COMMAND,
             &self.env.base_data_dir,
             &self.env.attachment_service_bin(),
@@ -252,29 +472,46 @@ impl AttachmentService {
                 }
             },
         )
-        .await;
+        .await?;
 
-        for ps_conf in &self.env.pageservers {
-            let (pg_host, pg_port) =
-                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            self.node_register(NodeRegisterRequest {
-                node_id: ps_conf.id,
-                listen_pg_addr: pg_host.to_string(),
-                listen_pg_port: pg_port.unwrap_or(5432),
-                listen_http_addr: http_host.to_string(),
-                listen_http_port: http_port.unwrap_or(80),
-            })
+        Ok(())
+    }
+
+    pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
+
+        let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
+        let pg_bin_dir = self.get_pg_bin_dir().await?;
+
+        println!("Stopping attachment service database...");
+        let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
+        let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
+            .args(pg_stop_args)
+            .spawn()?
+            .wait()
             .await?;
+        if !stop_status.success() {
+            let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
+            let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
+                .args(pg_status_args)
+                .spawn()?
+                .wait()
+                .await?;
+
+            // pg_ctl status returns this exit code if postgres is not running: in this case it is
+            // fine that stop failed.  Otherwise it is an error that stop failed.
+            const PG_STATUS_NOT_RUNNING: i32 = 3;
+            if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
+                println!("Attachment service data base is already stopped");
+                return Ok(());
+            } else {
+                anyhow::bail!("Failed to stop attachment service database: {stop_status}")
+            }
         }
 
-        result
+        Ok(())
     }
 
-    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        background_process::stop_process(immediate, COMMAND, &self.pid_file())
-    }
     /// Simple HTTP request wrapper for calling into attachment service
     async fn dispatch<RQ, RS>(
         &self,
@@ -356,7 +593,7 @@ impl AttachmentService {
         &self,
         req: TenantCreateRequest,
     ) -> anyhow::Result<TenantCreateResponse> {
-        self.dispatch(Method::POST, "tenant".to_string(), Some(req))
+        self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req))
             .await
     }
 
@@ -413,7 +650,7 @@ impl AttachmentService {
     ) -> anyhow::Result<TimelineInfo> {
         self.dispatch(
             Method::POST,
-            format!("tenant/{tenant_id}/timeline"),
+            format!("v1/tenant/{tenant_id}/timeline"),
             Some(req),
         )
         .await
diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs
index 20fa3af9b8..3ffb8734d0 100644
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -17,7 +17,7 @@ use std::io::Write;
 use std::os::unix::prelude::AsRawFd;
 use std::os::unix::process::CommandExt;
 use std::path::Path;
-use std::process::{Child, Command};
+use std::process::Command;
 use std::time::Duration;
 use std::{fs, io, thread};
 
@@ -60,7 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
     envs: EI,
     initial_pid_file: InitialPidFile,
     process_status_check: F,
-) -> anyhow::Result<Child>
+) -> anyhow::Result<()>
 where
     F: Fn() -> Fut,
     Fut: std::future::Future<Output = anyhow::Result<bool>>,
@@ -98,7 +98,7 @@ where
         InitialPidFile::Expect(path) => path,
     };
 
-    let mut spawned_process = filled_cmd.spawn().with_context(|| {
+    let spawned_process = filled_cmd.spawn().with_context(|| {
         format!("Could not spawn {process_name}, see console output and log files for details.")
     })?;
     let pid = spawned_process.id();
@@ -106,12 +106,26 @@ where
         i32::try_from(pid)
             .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
     );
+    // set up a scopeguard to kill & wait for the child in case we panic or bail below
+    let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
+        println!("SIGKILL & wait the started process");
+        (|| {
+            // TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e.g., walredo).
+            spawned_process.kill().context("SIGKILL child")?;
+            spawned_process.wait().context("wait() for child process")?;
+            anyhow::Ok(())
+        })()
+        .with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
+        .unwrap();
+    });
 
     for retries in 0..RETRIES {
         match process_started(pid, pid_file_to_check, &process_status_check).await {
             Ok(true) => {
-                println!("\n{process_name} started, pid: {pid}");
-                return Ok(spawned_process);
+                println!("\n{process_name} started and passed status check, pid: {pid}");
+                // leak the child process, it'll outlive this neon_local invocation
+                drop(scopeguard::ScopeGuard::into_inner(spawned_process));
+                return Ok(());
             }
             Ok(false) => {
                 if retries == NOTICE_AFTER_RETRIES {
@@ -126,16 +140,15 @@ where
                 thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
             }
             Err(e) => {
-                println!("{process_name} failed to start: {e:#}");
-                if let Err(e) = spawned_process.kill() {
-                    println!("Could not stop {process_name} subprocess: {e:#}")
-                };
+                println!("error starting process {process_name:?}: {e:#}");
                 return Err(e);
             }
         }
     }
     println!();
-    anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
+    anyhow::bail!(
+        "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
+    );
 }
 
 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 279c47398f..a5242e3dc7 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -135,7 +135,7 @@ fn main() -> Result<()> {
             "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
             "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
             "start" => rt.block_on(handle_start_all(sub_args, &env)),
-            "stop" => handle_stop_all(sub_args, &env),
+            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
             "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
             "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
             "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
@@ -1056,8 +1056,9 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     match sub_match.subcommand() {
         Some(("start", subcommand_args)) => {
+            let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
             if let Err(e) = get_pageserver(env, subcommand_args)?
-                .start(&pageserver_config_overrides(subcommand_args))
+                .start(&pageserver_config_overrides(subcommand_args), *register)
                 .await
             {
                 eprintln!("pageserver start failed: {e}");
@@ -1086,24 +1087,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
             }
 
             if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args))
-                .await
-            {
-                eprintln!("pageserver start failed: {e}");
-                exit(1);
-            }
-        }
-
-        Some(("migrate", subcommand_args)) => {
-            let pageserver = get_pageserver(env, subcommand_args)?;
-            //TODO what shutdown strategy should we use here?
-            if let Err(e) = pageserver.stop(false) {
-                eprintln!("pageserver stop failed: {}", e);
-                exit(1);
-            }
-
-            if let Err(e) = pageserver
-                .start(&pageserver_config_overrides(subcommand_args))
+                .start(&pageserver_config_overrides(subcommand_args), false)
                 .await
             {
                 eprintln!("pageserver start failed: {e}");
@@ -1161,7 +1145,7 @@ async fn handle_attachment_service(
                 .map(|s| s.as_str())
                 == Some("immediate");
 
-            if let Err(e) = svc.stop(immediate) {
+            if let Err(e) = svc.stop(immediate).await {
                 eprintln!("stop failed: {}", e);
                 exit(1);
             }
@@ -1257,7 +1241,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
         let attachment_service = AttachmentService::from_env(env);
         if let Err(e) = attachment_service.start().await {
             eprintln!("attachment_service start failed: {:#}", e);
-            try_stop_all(env, true);
+            try_stop_all(env, true).await;
             exit(1);
         }
     }
@@ -1265,11 +1249,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
     for ps_conf in &env.pageservers {
         let pageserver = PageServerNode::from_env(env, ps_conf);
         if let Err(e) = pageserver
-            .start(&pageserver_config_overrides(sub_match))
+            .start(&pageserver_config_overrides(sub_match), true)
             .await
         {
             eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
-            try_stop_all(env, true);
+            try_stop_all(env, true).await;
             exit(1);
         }
     }
@@ -1278,23 +1262,23 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
         let safekeeper = SafekeeperNode::from_env(env, node);
         if let Err(e) = safekeeper.start(vec![]).await {
             eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
-            try_stop_all(env, false);
+            try_stop_all(env, false).await;
             exit(1);
         }
     }
     Ok(())
 }
 
-fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
     let immediate =
         sub_match.get_one::<String>("stop-mode").map(|s| s.as_str()) == Some("immediate");
 
-    try_stop_all(env, immediate);
+    try_stop_all(env, immediate).await;
 
     Ok(())
 }
 
-fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
+async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
     // Stop all endpoints
     match ComputeControlPlane::load(env.clone()) {
         Ok(cplane) => {
@@ -1329,7 +1313,7 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
 
     if env.control_plane_api.is_some() {
         let attachment_service = AttachmentService::from_env(env);
-        if let Err(e) = attachment_service.stop(immediate) {
+        if let Err(e) = attachment_service.stop(immediate).await {
             eprintln!("attachment service stop failed: {e:#}");
         }
     }
@@ -1549,7 +1533,11 @@ fn cli() -> Command {
                 .subcommand(Command::new("status"))
                 .subcommand(Command::new("start")
                     .about("Start local pageserver")
-                    .arg(pageserver_config_args.clone())
+                    .arg(pageserver_config_args.clone()).arg(Arg::new("register")
+                    .long("register")
+                    .default_value("true").required(false)
+                    .value_parser(value_parser!(bool))
+                    .value_name("register"))
                 )
                 .subcommand(Command::new("stop")
                     .about("Stop local pageserver")
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 43f8ea3b43..dcad22b992 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -57,7 +57,7 @@ use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;
 
 use compute_api::responses::{ComputeState, ComputeStatus};
-use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
+use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
 
 // contents of a endpoint.json file
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
@@ -70,6 +70,7 @@ pub struct EndpointConf {
     http_port: u16,
     pg_version: u32,
     skip_pg_catalog_updates: bool,
+    features: Vec<ComputeFeature>,
 }
 
 //
@@ -140,6 +141,7 @@ impl ComputeControlPlane {
             // with this we basically test a case of waking up an idle compute, where
             // we also skip catalog updates in the cloud.
             skip_pg_catalog_updates: true,
+            features: vec![],
         });
 
         ep.create_endpoint_dir()?;
@@ -154,6 +156,7 @@ impl ComputeControlPlane {
                 pg_port,
                 pg_version,
                 skip_pg_catalog_updates: true,
+                features: vec![],
             })?,
         )?;
         std::fs::write(
@@ -215,6 +218,9 @@ pub struct Endpoint {
 
     // Optimizations
     skip_pg_catalog_updates: bool,
+
+    // Feature flags
+    features: Vec<ComputeFeature>,
 }
 
 impl Endpoint {
@@ -244,6 +250,7 @@ impl Endpoint {
             tenant_id: conf.tenant_id,
             pg_version: conf.pg_version,
             skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
+            features: conf.features,
         })
     }
 
@@ -431,7 +438,7 @@ impl Endpoint {
     }
 
     fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
-        // TODO use background_process::stop_process instead
+        // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
         let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
         let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
         let pid = nix::unistd::Pid::from_raw(pid as i32);
@@ -519,7 +526,7 @@ impl Endpoint {
             skip_pg_catalog_updates: self.skip_pg_catalog_updates,
             format_version: 1.0,
             operation_uuid: None,
-            features: vec![],
+            features: self.features.clone(),
             cluster: Cluster {
                 cluster_id: None, // project ID: not used
                 name: None,       // project name: not used
@@ -576,9 +583,21 @@ impl Endpoint {
         }
 
         let child = cmd.spawn()?;
+        // set up a scopeguard to kill & wait for the child in case we panic or bail below
+        let child = scopeguard::guard(child, |mut child| {
+            println!("SIGKILL & wait the started process");
+            (|| {
+                // TODO: use another signal that can be caught by the child so it can clean up any children it spawned
+                child.kill().context("SIGKILL child")?;
+                child.wait().context("wait() for child process")?;
+                anyhow::Ok(())
+            })()
+            .with_context(|| format!("scopeguard kill&wait child {child:?}"))
+            .unwrap();
+        });
 
         // Write down the pid so we can wait for it when we want to stop
-        // TODO use background_process::start_process instead
+        // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
         let pid = child.id();
         let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
         std::fs::write(pidfile_path, pid.to_string())?;
@@ -627,6 +646,9 @@ impl Endpoint {
             std::thread::sleep(ATTEMPT_INTERVAL);
         }
 
+        // disarm the scopeguard, let the child outlive this function (and neon_local invocation)
+        drop(scopeguard::ScopeGuard::into_inner(child));
+
         Ok(())
     }
 
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 4460fdd3a6..aefef47da7 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -223,7 +223,11 @@ impl LocalEnv {
     }
 
     pub fn attachment_service_bin(&self) -> PathBuf {
-        self.neon_distrib_dir.join("attachment_service")
+        // Irrespective of configuration, attachment service binary is always
+        // run from the same location as neon_local.  This means that for compatibility
+        // tests that run old pageserver/safekeeper, they still run latest attachment service.
+        let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
+        neon_local_bin_dir.join("attachment_service")
     }
 
     pub fn safekeeper_bin(&self) -> PathBuf {
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 18ccf6bd98..540d1185a2 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -11,7 +11,7 @@ use std::io;
 use std::io::Write;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
-use std::process::{Child, Command};
+use std::process::Command;
 use std::time::Duration;
 
 use anyhow::{bail, Context};
@@ -30,6 +30,7 @@ use utils::{
     lsn::Lsn,
 };
 
+use crate::attachment_service::{AttachmentService, NodeRegisterRequest};
 use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};
 
@@ -161,8 +162,8 @@ impl PageServerNode {
             .expect("non-Unicode path")
     }
 
-    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
-        self.start_node(config_overrides, false).await
+    pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
+        self.start_node(config_overrides, false, register).await
     }
 
     fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
@@ -207,7 +208,8 @@ impl PageServerNode {
         &self,
         config_overrides: &[&str],
         update_config: bool,
-    ) -> anyhow::Result<Child> {
+        register: bool,
+    ) -> anyhow::Result<()> {
         // TODO: using a thread here because start_process() is not async but we need to call check_status()
         let datadir = self.repo_path();
         print!(
@@ -244,7 +246,26 @@ impl PageServerNode {
                 }
             },
         )
-        .await
+        .await?;
+
+        if register {
+            let attachment_service = AttachmentService::from_env(&self.env);
+            let (pg_host, pg_port) =
+                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
+            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
+                .expect("Unable to parse listen_http_addr");
+            attachment_service
+                .node_register(NodeRegisterRequest {
+                    node_id: self.conf.id,
+                    listen_pg_addr: pg_host.to_string(),
+                    listen_pg_port: pg_port.unwrap_or(5432),
+                    listen_http_addr: http_host.to_string(),
+                    listen_http_port: http_port.unwrap_or(80),
+                })
+                .await?;
+        }
+
+        Ok(())
     }
 
     fn pageserver_basic_args<'a>(
diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs
index 4026ef0eb9..6ac71dfe51 100644
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -7,7 +7,6 @@
 //! ```
 use std::io::Write;
 use std::path::PathBuf;
-use std::process::Child;
 use std::{io, result};
 
 use anyhow::Context;
@@ -104,7 +103,7 @@ impl SafekeeperNode {
             .expect("non-Unicode path")
     }
 
-    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
+    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
         print!(
             "Starting safekeeper at '{}' in '{}'",
             self.pg_connection_config.raw_address(),
diff --git a/diesel.toml b/diesel.toml
new file mode 100644
index 0000000000..30ed4444d7
--- /dev/null
+++ b/diesel.toml
@@ -0,0 +1,9 @@
+# For documentation on how to configure this file,
+# see https://diesel.rs/guides/configuring-diesel-cli
+
+[print_schema]
+file = "control_plane/attachment_service/src/schema.rs"
+custom_type_derives = ["diesel::query_builder::QueryId"]
+
+[migrations_directory]
+dir = "control_plane/attachment_service/migrations"
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 13ac18e0c5..5361d14004 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -90,6 +90,9 @@ pub enum ComputeFeature {
     /// track short-lived connections as user activity.
     ActivityMonitorExperimental,
 
+    /// Enable running migrations
+    Migrations,
+
     /// This is a special feature flag that is used to represent unknown feature flags.
     /// Basically all unknown to enum flags are represented as this one. See unit test
     /// `parse_unknown_features()` for more details.
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 6a3679292e..852670af2c 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,9 +1,11 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
-use std::fmt;
+use std::{fmt, ops::Range};
 
-use crate::reltag::{BlockNumber, RelTag};
+use crate::reltag::{BlockNumber, RelTag, SlruKind};
 
 /// Key used in the Repository kv-store.
 ///
@@ -143,12 +145,390 @@ impl Key {
     }
 }
 
+// Layout of the Key address space
+//
+// The Key struct, used to address the underlying key-value store, consists of
+// 18 bytes, split into six fields. See the 'Key' struct above. We need to map
+// all the data and metadata keys into those 18 bytes.
+//
+// Principles for the mapping:
+//
+// - Things that are often accessed or modified together, should be close to
+//   each other in the key space. For example, if a relation is extended by one
+//   block, we create a new key-value pair for the block data, and update the
+//   relation size entry. Because of that, the RelSize key comes after all the
+//   RelBlocks of a relation: the RelSize and the last RelBlock are always next
+//   to each other.
+//
+// The key space is divided into four major sections, identified by the first
+// byte, and they form a hierarchy:
+//
+// 00 Relation data and metadata
+//
+//   DbDir    () -> (dbnode, spcnode)
+//   Filenodemap
+//   RelDir   -> relnode forknum
+//       RelBlocks
+//       RelSize
+//
+// 01 SLRUs
+//
+//   SlruDir  kind
+//   SlruSegBlocks segno
+//   SlruSegSize
+//
+// 02 pg_twophase
+//
+// 03 misc
+//    Controlfile
+//    checkpoint
+//    pg_version
+//    aux files (these share prefix 03; there is no separate 04 section,
+//    see AuxFiles in the list below)
+//
+// Below is a full list of the keyspace allocation:
+//
+// DbDir:
+// 00 00000000 00000000 00000000 00   00000000
+//
+// Filenodemap:
+// 00 SPCNODE  DBNODE   00000000 00   00000000
+//
+// RelDir:
+// 00 SPCNODE  DBNODE   00000000 00   00000001 (Postgres never uses relfilenode 0)
+//
+// RelBlock:
+// 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
+//
+// RelSize:
+// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
+//
+// SlruDir:
+// 01 kind     00000000 00000000 00   00000000
+//
+// SlruSegBlock:
+// 01 kind     00000001 SEGNO    00   BLKNUM
+//
+// SlruSegSize:
+// 01 kind     00000001 SEGNO    00   FFFFFFFF
+//
+// TwoPhaseDir:
+// 02 00000000 00000000 00000000 00   00000000
+//
+// TwoPhaseFile:
+// 02 00000000 00000000 00000000 00   XID
+//
+// ControlFile:
+// 03 00000000 00000000 00000000 00   00000000
+//
+// Checkpoint:
+// 03 00000000 00000000 00000000 00   00000001
+//
+// AuxFiles:
+// 03 00000000 00000000 00000000 00   00000002
+//
+
+//-- Section 01: relation data and metadata
+
+pub const DBDIR_KEY: Key = Key {
+    field1: 0x00, // relation-section prefix; DbDir is the key with every other field zero
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+#[inline(always)]
+pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0, // start: fields 4-6 all zero for this (spcnode, dbnode) pair
+        field5: 0,
+        field6: 0,
+    }..Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0xffffffff, // end: fields 4-6 are all-ones; `..` excludes this end key itself
+        field5: 0xff,
+        field6: 0xffffffff,
+    }
+}
+
+#[inline(always)]
+pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0, // fields 4-6 all zero: the "Filenodemap" slot in the layout comment
+        field5: 0,
+        field6: 0,
+    }
+}
+
+#[inline(always)]
+pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0, // relnode slot left at 0; the layout comment notes Postgres never uses relfilenode 0
+        field5: 0,
+        field6: 1, // 1 is the "RelDir" slot (0 is Filenodemap)
+    }
+}
+
+#[inline(always)]
+pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x00, // "RelBlock" layout: 00 SPCNODE DBNODE RELNODE FORK BLKNUM
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: blknum,
+    }
+}
+
+#[inline(always)]
+pub fn rel_size_to_key(rel: RelTag) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: 0xffffffff, // reserved block number: marks the "RelSize" entry, placed after all RelBlocks
+    }
+}
+
+#[inline(always)]
+pub fn rel_key_range(rel: RelTag) -> Range<Key> {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: 0, // start: block 0 of this relation fork
+    }..Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum + 1, // NOTE(review): would overflow if forknum were at its type's maximum; confirm that cannot happen
+        field6: 0, // exclusive end at block 0 of the next fork, so the 0xffffffff RelSize key is inside the range
+    }
+}
+
+//-- Section 02: SLRUs
+
+#[inline(always)]
+pub fn slru_dir_to_key(kind: SlruKind) -> Key {
+    Key {
+        field1: 0x01, // SLRU section prefix
+        field2: match kind { // numeric tag for the SLRU kind
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 0, // 0 here is the "SlruDir" slot; segment keys use field3 == 1
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }
+}
+
+#[inline(always)]
+pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x01, // SLRU section prefix
+        field2: match kind { // same kind tags as slru_dir_to_key
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 1, // 1 marks a per-segment key (SlruDir uses 0)
+        field4: segno,
+        field5: 0,
+        field6: blknum,
+    }
+}
+
+#[inline(always)]
+pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
+    Key {
+        field1: 0x01, // SLRU section prefix
+        field2: match kind { // same kind tags as slru_dir_to_key
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 1, // 1 marks a per-segment key (SlruDir uses 0)
+        field4: segno,
+        field5: 0,
+        field6: 0xffffffff, // reserved block number: marks the "SlruSegSize" entry
+    }
+}
+
+#[inline(always)]
+pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
+    let field2 = match kind { // same kind tags as slru_dir_to_key
+        SlruKind::Clog => 0x00,
+        SlruKind::MultiXactMembers => 0x01,
+        SlruKind::MultiXactOffsets => 0x02,
+    };
+
+    Key {
+        field1: 0x01,
+        field2,
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: 0, // start: block 0 of the segment
+    }..Key {
+        field1: 0x01,
+        field2,
+        field3: 1,
+        field4: segno,
+        field5: 1, // exclusive end: field5 bumped from 0 to 1, so every field6 value (size key included) is covered
+        field6: 0,
+    }
+}
+
+//-- Section 03: pg_twophase
+
+pub const TWOPHASEDIR_KEY: Key = Key {
+    field1: 0x02, // pg_twophase prefix; TwoPhaseDir is the key with every other field zero
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+#[inline(always)]
+pub fn twophase_file_key(xid: TransactionId) -> Key {
+    Key {
+        field1: 0x02, // pg_twophase prefix
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: xid, // the transaction id is the only varying field
+    }
+}
+
+#[inline(always)]
+pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
+    let (next_xid, overflowed) = xid.overflowing_add(1); // wrapped xid + 1, plus a flag set when the add overflowed
+
+    Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: xid, // start: same key as twophase_file_key(xid)
+    }..Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: u8::from(overflowed), // 1 only on wrap-around; presumably keeps the end ordered after the start (assumes Key compares fields in order)
+        field6: next_xid,
+    }
+}
+
+//-- Section 04: misc (control file, checkpoint, aux files; key prefix 0x03)
+pub const CONTROLFILE_KEY: Key = Key {
+    field1: 0x03, // misc prefix
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0, // slot 0 under prefix 03
+};
+
+pub const CHECKPOINT_KEY: Key = Key {
+    field1: 0x03, // misc prefix
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 1, // slot 1 under prefix 03
+};
+
+pub const AUX_FILES_KEY: Key = Key {
+    field1: 0x03, // misc prefix (aux files do not use a separate 04 prefix)
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 2, // slot 2 under prefix 03
+};
+
+// Reverse mappings for a few Keys.
+// These are needed by WAL redo manager.
+
+// AUX_FILES currently stores only data for logical replication (slots etc), and
+// we don't preserve these on a branch because safekeepers can't follow timeline
+// switch (and generally it likely should be optional), so ignore these.
+#[inline(always)]
+pub fn is_inherited_key(key: Key) -> bool {
+    key != AUX_FILES_KEY // every key except AUX_FILES_KEY counts as inherited
+}
+
+#[inline(always)]
+pub fn is_rel_fsm_block_key(key: Key) -> bool { // relation prefix, non-zero relnode, FSM fork, and not the RelSize sentinel
+    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
+}
+
+#[inline(always)]
+pub fn is_rel_vm_block_key(key: Key) -> bool {
+    key.field1 == 0x00 // relation section prefix
+        && key.field4 != 0 // non-zero relnode, i.e. not a Filenodemap/RelDir key
+        && key.field5 == VISIBILITYMAP_FORKNUM // visibility-map fork
+        && key.field6 != 0xffffffff // and not the RelSize sentinel
+}
+
+#[inline(always)]
+pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
+    Ok(match key.field1 {
+        0x01 => { // SLRU section prefix
+            let kind = match key.field2 { // inverse of the kind tags written by slru_block_to_key
+                0x00 => SlruKind::Clog,
+                0x01 => SlruKind::MultiXactMembers,
+                0x02 => SlruKind::MultiXactOffsets,
+                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
+            };
+            let segno = key.field4;
+            let blknum = key.field6; // NOTE(review): field3 and the 0xffffffff sentinel are not checked here; callers presumably gate on is_slru_block_key
+
+            (kind, segno, blknum)
+        }
+        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), // any non-SLRU prefix is an error
+    })
+}
+
+#[inline(always)]
+pub fn is_slru_block_key(key: Key) -> bool {
+    key.field1 == 0x01                // SLRU section prefix
+        && key.field3 == 0x00000001   // per-segment key; SlruDir has field3 == 0
+        && key.field6 != 0xffffffff // and not the SlruSegSize sentinel block number
+}
+
 #[inline(always)]
 pub fn is_rel_block_key(key: &Key) -> bool {
     key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
 }
 
 /// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
+#[inline(always)]
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     Ok(match key.field1 {
         0x00 => (
diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index cab7b3d860..2316acb616 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -104,6 +104,7 @@ pub struct KeySpaceAccum {
     accum: Option<Range<Key>>,
 
     ranges: Vec<Range<Key>>,
+    size: u64,
 }
 
 impl KeySpaceAccum {
@@ -111,6 +112,7 @@ impl KeySpaceAccum {
         Self {
             accum: None,
             ranges: Vec::new(),
+            size: 0,
         }
     }
 
@@ -121,6 +123,8 @@ impl KeySpaceAccum {
 
     #[inline(always)]
     pub fn add_range(&mut self, range: Range<Key>) {
+        self.size += key_range_size(&range) as u64;
+
         match self.accum.as_mut() {
             Some(accum) => {
                 if range.start == accum.end {
@@ -146,6 +150,23 @@ impl KeySpaceAccum {
             ranges: self.ranges,
         }
     }
+
+    pub fn consume_keyspace(&mut self) -> KeySpace {
+        if let Some(accum) = self.accum.take() {
+            self.ranges.push(accum);
+        }
+
+        let mut prev_accum = KeySpaceAccum::new();
+        std::mem::swap(self, &mut prev_accum);
+
+        KeySpace {
+            ranges: prev_accum.ranges,
+        }
+    }
+
+    pub fn size(&self) -> u64 {
+        self.size
+    }
 }
 
 ///
@@ -254,6 +275,30 @@ mod tests {
         }
     }
 
+    #[test]
+    fn keyspace_consume() {
+        let ranges = vec![kr(0..10), kr(20..35), kr(40..45)];
+
+        let mut accum = KeySpaceAccum::new();
+        for range in &ranges {
+            accum.add_range(range.clone());
+        }
+
+        let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
+        assert_eq!(accum.size(), expected_size);
+
+        assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
+        assert_eq!(accum.size(), 0);
+
+        assert_ks_eq(&accum.consume_keyspace(), vec![]);
+        assert_eq!(accum.size(), 0);
+
+        for range in &ranges {
+            accum.add_range(range.clone());
+        }
+        assert_ks_eq(&accum.to_keyspace(), ranges);
+    }
+
     #[test]
     fn keyspace_add_range() {
         // two separate ranges
diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs
index e3a7da2ad9..3f37af600d 100644
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -111,7 +111,19 @@ impl RelTag {
 /// These files are divided into segments, which are divided into
 /// pages of the same BLCKSZ as used for relation files.
 ///
-#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    Hash,
+    Serialize,
+    Deserialize,
+    PartialEq,
+    Eq,
+    PartialOrd,
+    Ord,
+    strum_macros::EnumIter,
+)]
 pub enum SlruKind {
     Clog,
     MultiXactMembers,
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 7895a21f66..abab32470b 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -8,6 +8,7 @@ use std::pin::Pin;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use std::time::SystemTime;
 
 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
@@ -23,6 +24,7 @@ use futures::stream::Stream;
 use futures_util::StreamExt;
 use http_types::{StatusCode, Url};
 use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::debug;
 
 use crate::s3_bucket::RequestKind;
@@ -183,7 +185,6 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
     }
 }
 
-#[async_trait::async_trait]
 impl RemoteStorage for AzureBlobStorage {
     async fn list(
         &self,
@@ -371,6 +372,20 @@ impl RemoteStorage for AzureBlobStorage {
             copy_status = status;
         }
     }
+
+    async fn time_travel_recover(
+        &self,
+        _prefix: Option<&RemotePath>,
+        _timestamp: SystemTime,
+        _done_if_after: SystemTime,
+        _cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        // TODO use Azure point in time recovery feature for this
+        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
+        Err(anyhow::anyhow!(
+            "time travel recovery for azure blob storage is not implemented"
+        ))
+    }
 }
 
 pin_project_lite::pin_project! {
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 942d0016b0..bf9c51ad1a 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -25,6 +25,7 @@ use bytes::Bytes;
 use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
+use tokio_util::sync::CancellationToken;
 use toml_edit::Item;
 use tracing::info;
 
@@ -142,7 +143,7 @@ pub struct Listing {
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
-#[async_trait::async_trait]
+#[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
     /// Lists all top level subdirectories for a given prefix
     /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
@@ -210,6 +211,15 @@ pub trait RemoteStorage: Send + Sync + 'static {
 
     /// Copy a remote object inside a bucket from one path to another.
     async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
+
+    /// Resets the content of everything with the given prefix to the given state
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()>;
 }
 
 pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -262,14 +272,15 @@ impl std::error::Error for DownloadError {}
 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
 #[derive(Clone)]
-pub enum GenericRemoteStorage {
+// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
+pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
     LocalFs(LocalFs),
     AwsS3(Arc<S3Bucket>),
     AzureBlob(Arc<AzureBlobStorage>),
-    Unreliable(Arc<UnreliableWrapper>),
+    Unreliable(Other),
 }
 
-impl GenericRemoteStorage {
+impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
     pub async fn list(
         &self,
         prefix: Option<&RemotePath>,
@@ -386,6 +397,33 @@ impl GenericRemoteStorage {
             Self::Unreliable(s) => s.copy(from, to).await,
         }
     }
+
+    pub async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        match self {
+            Self::LocalFs(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::AwsS3(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::AzureBlob(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::Unreliable(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+        }
+    }
 }
 
 impl GenericRemoteStorage {
@@ -673,6 +711,7 @@ impl ConcurrencyLimiter {
             RequestKind::List => &self.read,
             RequestKind::Delete => &self.write,
             RequestKind::Copy => &self.write,
+            RequestKind::TimeTravel => &self.write,
         }
     }
 
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index bf8b6b5dde..34a6658a69 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -4,7 +4,7 @@
 //! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.
 
-use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
+use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};
 
 use anyhow::{bail, ensure, Context};
 use bytes::Bytes;
@@ -14,7 +14,7 @@ use tokio::{
     fs,
     io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
-use tokio_util::io::ReaderStream;
+use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 
@@ -157,7 +157,6 @@ impl LocalFs {
     }
 }
 
-#[async_trait::async_trait]
 impl RemoteStorage for LocalFs {
     async fn list(
         &self,
@@ -423,6 +422,17 @@ impl RemoteStorage for LocalFs {
         })?;
         Ok(())
     }
+
+    /// Not supported for local filesystem storage: always returns an error instead of panicking.
+    async fn time_travel_recover(
+        &self,
+        _prefix: Option<&RemotePath>,
+        _timestamp: SystemTime,
+        _done_if_after: SystemTime,
+        _cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        anyhow::bail!("time travel recovery for local fs storage is not implemented")
+    }
 }
 
 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index d7b41edaaf..4909b8522b 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -6,12 +6,14 @@
 
 use std::{
     borrow::Cow,
+    collections::HashMap,
     pin::Pin,
     sync::Arc,
     task::{Context, Poll},
+    time::SystemTime,
 };
 
-use anyhow::Context as _;
+use anyhow::{anyhow, Context as _};
 use aws_config::{
     environment::credentials::EnvironmentVariableCredentialsProvider,
     imds::credentials::ImdsCredentialsProvider,
@@ -27,17 +29,19 @@ use aws_sdk_s3::{
     config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
     error::SdkError,
     operation::get_object::GetObjectError,
-    types::{Delete, ObjectIdentifier},
+    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
     Client,
 };
 use aws_smithy_async::rt::sleep::TokioSleep;
 
-use aws_smithy_types::body::SdkBody;
 use aws_smithy_types::byte_stream::ByteStream;
+use aws_smithy_types::{body::SdkBody, DateTime};
 use bytes::Bytes;
 use futures::stream::Stream;
 use hyper::Body;
 use scopeguard::ScopeGuard;
+use tokio_util::sync::CancellationToken;
+use utils::backoff;
 
 use super::StorageMetadata;
 use crate::{
@@ -270,6 +274,59 @@ impl S3Bucket {
             }
         }
     }
+
+    async fn delete_oids(
+        &self,
+        kind: RequestKind,
+        delete_objects: &[ObjectIdentifier],
+    ) -> anyhow::Result<()> {
+        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
+            let started_at = start_measuring_requests(kind);
+
+            let resp = self
+                .client
+                .delete_objects()
+                .bucket(self.bucket_name.clone())
+                .delete(
+                    Delete::builder()
+                        .set_objects(Some(chunk.to_vec()))
+                        .build()?,
+                )
+                .send()
+                .await;
+
+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &resp, started_at);
+
+            let resp = resp?;
+            metrics::BUCKET_METRICS
+                .deleted_objects_total
+                .inc_by(chunk.len() as u64);
+            if let Some(errors) = resp.errors {
+                // Log a bounded number of the errors within the response:
+                // these requests can carry 1000 keys so logging each one
+                // would be too verbose, especially as errors may lead us
+                // to retry repeatedly.
+                const LOG_UP_TO_N_ERRORS: usize = 10;
+                for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
+                    tracing::warn!(
+                        "DeleteObjects key {} failed: {}: {}",
+                        e.key.as_ref().map(Cow::from).unwrap_or("".into()),
+                        e.code.as_ref().map(Cow::from).unwrap_or("".into()),
+                        e.message.as_ref().map(Cow::from).unwrap_or("".into())
+                    );
+                }
+
+                return Err(anyhow::format_err!(
+                    "Failed to delete {} objects",
+                    errors.len()
+                ));
+            }
+        }
+        Ok(())
+    }
 }
 
 pin_project_lite::pin_project! {
@@ -373,7 +430,6 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
     }
 }
 
-#[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
     async fn list(
         &self,
@@ -569,64 +625,168 @@ impl RemoteStorage for S3Bucket {
             delete_objects.push(obj_id);
         }
 
-        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
-            let started_at = start_measuring_requests(kind);
-
-            let resp = self
-                .client
-                .delete_objects()
-                .bucket(self.bucket_name.clone())
-                .delete(
-                    Delete::builder()
-                        .set_objects(Some(chunk.to_vec()))
-                        .build()?,
-                )
-                .send()
-                .await;
-
-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &resp, started_at);
-
-            match resp {
-                Ok(resp) => {
-                    metrics::BUCKET_METRICS
-                        .deleted_objects_total
-                        .inc_by(chunk.len() as u64);
-                    if let Some(errors) = resp.errors {
-                        // Log a bounded number of the errors within the response:
-                        // these requests can carry 1000 keys so logging each one
-                        // would be too verbose, especially as errors may lead us
-                        // to retry repeatedly.
-                        const LOG_UP_TO_N_ERRORS: usize = 10;
-                        for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
-                            tracing::warn!(
-                                "DeleteObjects key {} failed: {}: {}",
-                                e.key.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.code.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.message.as_ref().map(Cow::from).unwrap_or("".into())
-                            );
-                        }
-
-                        return Err(anyhow::format_err!(
-                            "Failed to delete {} objects",
-                            errors.len()
-                        ));
-                    }
-                }
-                Err(e) => {
-                    return Err(e.into());
-                }
-            }
-        }
-        Ok(())
+        self.delete_oids(kind, &delete_objects).await
     }
 
     async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
         let paths = std::array::from_ref(path);
         self.delete_objects(paths).await
     }
+
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        let kind = RequestKind::TimeTravel;
+        let _guard = self.permit(kind).await;
+
+        let timestamp = DateTime::from(timestamp);
+        let done_if_after = DateTime::from(done_if_after);
+
+        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
+
+        // get the passed prefix or if it is not set use prefix_in_bucket value
+        let prefix = prefix
+            .map(|p| self.relative_path_to_s3_object(p))
+            .or_else(|| self.prefix_in_bucket.clone());
+
+        let warn_threshold = 3;
+        let max_retries = 10;
+        let is_permanent = |_e: &_| false;
+
+        let list = backoff::retry(
+            || async {
+                Ok(self
+                    .client
+                    .list_object_versions()
+                    .bucket(self.bucket_name.clone())
+                    .set_prefix(prefix.clone())
+                    .send()
+                    .await?)
+            },
+            is_permanent,
+            warn_threshold,
+            max_retries,
+            "listing object versions for time_travel_recover",
+            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+        )
+        .await?;
+
+        if list.is_truncated().unwrap_or_default() {
+            anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}");
+        }
+
+        let mut versions_deletes = list
+            .versions()
+            .iter()
+            .map(VerOrDelete::Version)
+            .chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker))
+            .collect::<Vec<_>>();
+
+        versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
+
+        let mut vds_for_key = HashMap::<_, Vec<_>>::new();
+
+        for vd in versions_deletes {
+            let last_modified = vd.last_modified();
+            let version_id = vd.version_id();
+            let key = vd.key();
+            let (Some(last_modified), Some(version_id), Some(key)) =
+                (last_modified, version_id, key)
+            else {
+                anyhow::bail!(
+                    "One (or more) of last_modified, key, and id is None. \
+                    Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
+                    last_modified, key, version_id,
+                );
+            };
+            if version_id == "null" {
+                anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
+                    indicating either disabled versioning, or legacy objects with null version id values");
+            }
+            tracing::trace!(
+                "Parsing version key={key} version_id={version_id} is_delete={}",
+                matches!(vd, VerOrDelete::DeleteMarker(_))
+            );
+
+            vds_for_key
+                .entry(key)
+                .or_default()
+                .push((vd, last_modified, version_id));
+        }
+        for (key, versions) in vds_for_key {
+            let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
+            if last_last_modified > &&done_if_after {
+                tracing::trace!("Key {key} has version later than done_if_after, skipping");
+                continue;
+            }
+            // Position of `timestamp` among the sorted versions; the entry before it is restored.
+            let version_to_restore_to =
+                match versions.binary_search_by_key(&timestamp, |tpl| *tpl.1) {
+                    Ok(v) => v,
+                    Err(e) => e,
+                };
+            if version_to_restore_to == versions.len() {
+                tracing::trace!("Key {key} has no changes since timestamp, skipping");
+                continue;
+            }
+            let mut do_delete = false;
+            if version_to_restore_to == 0 {
+                // All versions more recent, so the key didn't exist at the specified time point.
+                tracing::trace!(
+                    "All {} versions more recent for {key}, deleting",
+                    versions.len()
+                );
+                do_delete = true;
+            } else {
+                match &versions[version_to_restore_to - 1] {
+                    (VerOrDelete::Version(_), _last_modified, version_id) => {
+                        tracing::trace!("Copying old version {version_id} for {key}...");
+                        // Restore the state to the last version by copying
+                        let source_id =
+                            format!("{}/{key}?versionId={version_id}", self.bucket_name);
+
+                        backoff::retry(
+                            || async {
+                                Ok(self
+                                    .client
+                                    .copy_object()
+                                    .bucket(self.bucket_name.clone())
+                                    .key(key)
+                                    .copy_source(&source_id)
+                                    .send()
+                                    .await?)
+                            },
+                            is_permanent,
+                            warn_threshold,
+                            max_retries,
+                            "copying object version for time_travel_recover",
+                            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+                        )
+                        .await?;
+                    }
+                    (VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
+                        do_delete = true;
+                    }
+                }
+            };
+            if do_delete {
+                if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
+                    // Key has since been deleted (but there was some history), no need to do anything
+                    tracing::trace!("Key {key} already deleted, skipping.");
+                } else {
+                    tracing::trace!("Deleting {key}...");
+
+                    let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?;
+                    self.delete_oids(kind, &[oid]).await?;
+                }
+            }
+        }
+        Ok(())
+    }
 }
 
 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
@@ -651,6 +811,32 @@ fn start_measuring_requests(
     })
 }
 
+enum VerOrDelete<'a> {
+    Version(&'a ObjectVersion),
+    DeleteMarker(&'a DeleteMarkerEntry),
+}
+
+impl<'a> VerOrDelete<'a> {
+    fn last_modified(&self) -> Option<&'a DateTime> {
+        match self {
+            VerOrDelete::Version(v) => v.last_modified(),
+            VerOrDelete::DeleteMarker(v) => v.last_modified(),
+        }
+    }
+    fn version_id(&self) -> Option<&'a str> {
+        match self {
+            VerOrDelete::Version(v) => v.version_id(),
+            VerOrDelete::DeleteMarker(v) => v.version_id(),
+        }
+    }
+    fn key(&self) -> Option<&'a str> {
+        match self {
+            VerOrDelete::Version(v) => v.key(),
+            VerOrDelete::DeleteMarker(v) => v.key(),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use camino::Utf8Path;
diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/s3_bucket/metrics.rs
index 21dde14906..beca755920 100644
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -12,6 +12,7 @@ pub(crate) enum RequestKind {
     Delete = 2,
     List = 3,
     Copy = 4,
+    TimeTravel = 5,
 }
 
 use RequestKind::*;
@@ -24,6 +25,7 @@ impl RequestKind {
             Delete => "delete_object",
             List => "list_objects",
             Copy => "copy_object",
+            TimeTravel => "time_travel_recover",
         }
     }
     const fn as_index(&self) -> usize {
@@ -31,7 +33,7 @@ impl RequestKind {
     }
 }
 
-pub(super) struct RequestTyped<C>([C; 5]);
+pub(super) struct RequestTyped<C>([C; 6]);
 
 impl<C> RequestTyped<C> {
     pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -40,8 +42,8 @@ impl<C> RequestTyped<C> {
 
     fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
         use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy].into_iter();
-        let arr = std::array::from_fn::<C, 5, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
+        let arr = std::array::from_fn::<C, 6, _>(|index| {
             let next = it.next().unwrap();
             assert_eq!(index, next.as_index());
             f(next)
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 7f5adcea30..fc4c4b315b 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -3,16 +3,19 @@
 //! testing purposes.
 use bytes::Bytes;
 use futures::stream::Stream;
-use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
+use std::time::SystemTime;
+use std::{collections::hash_map::Entry, sync::Arc};
+use tokio_util::sync::CancellationToken;
 
 use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
+    Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
+    StorageMetadata,
 };
 
 pub struct UnreliableWrapper {
-    inner: crate::GenericRemoteStorage,
+    inner: GenericRemoteStorage<Arc<VoidStorage>>,
 
     // This many attempts of each operation will fail, then we let it succeed.
     attempts_to_fail: u64,
@@ -29,11 +32,21 @@ enum RemoteOp {
     Download(RemotePath),
     Delete(RemotePath),
     DeleteObjects(Vec<RemotePath>),
+    TimeTravelRecover(Option<RemotePath>),
 }
 
 impl UnreliableWrapper {
     pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
         assert!(attempts_to_fail > 0);
+        let inner = match inner {
+            GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
+            GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s),
+            GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s),
+            // We could also make this a no-op, as in, extract the inner of the passed generic remote storage
+            GenericRemoteStorage::Unreliable(_s) => {
+                panic!("Can't wrap unreliable wrapper unreliably")
+            }
+        };
         UnreliableWrapper {
             inner,
             attempts_to_fail,
@@ -84,7 +97,9 @@ impl UnreliableWrapper {
     }
 }
 
-#[async_trait::async_trait]
+// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage.
+type VoidStorage = crate::LocalFs;
+
 impl RemoteStorage for UnreliableWrapper {
     async fn list_prefixes(
         &self,
@@ -169,4 +184,17 @@ impl RemoteStorage for UnreliableWrapper {
         self.attempt(RemoteOp::Upload(to.clone()))?;
         self.inner.copy_object(from, to).await
     }
+
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?;
+        self.inner
+            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
+            .await
+    }
 }
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 4a999d115e..679be66bf7 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,15 +1,21 @@
-use std::collections::HashSet;
 use std::env;
+use std::fmt::{Debug, Display};
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::sync::Arc;
-use std::time::UNIX_EPOCH;
+use std::time::{Duration, UNIX_EPOCH};
+use std::{collections::HashSet, time::SystemTime};
 
+use crate::common::{download_to_vec, upload_stream};
 use anyhow::Context;
+use camino::Utf8Path;
+use futures_util::Future;
 use remote_storage::{
     GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
+use test_context::test_context;
 use test_context::AsyncTestContext;
+use tokio_util::sync::CancellationToken;
 use tracing::info;
 
 mod common;
@@ -18,11 +24,160 @@ mod common;
 mod tests_s3;
 
 use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};
+use utils::backoff;
 
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
 
 const BASE_PREFIX: &str = "test";
 
+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledStorage::Enabled(ctx) => ctx,
+        MaybeEnabledStorage::Disabled => return Ok(()),
+    };
+    // Our test is sensitive to clock discrepancies between S3 and the environment the tests
+    // run in. Therefore, wait a little bit before and after. The alternative would be
+    // to take the time from S3 response headers.
+    const WAIT_TIME: Duration = Duration::from_millis(3_000);
+
+    async fn retry<T, O, F, E>(op: O) -> Result<T, E>
+    where
+        E: Display + Debug + 'static,
+        O: FnMut() -> F,
+        F: Future<Output = Result<T, E>>,
+    {
+        let warn_threshold = 3;
+        let max_retries = 10;
+        backoff::retry(
+            op,
+            |_e| false,
+            warn_threshold,
+            max_retries,
+            "test retry",
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+        )
+        .await
+    }
+
+    async fn time_point() -> SystemTime {
+        tokio::time::sleep(WAIT_TIME).await;
+        let ret = SystemTime::now();
+        tokio::time::sleep(WAIT_TIME).await;
+        ret
+    }
+
+    async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
+        Ok(retry(|| client.list_files(None))
+            .await
+            .context("list root files failure")?
+            .into_iter()
+            .collect::<HashSet<_>>())
+    }
+
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    retry(|| {
+        let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+        ctx.client.upload(data, len, &path1, None)
+    })
+    .await?;
+
+    let t0_files = list_files(&ctx.client).await?;
+    let t0 = time_point().await;
+    println!("at t0: {t0_files:?}");
+
+    let old_data = "remote blob data2";
+
+    retry(|| {
+        let (data, len) = upload_stream(old_data.as_bytes().into());
+        ctx.client.upload(data, len, &path2, None)
+    })
+    .await?;
+
+    let t1_files = list_files(&ctx.client).await?;
+    let t1 = time_point().await;
+    println!("at t1: {t1_files:?}");
+
+    // A little check to ensure that our clock is not too far off from the S3 clock
+    {
+        let dl = retry(|| ctx.client.download(&path2)).await?;
+        let last_modified = dl.last_modified.unwrap();
+        let half_wt = WAIT_TIME.mul_f32(0.5);
+        let t0_hwt = t0 + half_wt;
+        let t1_hwt = t1 - half_wt;
+        if !(t0_hwt..=t1_hwt).contains(&last_modified) {
+            panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \
+                This likely means a large clock discrepancy between S3 and the local clock.");
+        }
+    }
+
+    retry(|| {
+        let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+        ctx.client.upload(data, len, &path3, None)
+    })
+    .await?;
+
+    let new_data = "new remote blob data2";
+
+    retry(|| {
+        let (data, len) = upload_stream(new_data.as_bytes().into());
+        ctx.client.upload(data, len, &path2, None)
+    })
+    .await?;
+
+    retry(|| ctx.client.delete(&path1)).await?;
+    let t2_files = list_files(&ctx.client).await?;
+    let t2 = time_point().await;
+    println!("at t2: {t2_files:?}");
+
+    // No changes after recovery to t2 (no-op)
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t2, t_final, CancellationToken::new())
+        .await?;
+    let t2_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t2: {t2_files_recovered:?}");
+    assert_eq!(t2_files, t2_files_recovered);
+    let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    assert_eq!(path2_recovered_t2, new_data.as_bytes());
+
+    // after recovery to t1: path1 is back, path2 has the old content
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t1, t_final, CancellationToken::new())
+        .await?;
+    let t1_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t1: {t1_files_recovered:?}");
+    assert_eq!(t1_files, t1_files_recovered);
+    let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    assert_eq!(path2_recovered_t1, old_data.as_bytes());
+
+    // after recovery to t0: everything is gone except for path1
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t0, t_final, CancellationToken::new())
+        .await?;
+    let t0_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t0: {t0_files_recovered:?}");
+    assert_eq!(t0_files, t0_files_recovered);
+
+    // cleanup
+
+    let paths = &[path1, path2, path3];
+    retry(|| ctx.client.delete_objects(paths)).await?;
+
+    Ok(())
+}
+
 struct EnabledS3 {
     client: Arc<GenericRemoteStorage>,
     base_prefix: &'static str,
diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index 3e9281ac81..d55823b0b7 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -131,7 +131,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
         ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
         ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
         ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
-        _ => error!("Error processing HTTP request: {api_error:#}"),
+        ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
+        ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
+        _ => info!("Error processing HTTP request: {api_error:#}"),
     }
 
     api_error.into_response()
diff --git a/libs/utils/src/nonblock.rs b/libs/utils/src/nonblock.rs
index 8b1fd71ae6..05e2e3af4c 100644
--- a/libs/utils/src/nonblock.rs
+++ b/libs/utils/src/nonblock.rs
@@ -5,10 +5,10 @@ use std::os::unix::io::RawFd;
 pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
     let bits = fcntl(fd, F_GETFL)?;
 
-    // Safety: If F_GETFL returns some unknown bits, they should be valid
+    // If F_GETFL returns some unknown bits, they should be valid
     // for passing back to F_SETFL, too. If we left them out, the F_SETFL
     // would effectively clear them, which is not what we want.
-    let mut flags = unsafe { OFlag::from_bits_unchecked(bits) };
+    let mut flags = OFlag::from_bits_retain(bits);
     flags |= OFlag::O_NONBLOCK;
 
     fcntl(fd, F_SETFL(flags))?;
diff --git a/libs/utils/src/tcp_listener.rs b/libs/utils/src/tcp_listener.rs
index 7666ad138c..6b35d3d63a 100644
--- a/libs/utils/src/tcp_listener.rs
+++ b/libs/utils/src/tcp_listener.rs
@@ -1,7 +1,6 @@
 use std::{
     io,
     net::{TcpListener, ToSocketAddrs},
-    os::unix::prelude::AsRawFd,
 };
 
 use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
@@ -10,7 +9,7 @@ use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
 pub fn bind<A: ToSocketAddrs>(addr: A) -> io::Result<TcpListener> {
     let listener = TcpListener::bind(addr)?;
 
-    setsockopt(listener.as_raw_fd(), ReuseAddr, &true)?;
+    setsockopt(&listener, ReuseAddr, &true)?;
 
     Ok(listener)
 }
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 980fbab22e..e44501d1ed 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -61,6 +61,7 @@ sync_wrapper.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-epoll-uring.workspace = true
 tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index 15d4eb09e0..eb5c3f15cf 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -18,7 +18,7 @@ use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
 use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
 use pageserver::tenant::storage_layer::range_overlaps;
-use pageserver::virtual_file::VirtualFile;
+use pageserver::virtual_file::{self, VirtualFile};
 
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
@@ -142,7 +142,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
 
     // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(10);
+    pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
     pageserver::page_cache::init(100);
 
     let mut total_delta_layers = 0usize;
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
index ebf4a4bec3..dbbcfedac0 100644
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {
 
 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
     let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10);
+    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
     page_cache::init(100);
     let file = FileBlockReader::new(VirtualFile::open(path).await?);
     let summary_blk = file.read_blk(0, ctx).await?;
@@ -187,7 +187,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
             new_tenant_id,
             new_timeline_id,
         } => {
-            pageserver::virtual_file::init(10);
+            pageserver::virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
             pageserver::page_cache::init(100);
 
             let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index fb42d6d2f1..3c90933fe9 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -123,7 +123,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
 
 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
     // Basic initialization of things that don't change after startup
-    virtual_file::init(10);
+    virtual_file::init(10, virtual_file::IoEngineKind::StdFs);
     page_cache::init(100);
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
     dump_layerfile_from_path(path, true, &ctx).await
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index 98f1852acd..400b5476b7 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -423,8 +423,8 @@ async fn client(
     tokio::select! {
         res = do_requests => { res },
         _ = cancel.cancelled() => {
-            client.shutdown().await;
-            return;
+            // fallthrough to shutdown
         }
     }
+    client.shutdown().await;
 }
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 7e5ae892ad..009deff0aa 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -11,8 +11,9 @@
 //! from data stored in object storage.
 //!
 use anyhow::{anyhow, bail, ensure, Context};
-use bytes::{BufMut, BytesMut};
+use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
+use pageserver_api::key::{key_to_slru_block, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -133,6 +134,87 @@ where
     ctx: &'a RequestContext,
 }
 
+/// A sink that accepts SLRU blocks ordered by key and forwards
+/// full segments to the archive.
+///
+/// Blocks are buffered per `(kind, segno)` segment; a block of a different
+/// segment flushes the buffered one. `finish` writes out the last segment.
+struct SlruSegmentsBuilder<'a, 'b, W>
+where
+    W: AsyncWrite + Send + Sync + Unpin,
+{
+    ar: &'a mut Builder<&'b mut W>,
+    buf: Vec<u8>,
+    current_segment: Option<(SlruKind, u32)>,
+}
+
+impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
+where
+    W: AsyncWrite + Send + Sync + Unpin,
+{
+    fn new(ar: &'a mut Builder<&'b mut W>) -> Self {
+        Self {
+            ar,
+            buf: Vec::new(),
+            current_segment: None,
+        }
+    }
+
+    /// Buffers the first `BLCKSZ` bytes of `block` for the segment `key` maps to,
+    /// flushing the previously buffered segment first if it is a different one.
+    /// Fails if `key_to_slru_block` rejects `key` or the block length is not
+    /// `BLCKSZ` (for `Clog`, `BLCKSZ + 8` is accepted as well).
+    async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
+        let (kind, segno, _) = key_to_slru_block(*key)?;
+
+        match kind {
+            SlruKind::Clog => {
+                ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
+            }
+            SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
+                ensure!(block.len() == BLCKSZ as usize);
+            }
+        }
+
+        // A block of a different segment means the buffered segment is complete.
+        let segment = (kind, segno);
+        if matches!(self.current_segment, Some(current) if current != segment) {
+            self.flush().await?;
+        }
+
+        self.current_segment = Some(segment);
+        self.buf.extend_from_slice(&block[..BLCKSZ as usize]);
+
+        Ok(())
+    }
+
+    /// Appends the buffered segment to the archive as one tar entry named
+    /// `<kind.to_str()>/<segno as zero-padded hex>` and clears the buffer.
+    /// Panics if no segment is buffered; both callers check that first.
+    async fn flush(&mut self) -> anyhow::Result<()> {
+        let nblocks = self.buf.len() / BLCKSZ as usize;
+        let (kind, segno) = self.current_segment.take().unwrap();
+        let segname = format!("{}/{:>04X}", kind.to_str(), segno);
+        let header = new_tar_header(&segname, self.buf.len() as u64)?;
+        self.ar.append(&header, self.buf.as_slice()).await?;
+
+        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
+
+        self.buf.clear();
+
+        Ok(())
+    }
+
+    /// Flushes the last buffered segment, if there is one. Consumes the builder.
+    async fn finish(mut self) -> anyhow::Result<()> {
+        if self.current_segment.is_none() || self.buf.is_empty() {
+            return Ok(());
+        }
+
+        self.flush().await
+    }
+}
+
 impl<'a, W> Basebackup<'a, W>
 where
     W: AsyncWrite + Send + Sync + Unpin,
@@ -168,20 +250,27 @@ where
         }
 
         // Gather non-relational files from object storage pages.
-        for kind in [
-            SlruKind::Clog,
-            SlruKind::MultiXactOffsets,
-            SlruKind::MultiXactMembers,
-        ] {
-            for segno in self
+        let slru_partitions = self
+            .timeline
+            .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
+            .await?
+            .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
+
+        let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
+
+        for part in slru_partitions.parts {
+            let blocks = self
                 .timeline
-                .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
-                .await?
-            {
-                self.add_slru_segment(kind, segno).await?;
+                .get_vectored(&part.ranges, self.lsn, self.ctx)
+                .await?;
+
+            for (key, block) in blocks {
+                slru_builder.add_block(&key, block?).await?;
             }
         }
 
+        slru_builder.finish().await?;
+
         let mut min_restart_lsn: Lsn = Lsn::MAX;
         // Create tablespace directories
         for ((spcnode, dbnode), has_relmap_file) in
@@ -305,39 +394,6 @@ where
         Ok(())
     }
 
-    //
-    // Generate SLRU segment files from repository.
-    //
-    async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
-        let nblocks = self
-            .timeline
-            .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
-            .await?;
-
-        let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
-        for blknum in 0..nblocks {
-            let img = self
-                .timeline
-                .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx)
-                .await?;
-
-            if slru == SlruKind::Clog {
-                ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
-            } else {
-                ensure!(img.len() == BLCKSZ as usize);
-            }
-
-            slru_buf.extend_from_slice(&img[..BLCKSZ as usize]);
-        }
-
-        let segname = format!("{}/{:>04X}", slru.to_str(), segno);
-        let header = new_tar_header(&segname, slru_buf.len() as u64)?;
-        self.ar.append(&header, slru_buf.as_slice()).await?;
-
-        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
-        Ok(())
-    }
-
     //
     // Include database/tablespace directories.
     //
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 15e3359c06..84de76e55e 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -130,7 +130,7 @@ fn main() -> anyhow::Result<()> {
     let scenario = failpoint_support::init();
 
     // Basic initialization of things that don't change after startup
-    virtual_file::init(conf.max_file_descriptors);
+    virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
     page_cache::init(conf.page_cache_size);
 
     start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 52277d7f24..1989bef817 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -36,6 +36,7 @@ use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{
     TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
+use crate::virtual_file;
 use crate::{
     IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
     TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
@@ -43,6 +44,8 @@ use crate::{
 
 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
 
+use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE;
+
 pub mod defaults {
     use crate::tenant::config::defaults::*;
     use const_format::formatcp;
@@ -79,6 +82,8 @@ pub mod defaults {
 
     pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 
+    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
+
     ///
     /// Default built-in configuration file.
     ///
@@ -114,6 +119,8 @@ pub mod defaults {
 
 #ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
 
+#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -247,6 +254,8 @@ pub struct PageServerConf {
 
     /// Maximum number of WAL records to be ingested and committed at the same time
     pub ingest_batch_size: u64,
+
+    pub virtual_file_io_engine: virtual_file::IoEngineKind,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -331,6 +340,8 @@ struct PageServerConfigBuilder {
     secondary_download_concurrency: BuilderValue<usize>,
 
     ingest_batch_size: BuilderValue<u64>,
+
+    virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -406,6 +417,8 @@ impl Default for PageServerConfigBuilder {
             secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
 
             ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
+
+            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
         }
     }
 }
@@ -562,6 +575,10 @@ impl PageServerConfigBuilder {
         self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
     }
 
+    pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) {
+        self.virtual_file_io_engine = BuilderValue::Set(value);
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let concurrent_tenant_warmup = self
             .concurrent_tenant_warmup
@@ -669,6 +686,9 @@ impl PageServerConfigBuilder {
             ingest_batch_size: self
                 .ingest_batch_size
                 .ok_or(anyhow!("missing ingest_batch_size"))?,
+            virtual_file_io_engine: self
+                .virtual_file_io_engine
+                .ok_or(anyhow!("missing virtual_file_io_engine"))?,
         })
     }
 }
@@ -920,6 +940,9 @@ impl PageServerConf {
                     builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
                 },
                 "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
+                "virtual_file_io_engine" => {
+                    builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
+                }
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -993,6 +1016,7 @@ impl PageServerConf {
             heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
             secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
             ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
+            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
         }
     }
 }
@@ -1225,6 +1249,7 @@ background_task_maximum_delay = '334 s'
                 heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
                 secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                 ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
+                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1288,6 +1313,7 @@ background_task_maximum_delay = '334 s'
                 heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
                 secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                 ingest_batch_size: 100,
+                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 1fbca1086f..a49eef8bb9 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -877,6 +877,56 @@ paths:
               schema:
                 $ref: "#/components/schemas/ServiceUnavailableError"
 
+  /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      description: |
+        Marks the initdb archive for preservation upon deletion of the timeline or tenant.
+        This is meant to be part of the disaster recovery process.
+      responses:
+        "200":
+          description: Initdb archive marked for preservation
+        "404":
+          description: No tenant or timeline found for the specified ids
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
 
   /v1/tenant/{tenant_id}/synthetic_size:
     parameters:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 811232397c..aa56806246 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -187,6 +187,7 @@ impl From<TenantSlotUpsertError> for ApiError {
         match e {
             InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
             MapState(e) => e.into(),
+            ShuttingDown(_) => ApiError::ShuttingDown,
         }
     }
 }
@@ -495,6 +496,10 @@ async fn timeline_create_handler(
                     .map_err(ApiError::InternalServerError)?;
                 json_response(StatusCode::CREATED, timeline_info)
             }
+            Err(_) if tenant.cancel.is_cancelled() => {
+                // In case we get some ugly error type during shutdown, cast it into a clean 503.
+                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
+            }
             Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
                 json_response(StatusCode::CONFLICT, ())
             }
@@ -561,6 +566,43 @@ async fn timeline_list_handler(
     json_response(StatusCode::OK, response_data)
 }
 
+/// Handler for `POST /v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive`.
+///
+/// Part of the process for disaster recovery from safekeeper-stored WAL: if we don't recover
+/// into a new timeline but want to keep the timeline ID, then the initdb archive is deleted.
+/// This endpoint copies it to a different location where timeline recreation can find it.
+async fn timeline_preserve_initdb_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    async {
+        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+
+        let timeline = tenant
+            .get_timeline(timeline_id, false)
+            .map_err(|e| ApiError::NotFound(e.into()))?;
+
+        timeline
+            .preserve_initdb_archive()
+            .await
+            .context("preserving initdb archive")
+            .map_err(ApiError::InternalServerError)?;
+
+        Ok::<_, ApiError>(())
+    }
+    .instrument(info_span!("timeline_preserve_initdb_archive",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug(),
+                %timeline_id))
+    .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_detail_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -1220,19 +1262,9 @@ async fn tenant_create_handler(
     };
     // We created the tenant. Existing API semantics are that the tenant
     // is Active when this function returns.
-    if let res @ Err(_) = new_tenant
+    new_tenant
         .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
-        .await
-    {
-        // This shouldn't happen because we just created the tenant directory
-        // in upsert_location, and there aren't any remote timelines
-        // to load, so, nothing can really fail during load.
-        // Don't do cleanup because we don't know how we got here.
-        // The tenant will likely be in `Broken` state and subsequent
-        // calls will fail.
-        res.context("created tenant failed to become active")
-            .map_err(ApiError::InternalServerError)?;
-    }
+        .await?;
 
     json_response(
         StatusCode::CREATED,
@@ -1943,6 +1975,10 @@ pub fn make_router(
         .post("/v1/tenant/:tenant_id/ignore", |r| {
             api_handler(r, tenant_ignore_handler)
         })
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
+            |r| api_handler(r, timeline_preserve_initdb_handler),
+        )
         .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
             api_handler(r, timeline_detail_handler)
         })
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 26070e0cc1..bcde1166b7 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,3 +1,4 @@
+#![recursion_limit = "300"]
 #![deny(clippy::undocumented_unsafe_blocks)]
 
 mod auth;
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 993685db6e..9b3679e3c2 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -150,6 +150,43 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub(crate) struct GetVectoredLatency {
+    map: EnumMap<TaskKind, Option<Histogram>>,
+}
+
+impl GetVectoredLatency {
+    // Only these task types perform vectored gets. Filter all other tasks out to reduce total
+    // cardinality of the metric.
+    const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler];
+
+    pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
+        self.map[task_kind].as_ref()
+    }
+}
+
+pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
+    let inner = register_histogram_vec!(
+        "pageserver_get_vectored_seconds",
+        "Time spent in get_vectored",
+        &["task_kind"],
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric");
+
+    GetVectoredLatency {
+        map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
+            let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
+
+            if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
+                let task_kind = task_kind.into();
+                Some(inner.with_label_values(&[task_kind]))
+            } else {
+                None
+            }
+        })),
+    }
+});
+
 pub(crate) struct PageCacheMetricsForTaskKind {
     pub read_accesses_materialized_page: IntCounter,
     pub read_accesses_immutable: IntCounter,
@@ -932,6 +969,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+#[cfg(not(test))]
 pub(crate) mod virtual_file_descriptor_cache {
     use super::*;
 
@@ -951,6 +989,20 @@ pub(crate) mod virtual_file_descriptor_cache {
     // ```
 }
 
+#[cfg(not(test))]
+pub(crate) mod virtual_file_io_engine {
+    use super::*;
+
+    pub(crate) static KIND: Lazy<UIntGaugeVec> = Lazy::new(|| {
+        register_uint_gauge_vec!(
+            "pageserver_virtual_file_io_engine_kind",
+            "The configured io engine for VirtualFile",
+            &["kind"],
+        )
+        .unwrap()
+    });
+}
+
 #[derive(Debug)]
 struct GlobalAndPerTimelineHistogram {
     global: Histogram,
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 77ce9981f0..a8a3487b4e 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -61,7 +61,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::{rel_block_to_key, Version};
+use crate::pgdatadir_mapping::Version;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -75,6 +75,7 @@ use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;
 
+use pageserver_api::key::rel_block_to_key;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
 
@@ -321,8 +322,8 @@ enum PageStreamError {
     Shutdown,
 
     /// Something went wrong reading a page: this likely indicates a pageserver bug
-    #[error("Read error: {0}")]
-    Read(PageReconstructError),
+    #[error("Read error")]
+    Read(#[source] PageReconstructError),
 
     /// Ran out of time waiting for an LSN
     #[error("LSN timeout: {0}")]
@@ -331,11 +332,11 @@ enum PageStreamError {
     /// The entity required to serve the request (tenant or timeline) is not found,
     /// or is not found in a suitable state to serve a request.
     #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
+    NotFound(Cow<'static, str>),
 
     /// Request asked for something that doesn't make sense, like an invalid LSN
     #[error("Bad request: {0}")]
-    BadRequest(std::borrow::Cow<'static, str>),
+    BadRequest(Cow<'static, str>),
 }
 
 impl From<PageReconstructError> for PageStreamError {
@@ -666,7 +667,10 @@ impl PageServerHandler {
                         // print the all details to the log with {:#}, but for the client the
                         // error message is enough.  Do not log if shutting down, as the anyhow::Error
                         // here includes cancellation which is not an error.
-                        span.in_scope(|| error!("error reading relation or page version: {:#}", e));
+                        let full = utils::error::report_compact_sources(&e);
+                        span.in_scope(|| {
+                            error!("error reading relation or page version: {full:#}")
+                        });
                         PagestreamBeMessage::Error(PagestreamErrorResponse {
                             message: e.to_string(),
                         })
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index d9cc85319e..b65fe1eddd 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -13,7 +13,12 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes};
-use pageserver_api::key::is_rel_block_key;
+use pageserver_api::key::{
+    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
+    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
+    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
+    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+};
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -22,6 +27,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
+use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
@@ -528,6 +534,33 @@ impl Timeline {
         Ok(Default::default())
     }
 
+    pub(crate) async fn get_slru_keyspace(
+        &self,
+        version: Version<'_>,
+        ctx: &RequestContext,
+    ) -> Result<KeySpace, PageReconstructError> {
+        let mut accum = KeySpaceAccum::new();
+
+        for kind in SlruKind::iter() {
+            let mut segments: Vec<u32> = self
+                .list_slru_segments(kind, version, ctx)
+                .await?
+                .into_iter()
+                .collect();
+            segments.sort_unstable();
+
+            for seg in segments {
+                let block_count = self.get_slru_segment_size(kind, seg, version, ctx).await?;
+
+                accum.add_range(
+                    slru_block_to_key(kind, seg, 0)..slru_block_to_key(kind, seg, block_count),
+                );
+            }
+        }
+
+        Ok(accum.to_keyspace())
+    }
+
     /// Get a list of SLRU segments
     pub(crate) async fn list_slru_segments(
         &self,
@@ -1535,366 +1568,6 @@ struct SlruSegmentDirectory {
 
 static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 
-// Layout of the Key address space
-//
-// The Key struct, used to address the underlying key-value store, consists of
-// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
-// all the data and metadata keys into those 18 bytes.
-//
-// Principles for the mapping:
-//
-// - Things that are often accessed or modified together, should be close to
-//   each other in the key space. For example, if a relation is extended by one
-//   block, we create a new key-value pair for the block data, and update the
-//   relation size entry. Because of that, the RelSize key comes after all the
-//   RelBlocks of a relation: the RelSize and the last RelBlock are always next
-//   to each other.
-//
-// The key space is divided into four major sections, identified by the first
-// byte, and the form a hierarchy:
-//
-// 00 Relation data and metadata
-//
-//   DbDir    () -> (dbnode, spcnode)
-//   Filenodemap
-//   RelDir   -> relnode forknum
-//       RelBlocks
-//       RelSize
-//
-// 01 SLRUs
-//
-//   SlruDir  kind
-//   SlruSegBlocks segno
-//   SlruSegSize
-//
-// 02 pg_twophase
-//
-// 03 misc
-//    Controlfile
-//    checkpoint
-//    pg_version
-//
-// 04 aux files
-//
-// Below is a full list of the keyspace allocation:
-//
-// DbDir:
-// 00 00000000 00000000 00000000 00   00000000
-//
-// Filenodemap:
-// 00 SPCNODE  DBNODE   00000000 00   00000000
-//
-// RelDir:
-// 00 SPCNODE  DBNODE   00000000 00   00000001 (Postgres never uses relfilenode 0)
-//
-// RelBlock:
-// 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
-//
-// RelSize:
-// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
-//
-// SlruDir:
-// 01 kind     00000000 00000000 00   00000000
-//
-// SlruSegBlock:
-// 01 kind     00000001 SEGNO    00   BLKNUM
-//
-// SlruSegSize:
-// 01 kind     00000001 SEGNO    00   FFFFFFFF
-//
-// TwoPhaseDir:
-// 02 00000000 00000000 00000000 00   00000000
-//
-// TwoPhaseFile:
-// 02 00000000 00000000 00000000 00   XID
-//
-// ControlFile:
-// 03 00000000 00000000 00000000 00   00000000
-//
-// Checkpoint:
-// 03 00000000 00000000 00000000 00   00000001
-//
-// AuxFiles:
-// 03 00000000 00000000 00000000 00   00000002
-//
-
-//-- Section 01: relation data and metadata
-
-const DBDIR_KEY: Key = Key {
-    field1: 0x00,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0xffffffff,
-        field5: 0xff,
-        field6: 0xffffffff,
-    }
-}
-
-fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
-
-fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 1,
-    }
-}
-
-pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: blknum,
-    }
-}
-
-fn rel_size_to_key(rel: RelTag) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: 0xffffffff,
-    }
-}
-
-fn rel_key_range(rel: RelTag) -> Range<Key> {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: 0,
-    }..Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum + 1,
-        field6: 0,
-    }
-}
-
-//-- Section 02: SLRUs
-
-fn slru_dir_to_key(kind: SlruKind) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
-
-fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: blknum,
-    }
-}
-
-fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: 0xffffffff,
-    }
-}
-
-fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
-    let field2 = match kind {
-        SlruKind::Clog => 0x00,
-        SlruKind::MultiXactMembers => 0x01,
-        SlruKind::MultiXactOffsets => 0x02,
-    };
-
-    Key {
-        field1: 0x01,
-        field2,
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: 0x01,
-        field2,
-        field3: 1,
-        field4: segno,
-        field5: 1,
-        field6: 0,
-    }
-}
-
-//-- Section 03: pg_twophase
-
-const TWOPHASEDIR_KEY: Key = Key {
-    field1: 0x02,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-fn twophase_file_key(xid: TransactionId) -> Key {
-    Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
-    }
-}
-
-fn twophase_key_range(xid: TransactionId) -> Range<Key> {
-    let (next_xid, overflowed) = xid.overflowing_add(1);
-
-    Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
-    }..Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: u8::from(overflowed),
-        field6: next_xid,
-    }
-}
-
-//-- Section 03: Control file
-const CONTROLFILE_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-const CHECKPOINT_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 1,
-};
-
-const AUX_FILES_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 2,
-};
-
-// Reverse mappings for a few Keys.
-// These are needed by WAL redo manager.
-
-// AUX_FILES currently stores only data for logical replication (slots etc), and
-// we don't preserve these on a branch because safekeepers can't follow timeline
-// switch (and generally it likely should be optional), so ignore these.
-pub fn is_inherited_key(key: Key) -> bool {
-    key != AUX_FILES_KEY
-}
-
-pub fn is_rel_fsm_block_key(key: Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
-}
-
-pub fn is_rel_vm_block_key(key: Key) -> bool {
-    key.field1 == 0x00
-        && key.field4 != 0
-        && key.field5 == VISIBILITYMAP_FORKNUM
-        && key.field6 != 0xffffffff
-}
-
-pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
-    Ok(match key.field1 {
-        0x01 => {
-            let kind = match key.field2 {
-                0x00 => SlruKind::Clog,
-                0x01 => SlruKind::MultiXactMembers,
-                0x02 => SlruKind::MultiXactOffsets,
-                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
-            };
-            let segno = key.field4;
-            let blknum = key.field6;
-
-            (kind, segno, blknum)
-        }
-        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
-    })
-}
-
-fn is_slru_block_key(key: Key) -> bool {
-    key.field1 == 0x01                // SLRU-related
-        && key.field3 == 0x00000001   // but not SlruDir
-        && key.field6 != 0xffffffff // and not SlruSegSize
-}
-
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1d9b91c9ce..7bb5881aab 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -91,7 +91,6 @@ use std::fs;
 use std::fs::File;
 use std::io;
 use std::ops::Bound::Included;
-use std::process::Stdio;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
@@ -628,9 +627,15 @@ impl Tenant {
             deletion_queue_client,
         ));
 
+        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
+        // we shut down while attaching.
+        let Ok(attach_gate_guard) = tenant.gate.enter() else {
+            // We just created the Tenant: nothing else can have shut it down yet
+            unreachable!();
+        };
+
         // Do all the hard work in the background
         let tenant_clone = Arc::clone(&tenant);
-
         let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
         task_mgr::spawn(
             &tokio::runtime::Handle::current(),
@@ -640,6 +645,8 @@ impl Tenant {
             "attach tenant",
             false,
             async move {
+                let _gate_guard = attach_gate_guard;
+
                 // Is this tenant being spawned as part of process startup?
                 let starting_up = init_order.is_some();
                 scopeguard::defer! {
@@ -814,7 +821,7 @@ impl Tenant {
                     SpawnMode::Create => None,
                     SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
                 };
-                match tenant_clone.attach(preload, &ctx).await {
+                match tenant_clone.attach(preload, mode, &ctx).await {
                     Ok(()) => {
                         info!("attach finished, activating");
                         if let Some(t)=  attach_timer {t.observe_duration();}
@@ -901,15 +908,20 @@ impl Tenant {
     async fn attach(
         self: &Arc<Tenant>,
         preload: Option<TenantPreload>,
+        mode: SpawnMode,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         span::debug_assert_current_span_has_tenant_id();
 
         failpoint_support::sleep_millis_async!("before-attaching-tenant");
 
-        let preload = match preload {
-            Some(p) => p,
-            None => {
+        let preload = match (preload, mode) {
+            (Some(p), _) => p,
+            (None, SpawnMode::Create) => TenantPreload {
+                deleting: false,
+                timelines: HashMap::new(),
+            },
+            (None, SpawnMode::Normal) => {
                 // Deprecated dev mode: load from local disk state instead of remote storage
                 // https://github.com/neondatabase/neon/issues/5624
                 return self.load_local(ctx).await;
@@ -1017,7 +1029,10 @@ impl Tenant {
         // IndexPart is the source of truth.
         self.clean_up_timelines(&existent_timelines)?;
 
-        failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel);
+        fail::fail_point!("attach-before-activate", |_| {
+            anyhow::bail!("attach-before-activate");
+        });
+        failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel);
 
         info!("Done");
 
@@ -1681,9 +1696,13 @@ impl Tenant {
         ctx: &RequestContext,
     ) -> Result<Arc<Timeline>, CreateTimelineError> {
         if !self.is_active() {
-            return Err(CreateTimelineError::Other(anyhow::anyhow!(
-                "Cannot create timelines on inactive tenant"
-            )));
+            if matches!(self.current_state(), TenantState::Stopping { .. }) {
+                return Err(CreateTimelineError::ShuttingDown);
+            } else {
+                return Err(CreateTimelineError::Other(anyhow::anyhow!(
+                    "Cannot create timelines on inactive tenant"
+                )));
+            }
         }
 
         let _gate = self
@@ -3759,27 +3778,25 @@ async fn run_initdb(
         .env_clear()
         .env("LD_LIBRARY_PATH", &initdb_lib_dir)
         .env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
-        .stdout(Stdio::piped())
-        .stderr(Stdio::piped())
-        // If the `select!` below doesn't finish the `wait_with_output`,
-        // let the task get `wait()`ed for asynchronously by tokio.
-        // This means there is a slim chance we can go over the INIT_DB_SEMAPHORE.
-        // TODO: fix for this is non-trivial, see
-        // https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021
-        //
-        .kill_on_drop(true)
         .spawn()?;
 
-    tokio::select! {
-        initdb_output = initdb_command.wait_with_output() => {
-            let initdb_output = initdb_output?;
-            if !initdb_output.status.success() {
-                return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr));
-            }
-        }
-        _ = cancel.cancelled() => {
-            return Err(InitdbError::Cancelled);
-        }
+    // Ideally we'd select here with the cancellation token, but the problem is that
+    // we can't safely terminate initdb: it launches processes of its own, and killing
+    // initdb doesn't kill them. After we return from this function, we want the target
+    // directory to be able to be cleaned up.
+    // See https://github.com/neondatabase/neon/issues/6385
+    let initdb_output = initdb_command.wait_with_output().await?;
+    if !initdb_output.status.success() {
+        return Err(InitdbError::Failed(
+            initdb_output.status,
+            initdb_output.stderr,
+        ));
+    }
+
+    // This isn't true cancellation support, see above. Still return an error to
+    // exercise the cancellation code path.
+    if cancel.is_cancelled() {
+        return Err(InitdbError::Cancelled);
     }
 
     Ok(())
@@ -4035,7 +4052,7 @@ pub(crate) mod harness {
                         .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                         .await?;
                     tenant
-                        .attach(Some(preload), ctx)
+                        .attach(Some(preload), SpawnMode::Normal, ctx)
                         .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                         .await?;
                 }
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index 0617017528..1b6bccc120 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -5,10 +5,10 @@
 use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
-use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
+use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
 use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
-use std::ops::{Deref, DerefMut};
+use std::ops::Deref;
 
 /// This is implemented by anything that can read 8 kB (PAGE_SZ)
 /// blocks, using the page cache
@@ -39,6 +39,8 @@ pub enum BlockLease<'a> {
     EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
     #[cfg(test)]
     Arc(std::sync::Arc<[u8; PAGE_SZ]>),
+    #[cfg(test)]
+    Vec(Vec<u8>),
 }
 
 impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -63,6 +65,10 @@ impl<'a> Deref for BlockLease<'a> {
             BlockLease::EphemeralFileMutableTail(v) => v,
             #[cfg(test)]
             BlockLease::Arc(v) => v.deref(),
+            #[cfg(test)]
+            BlockLease::Vec(v) => {
+                TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ")
+            }
         }
     }
 }
@@ -169,10 +175,14 @@ impl FileBlockReader {
     }
 
     /// Read a page from the underlying file into given buffer.
-    async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
+    async fn fill_buffer(
+        &self,
+        buf: PageWriteGuard<'static>,
+        blkno: u32,
+    ) -> Result<PageWriteGuard<'static>, std::io::Error> {
         assert!(buf.len() == PAGE_SZ);
         self.file
-            .read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
+            .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64)
             .await
     }
     /// Read a block.
@@ -196,9 +206,9 @@ impl FileBlockReader {
                 )
             })? {
             ReadBufResult::Found(guard) => Ok(guard.into()),
-            ReadBufResult::NotFound(mut write_guard) => {
+            ReadBufResult::NotFound(write_guard) => {
                 // Read the page from disk into the buffer
-                self.fill_buffer(write_guard.deref_mut(), blknum).await?;
+                let write_guard = self.fill_buffer(write_guard, blknum).await?;
                 Ok(write_guard.mark_valid().into())
             }
         }
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index ecffd4e6c1..97de0cdcf9 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -409,7 +409,10 @@ impl DeleteTenantFlow {
             .await
             .expect("cant be stopping or broken");
 
-        tenant.attach(preload, ctx).await.context("attach")?;
+        tenant
+            .attach(preload, super::SpawnMode::Normal, ctx)
+            .await
+            .context("attach")?;
 
         Self::background(
             guard,
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 591eacd104..6b8cd77d78 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -5,11 +5,11 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{self, VirtualFile};
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use std::cmp::min;
-use std::fs::OpenOptions;
+
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
 use std::sync::atomic::AtomicU64;
@@ -47,7 +47,10 @@ impl EphemeralFile {
 
         let file = VirtualFile::open_with_options(
             &filename,
-            OpenOptions::new().read(true).write(true).create(true),
+            virtual_file::OpenOptions::new()
+                .read(true)
+                .write(true)
+                .create(true),
         )
         .await?;
 
@@ -89,11 +92,10 @@ impl EphemeralFile {
                 page_cache::ReadBufResult::Found(guard) => {
                     return Ok(BlockLease::PageReadGuard(guard))
                 }
-                page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                    let buf: &mut [u8] = write_guard.deref_mut();
-                    debug_assert_eq!(buf.len(), PAGE_SZ);
-                    self.file
-                        .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
+                page_cache::ReadBufResult::NotFound(write_guard) => {
+                    let write_guard = self
+                        .file
+                        .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
                         .await?;
                     let read_guard = write_guard.mark_valid();
                     return Ok(BlockLease::PageReadGuard(read_guard));
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 9b6225501f..c31d401e84 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -283,15 +283,15 @@ impl LayerMap {
     ///
     /// This is used for garbage collection, to determine if an old layer can
     /// be deleted.
-    pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> Result<bool> {
+    pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> bool {
         if key.is_empty() {
             // Vacuously true. There's a newer image for all 0 of the kerys in the range.
-            return Ok(true);
+            return true;
         }
 
         let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
             Some(v) => v,
-            None => return Ok(false),
+            None => return false,
         };
 
         let start = key.start.to_i128();
@@ -304,17 +304,17 @@ impl LayerMap {
 
         // Check the start is covered
         if !layer_covers(version.image_coverage.query(start)) {
-            return Ok(false);
+            return false;
         }
 
         // Check after all changes of coverage
         for (_, change_val) in version.image_coverage.range(start..end) {
             if !layer_covers(change_val) {
-                return Ok(false);
+                return false;
             }
         }
 
-        Ok(true)
+        true
     }
 
     pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
@@ -325,18 +325,14 @@ impl LayerMap {
     /// Divide the whole given range of keys into sub-ranges based on the latest
     /// image layer that covers each range at the specified lsn (inclusive).
     /// This is used when creating  new image layers.
-    ///
-    // FIXME: clippy complains that the result type is very complex. She's probably
-    // right...
-    #[allow(clippy::type_complexity)]
     pub fn image_coverage(
         &self,
         key_range: &Range<Key>,
         lsn: Lsn,
-    ) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
+    ) -> Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> {
         let version = match self.historic.get().unwrap().get_version(lsn.0) {
             Some(v) => v,
-            None => return Ok(vec![]),
+            None => return vec![],
         };
 
         let start = key_range.start.to_i128();
@@ -359,7 +355,7 @@ impl LayerMap {
         let kr = Key::from_i128(current_key)..Key::from_i128(end);
         coverage.push((kr, current_val.take()));
 
-        Ok(coverage)
+        coverage
     }
 
     pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
@@ -410,24 +406,19 @@ impl LayerMap {
     /// This number is used to compute the largest number of deltas that
     /// we'll need to visit for any page reconstruction in this region.
     /// We use this heuristic to decide whether to create an image layer.
-    pub fn count_deltas(
-        &self,
-        key: &Range<Key>,
-        lsn: &Range<Lsn>,
-        limit: Option<usize>,
-    ) -> Result<usize> {
+    pub fn count_deltas(&self, key: &Range<Key>, lsn: &Range<Lsn>, limit: Option<usize>) -> usize {
         // We get the delta coverage of the region, and for each part of the coverage
         // we recurse right underneath the delta. The recursion depth is limited by
         // the largest result this function could return, which is in practice between
         // 3 and 10 (since we usually try to create an image when the number gets larger).
 
         if lsn.is_empty() || key.is_empty() || limit == Some(0) {
-            return Ok(0);
+            return 0;
         }
 
         let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
             Some(v) => v,
-            None => return Ok(0),
+            None => return 0,
         };
 
         let start = key.start.to_i128();
@@ -448,8 +439,7 @@ impl LayerMap {
                     if !kr.is_empty() {
                         let base_count = Self::is_reimage_worthy(&val, key) as usize;
                         let new_limit = limit.map(|l| l - base_count);
-                        let max_stacked_deltas_underneath =
-                            self.count_deltas(&kr, &lr, new_limit)?;
+                        let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
                         max_stacked_deltas = std::cmp::max(
                             max_stacked_deltas,
                             base_count + max_stacked_deltas_underneath,
@@ -471,7 +461,7 @@ impl LayerMap {
                 if !kr.is_empty() {
                     let base_count = Self::is_reimage_worthy(&val, key) as usize;
                     let new_limit = limit.map(|l| l - base_count);
-                    let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
+                    let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
                     max_stacked_deltas = std::cmp::max(
                         max_stacked_deltas,
                         base_count + max_stacked_deltas_underneath,
@@ -480,7 +470,7 @@ impl LayerMap {
             }
         }
 
-        Ok(max_stacked_deltas)
+        max_stacked_deltas
     }
 
     /// Count how many reimage-worthy layers we need to visit for given key-lsn pair.
@@ -592,10 +582,7 @@ impl LayerMap {
                     if limit == Some(difficulty) {
                         break;
                     }
-                    for (img_range, last_img) in self
-                        .image_coverage(range, lsn)
-                        .expect("why would this err?")
-                    {
+                    for (img_range, last_img) in self.image_coverage(range, lsn) {
                         if limit == Some(difficulty) {
                             break;
                         }
@@ -606,9 +593,7 @@ impl LayerMap {
                         };
 
                         if img_lsn < lsn {
-                            let num_deltas = self
-                                .count_deltas(&img_range, &(img_lsn..lsn), limit)
-                                .expect("why would this err lol?");
+                            let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit);
                             difficulty = std::cmp::max(difficulty, num_deltas);
                         }
                     }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 84c7a20247..32535e0134 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -7,6 +7,7 @@ use pageserver_api::models::ShardParameters;
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
+use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashMap};
 use std::ops::Deref;
 use std::sync::Arc;
@@ -32,7 +33,8 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
-    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
+    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
+    TenantConfOpt,
 };
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
@@ -466,6 +468,26 @@ pub async fn init_tenant_mgr(
             // We have a generation map: treat it as the authority for whether
             // this tenant is really attached.
             if let Some(gen) = generations.get(&tenant_shard_id) {
+                if let LocationMode::Attached(attached) = &location_conf.mode {
+                    if attached.generation > *gen {
+                        tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                            "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
+                            attached.generation
+                        );
+
+                        // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
+                        // local disk content: demote to secondary rather than detaching.
+                        tenants.insert(
+                            tenant_shard_id,
+                            TenantSlot::Secondary(SecondaryTenant::new(
+                                tenant_shard_id,
+                                location_conf.shard,
+                                location_conf.tenant_conf,
+                                &SecondaryLocationConfig { warm: false },
+                            )),
+                        );
+                    }
+                }
                 *gen
             } else {
                 match &location_conf.mode {
@@ -721,7 +743,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
         tokio::select! {
             Some(joined) = join_set.join_next() => {
                 match joined {
-                    Ok(()) => {}
+                    Ok(()) => {},
                     Err(join_error) if join_error.is_cancelled() => {
                         unreachable!("we are not cancelling any of the tasks");
                     }
@@ -882,7 +904,7 @@ impl TenantManager {
         tenant_shard_id: TenantShardId,
         new_location_config: LocationConf,
         flush: Option<Duration>,
-        spawn_mode: SpawnMode,
+        mut spawn_mode: SpawnMode,
         ctx: &RequestContext,
     ) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
         debug_assert_current_span_has_tenant_id();
@@ -902,19 +924,29 @@ impl TenantManager {
                 tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
             match (&new_location_config.mode, peek_slot) {
                 (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
-                    if attach_conf.generation == tenant.generation {
-                        // A transition from Attached to Attached in the same generation, we may
-                        // take our fast path and just provide the updated configuration
-                        // to the tenant.
-                        tenant.set_new_location_config(
-                            AttachedTenantConf::try_from(new_location_config.clone())
-                                .map_err(UpsertLocationError::BadRequest)?,
-                        );
+                    match attach_conf.generation.cmp(&tenant.generation) {
+                        Ordering::Equal => {
+                            // A transition from Attached to Attached in the same generation, we may
+                            // take our fast path and just provide the updated configuration
+                            // to the tenant.
+                            tenant.set_new_location_config(
+                                AttachedTenantConf::try_from(new_location_config.clone())
+                                    .map_err(UpsertLocationError::BadRequest)?,
+                            );
 
-                        Some(FastPathModified::Attached(tenant.clone()))
-                    } else {
-                        // Different generations, fall through to general case
-                        None
+                            Some(FastPathModified::Attached(tenant.clone()))
+                        }
+                        Ordering::Less => {
+                            return Err(UpsertLocationError::BadRequest(anyhow::anyhow!(
+                                "Generation {:?} is less than existing {:?}",
+                                attach_conf.generation,
+                                tenant.generation
+                            )));
+                        }
+                        Ordering::Greater => {
+                            // Generation advanced, fall through to general case of replacing `Tenant` object
+                            None
+                        }
                     }
                 }
                 (
@@ -1019,6 +1051,12 @@ impl TenantManager {
                     }
                 }
                 slot_guard.drop_old_value().expect("We just shut it down");
+
+                // Edge case: we were called with SpawnMode::Create, but a Tenant already existed: the
+                // caller believes it is creating a tenant that is in fact already present.  We must switch to
+                // Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
+                // rather than assuming it to be empty.
+                spawn_mode = SpawnMode::Normal;
             }
             Some(TenantSlot::Secondary(state)) => {
                 info!("Shutting down secondary tenant");
@@ -1102,14 +1140,46 @@ impl TenantManager {
             None
         };
 
-        slot_guard.upsert(new_slot).map_err(|e| match e {
-            TenantSlotUpsertError::InternalError(e) => {
-                UpsertLocationError::Other(anyhow::anyhow!(e))
+        match slot_guard.upsert(new_slot) {
+            Err(TenantSlotUpsertError::InternalError(e)) => {
+                Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
             }
-            TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
-        })?;
+            Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
+            Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
+                // If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then
+                // we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants
+                // are shutdown.
+                //
+                // We must shut it down inline here.
+                match new_slot {
+                    TenantSlot::InProgress(_) => {
+                        // Unreachable because we never insert an InProgress
+                        unreachable!()
+                    }
+                    TenantSlot::Attached(tenant) => {
+                        let (_guard, progress) = utils::completion::channel();
+                        info!("Shutting down just-spawned tenant, because tenant manager is shut down");
+                        match tenant.shutdown(progress, false).await {
+                            Ok(()) => {
+                                info!("Finished shutting down just-spawned tenant");
+                            }
+                            Err(barrier) => {
+                                info!("Shutdown already in progress, waiting for it to complete");
+                                barrier.wait().await;
+                            }
+                        }
+                    }
+                    TenantSlot::Secondary(secondary_tenant) => {
+                        secondary_tenant.shutdown().await;
+                    }
+                }
 
-        Ok(attached_tenant)
+                Err(UpsertLocationError::Unavailable(
+                    TenantMapError::ShuttingDown,
+                ))
+            }
+            Ok(()) => Ok(attached_tenant),
+        }
     }
 
     /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
@@ -1728,14 +1798,31 @@ pub(crate) enum TenantSlotError {
 
 /// Superset of TenantMapError: issues that can occur when using a SlotGuard
 /// to insert a new value.
-#[derive(Debug, thiserror::Error)]
-pub enum TenantSlotUpsertError {
+#[derive(thiserror::Error)]
+pub(crate) enum TenantSlotUpsertError {
     /// An error where the slot is in an unexpected state, indicating a code bug
     #[error("Internal error updating Tenant")]
     InternalError(Cow<'static, str>),
 
     #[error(transparent)]
-    MapState(#[from] TenantMapError),
+    MapState(TenantMapError),
+
+    // If we encounter TenantManager shutdown during upsert, we must carry the Completion
+    // from the SlotGuard, so that the caller can hold it while they clean up: otherwise
+    // TenantManager shutdown might race ahead before we're done cleaning up any Tenant that
+    // was protected by the SlotGuard.
+    #[error("Shutting down")]
+    ShuttingDown((TenantSlot, utils::completion::Completion)),
+}
+
+impl std::fmt::Debug for TenantSlotUpsertError {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Self::InternalError(reason) => write!(f, "Internal Error {reason}"),
+            Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"),
+            Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"),
+        }
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -1784,7 +1871,7 @@ pub struct SlotGuard {
 
     /// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will
     /// release any waiters as soon as this SlotGuard is dropped.
-    _completion: utils::completion::Completion,
+    completion: utils::completion::Completion,
 }
 
 impl SlotGuard {
@@ -1797,7 +1884,7 @@ impl SlotGuard {
             tenant_shard_id,
             old_value,
             upserted: false,
-            _completion: completion,
+            completion,
         }
     }
 
@@ -1830,9 +1917,16 @@ impl SlotGuard {
             }
 
             let m = match &mut *locked {
-                TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()),
+                TenantsMap::Initializing => {
+                    return Err(TenantSlotUpsertError::MapState(
+                        TenantMapError::StillInitializing,
+                    ))
+                }
                 TenantsMap::ShuttingDown(_) => {
-                    return Err(TenantMapError::ShuttingDown.into());
+                    return Err(TenantSlotUpsertError::ShuttingDown((
+                        new_value,
+                        self.completion.clone(),
+                    )));
                 }
                 TenantsMap::Open(m) => m,
             };
@@ -1880,7 +1974,9 @@ impl SlotGuard {
                 Err(TenantSlotUpsertError::InternalError(_)) => {
                     // We already logged the error, nothing else we can do.
                 }
-                Err(TenantSlotUpsertError::MapState(_)) => {
+                Err(
+                    TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_),
+                ) => {
                     // If the map is shutting down, we need not replace anything
                 }
                 Ok(()) => {}
@@ -1978,18 +2074,22 @@ fn tenant_map_peek_slot<'a>(
     tenant_shard_id: &TenantShardId,
     mode: TenantSlotPeekMode,
 ) -> Result<Option<&'a TenantSlot>, TenantMapError> {
-    let m = match tenants.deref() {
-        TenantsMap::Initializing => return Err(TenantMapError::StillInitializing),
+    match tenants.deref() {
+        TenantsMap::Initializing => Err(TenantMapError::StillInitializing),
         TenantsMap::ShuttingDown(m) => match mode {
-            TenantSlotPeekMode::Read => m,
-            TenantSlotPeekMode::Write => {
-                return Err(TenantMapError::ShuttingDown);
-            }
+            TenantSlotPeekMode::Read => Ok(Some(
+                // When reading in ShuttingDown state, we must translate None results
+                // into a ShuttingDown error, because absence of a tenant shard ID in the map
+                // isn't a reliable indicator of the tenant being gone: it might have been
+                // InProgress when shutdown started, and cleaned up from that state such
+                // that it's now no longer in the map.  Callers will have to wait until
+                // we next start up to get a proper answer.  This avoids incorrect 404 API responses.
+                m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?,
+            )),
+            TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown),
         },
-        TenantsMap::Open(m) => m,
-    };
-
-    Ok(m.get(tenant_shard_id))
+        TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)),
+    }
 }
 
 enum TenantSlotAcquireMode {
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 1b5f861c90..80ff5c9a2d 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -257,6 +257,8 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
 
 pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
 
+pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
+
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
 
@@ -1066,6 +1068,28 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    pub(crate) async fn preserve_initdb_archive(
+        self: &Arc<Self>,
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        backoff::retry(
+            || async {
+                upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
+                    .await
+            },
+            |_e| false,
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "preserve_initdb_tar_zst",
+            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")),
+        )
+        .await
+        .context("backing up initdb archive")?;
+        Ok(())
+    }
+
     /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
     /// The function deletes layer files one by one, then lists the prefix to see if we leaked something
     /// deletes leaked files if any and proceeds with deletion of index file at the end.
@@ -1101,6 +1125,14 @@ impl RemoteTimelineClient {
         let layer_deletion_count = layers.len();
         self.deletion_queue_client.push_immediate(layers).await?;
 
+        // Delete the initdb.tar.zst, which is not always present, but deletion attempts of
+        // nonexistent objects are not considered errors.
+        let initdb_path =
+            remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id);
+        self.deletion_queue_client
+            .push_immediate(vec![initdb_path])
+            .await?;
+
         // Do not delete index part yet, it is needed for possible retry. If we remove it first
         // and retry will arrive to different pageserver there wont be any traces of it on remote storage
         let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
@@ -1148,10 +1180,8 @@ impl RemoteTimelineClient {
                 if p == &latest_index {
                     return false;
                 }
-                if let Some(name) = p.object_name() {
-                    if name == INITDB_PATH {
-                        return false;
-                    }
+                if p.object_name() == Some(INITDB_PRESERVED_PATH) {
+                    return false;
                 }
                 true
             })
@@ -1724,6 +1754,16 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId
     .expect("Failed to construct path")
 }
 
+pub fn remote_initdb_preserved_archive_path(
+    tenant_id: &TenantId,
+    timeline_id: &TimelineId,
+) -> RemotePath {
+    RemotePath::from_string(&format!(
+        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PRESERVED_PATH}"
+    ))
+    .expect("Failed to construct path")
+}
+
 pub fn remote_index_path(
     tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index d3956163c8..4309c683e2 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -32,7 +32,8 @@ use utils::id::TimelineId;
 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
     parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
-    FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
+    remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
+    INITDB_PATH,
 };
 
 ///
@@ -430,6 +431,9 @@ pub(crate) async fn download_initdb_tar_zst(
 
     let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
 
+    let remote_preserved_path =
+        remote_initdb_preserved_archive_path(&tenant_shard_id.tenant_id, timeline_id);
+
     let timeline_path = conf.timelines_path(tenant_shard_id);
 
     if !timeline_path.exists() {
@@ -456,8 +460,16 @@ pub(crate) async fn download_initdb_tar_zst(
                 .with_context(|| format!("tempfile creation {temp_path}"))
                 .map_err(DownloadError::Other)?;
 
-            let download =
-                download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
+            let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
+                .await
+            {
+                Ok(dl) => dl,
+                Err(DownloadError::NotFound) => {
+                    download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
+                        .await?
+                }
+                Err(other) => Err(other)?,
+            };
             let mut download = tokio_util::io::StreamReader::new(download.download_stream);
             let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
 
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index 11c6956875..58d95f75c2 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -13,8 +13,8 @@ use super::Generation;
 use crate::{
     config::PageServerConf,
     tenant::remote_timeline_client::{
-        index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
-        upload_cancellable,
+        index::IndexPart, remote_index_path, remote_initdb_archive_path,
+        remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
     },
 };
 use remote_storage::GenericRemoteStorage;
@@ -144,3 +144,16 @@ pub(crate) async fn upload_initdb_dir(
     .await
     .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }
+
+pub(crate) async fn preserve_initdb_archive(
+    storage: &GenericRemoteStorage,
+    tenant_id: &TenantId,
+    timeline_id: &TimelineId,
+    cancel: &CancellationToken,
+) -> anyhow::Result<()> {
+    let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
+    let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
+    upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
+        .await
+        .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
+}
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 4ded6d6a8d..3a445ef71e 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -36,7 +36,7 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::Timeline;
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
@@ -649,7 +649,7 @@ impl DeltaLayer {
     {
         let file = VirtualFile::open_with_options(
             path,
-            &*std::fs::OpenOptions::new().read(true).write(true),
+            virtual_file::OpenOptions::new().read(true).write(true),
         )
         .await
         .with_context(|| format!("Failed to open file '{}'", path))?;
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index f03c7642eb..c62e6aed51 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -34,7 +34,7 @@ use crate::tenant::storage_layer::{
     LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
 use crate::tenant::Timeline;
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
@@ -327,7 +327,7 @@ impl ImageLayer {
     {
         let file = VirtualFile::open_with_options(
             path,
-            &*std::fs::OpenOptions::new().read(true).write(true),
+            virtual_file::OpenOptions::new().read(true).write(true),
         )
         .await
         .with_context(|| format!("Failed to open file '{}'", path))?;
@@ -492,11 +492,15 @@ impl ImageLayerWriterInner {
             },
         );
         info!("new image layer {path}");
-        let mut file = VirtualFile::open_with_options(
-            &path,
-            std::fs::OpenOptions::new().write(true).create_new(true),
-        )
-        .await?;
+        let mut file = {
+            VirtualFile::open_with_options(
+                &path,
+                virtual_file::OpenOptions::new()
+                    .write(true)
+                    .create_new(true),
+            )
+            .await?
+        };
         // make room for the header block
         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
         let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 2b2fcc7711..5f39c46a84 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -181,8 +182,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                     );
                     error_run_count += 1;
                     let wait_duration = Duration::from_secs_f64(wait_duration);
-                    error!(
-                        "Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
+                    log_compaction_error(
+                        &e,
+                        error_run_count,
+                        &wait_duration,
+                        cancel.is_cancelled(),
                     );
                     wait_duration
                 } else {
@@ -210,6 +214,58 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
 }
 
+fn log_compaction_error(
+    e: &CompactionError,
+    error_run_count: u32,
+    sleep_duration: &std::time::Duration,
+    task_cancelled: bool,
+) {
+    use crate::tenant::upload_queue::NotInitialized;
+    use crate::tenant::PageReconstructError;
+    use CompactionError::*;
+
+    enum LooksLike {
+        Info,
+        Error,
+    }
+
+    let decision = match e {
+        ShuttingDown => None,
+        _ if task_cancelled => Some(LooksLike::Info),
+        Other(e) => {
+            let root_cause = e.root_cause();
+
+            let is_stopping = {
+                let upload_queue = root_cause
+                    .downcast_ref::<NotInitialized>()
+                    .is_some_and(|e| e.is_stopping());
+
+                let timeline = root_cause
+                    .downcast_ref::<PageReconstructError>()
+                    .is_some_and(|e| e.is_stopping());
+
+                upload_queue || timeline
+            };
+
+            if is_stopping {
+                Some(LooksLike::Info)
+            } else {
+                Some(LooksLike::Error)
+            }
+        }
+    };
+
+    match decision {
+        Some(LooksLike::Info) => info!(
+            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
+        ),
+        Some(LooksLike::Error) => error!(
+            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
+        ),
+        None => {}
+    }
+}
+
 ///
 /// GC task's main loop
 ///
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0cb7cf26f2..70c6ee2042 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -14,6 +14,7 @@ use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::{
+    keyspace::{key_range_size, KeySpaceAccum},
     models::{
         DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
         LayerMapInfo, TimelineState,
@@ -32,7 +33,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::sync::gate::Gate;
 
-use std::collections::{BinaryHeap, HashMap, HashSet};
+use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
@@ -73,8 +74,8 @@ use crate::metrics::{
     TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
-use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::tenant::config::TenantConfOpt;
+use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::ShardIndex;
 
@@ -391,8 +392,7 @@ pub(crate) enum PageReconstructError {
     #[error("Ancestor LSN wait error: {0}")]
     AncestorLsnTimeout(#[from] WaitLsnError),
 
-    /// The operation was cancelled
-    #[error("Cancelled")]
+    #[error("timeline shutting down")]
     Cancelled,
 
     /// The ancestor of this is being stopped
@@ -404,6 +404,34 @@ pub(crate) enum PageReconstructError {
     WalRedo(anyhow::Error),
 }
 
+impl PageReconstructError {
+    /// Returns true if this error indicates a tenant/timeline shutdown-like situation
+    pub(crate) fn is_stopping(&self) -> bool {
+        use PageReconstructError::*;
+        match self {
+            Other(_) => false,
+            AncestorLsnTimeout(_) => false,
+            Cancelled | AncestorStopping(_) => true,
+            WalRedo(_) => false,
+        }
+    }
+}
+
+#[derive(thiserror::Error, Debug)]
+enum CreateImageLayersError {
+    #[error("timeline shutting down")]
+    Cancelled,
+
+    #[error(transparent)]
+    GetVectoredError(GetVectoredError),
+
+    #[error(transparent)]
+    PageReconstructError(PageReconstructError),
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 #[derive(thiserror::Error, Debug)]
 enum FlushLayerError {
     /// Timeline cancellation token was cancelled
@@ -411,12 +439,24 @@ enum FlushLayerError {
     Cancelled,
 
     #[error(transparent)]
-    PageReconstructError(#[from] PageReconstructError),
+    CreateImageLayersError(CreateImageLayersError),
 
     #[error(transparent)]
     Other(#[from] anyhow::Error),
 }
 
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum GetVectoredError {
+    #[error("timeline shutting down")]
+    Cancelled,
+
+    #[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
+    Oversized(u64),
+
+    #[error("Requested at invalid LSN: {0}")]
+    InvalidLsn(Lsn),
+}
+
 #[derive(Clone, Copy)]
 pub enum LogicalSizeCalculationCause {
     Initial,
@@ -456,6 +496,45 @@ pub(crate) enum WaitLsnError {
     Timeout(String),
 }
 
+// The impls below achieve cancellation mapping for errors.
+// Perhaps there's a way of achieving this with less cruft.
+
+impl From<CreateImageLayersError> for CompactionError {
+    fn from(e: CreateImageLayersError) -> Self {
+        match e {
+            CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
+            _ => CompactionError::Other(e.into()),
+        }
+    }
+}
+
+impl From<CreateImageLayersError> for FlushLayerError {
+    fn from(e: CreateImageLayersError) -> Self {
+        match e {
+            CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
+            any => FlushLayerError::CreateImageLayersError(any),
+        }
+    }
+}
+
+impl From<PageReconstructError> for CreateImageLayersError {
+    fn from(e: PageReconstructError) -> Self {
+        match e {
+            PageReconstructError::Cancelled => CreateImageLayersError::Cancelled,
+            _ => CreateImageLayersError::PageReconstructError(e),
+        }
+    }
+}
+
+impl From<GetVectoredError> for CreateImageLayersError {
+    fn from(e: GetVectoredError) -> Self {
+        match e {
+            GetVectoredError::Cancelled => CreateImageLayersError::Cancelled,
+            _ => CreateImageLayersError::GetVectoredError(e),
+        }
+    }
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -575,6 +654,57 @@ impl Timeline {
         res
     }
 
+    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
+
+    /// Look up the page version of every key in `key_ranges` at `lsn`, one `get` per key.
+    ///
+    /// Rejects an invalid `lsn` and requests above `MAX_GET_VECTORED_KEYS` keys. This naive
+    /// implementation will be replaced by one which actually vectorizes the read path.
+    pub(crate) async fn get_vectored(
+        &self,
+        key_ranges: &[Range<Key>],
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        if !lsn.is_valid() {
+            return Err(GetVectoredError::InvalidLsn(lsn));
+        }
+
+        let key_count = key_ranges
+            .iter()
+            .map(|range| key_range_size(range) as u64)
+            .sum();
+        if key_count > Timeline::MAX_GET_VECTORED_KEYS {
+            return Err(GetVectoredError::Oversized(key_count));
+        }
+
+        let _timer = crate::metrics::GET_VECTORED_LATENCY
+            .for_task_kind(ctx.task_kind())
+            .map(|t| t.start_timer());
+
+        let mut values = BTreeMap::new();
+        for range in key_ranges {
+            let mut key = range.start;
+            while key < range.end {
+                assert!(!self.shard_identity.is_key_disposable(&key));
+
+                let block = self.get(key, lsn, ctx).await;
+                // Shutdown aborts the whole batch; other per-key results go into the map.
+                if matches!(
+                    block,
+                    Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
+                ) {
+                    return Err(GetVectoredError::Cancelled);
+                }
+
+                values.insert(key, block);
+                key = key.next();
+            }
+        }
+
+        Ok(values)
+    }
+
     /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
     pub fn get_last_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().last
@@ -2582,7 +2712,7 @@ impl Timeline {
                         return;
                     }
                     err @ Err(
-                        FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
+                        FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
                     ) => {
                         error!("could not flush frozen layer: {err:?}");
                         break err;
@@ -2859,6 +2989,21 @@ impl Timeline {
         Ok(())
     }
 
+    pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
+        if let Some(remote_client) = &self.remote_client {
+            remote_client
+                .preserve_initdb_archive(
+                    &self.tenant_shard_id.tenant_id,
+                    &self.timeline_id,
+                    &self.cancel,
+                )
+                .await?;
+        } else {
+            bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id);
+        }
+        Ok(())
+    }
+
     // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
     // in layer map immediately. The caller is responsible to put it into the layer map.
     async fn create_delta_layer(
@@ -2950,11 +3095,7 @@ impl Timeline {
     }
 
     // Is it time to create a new image layer for the given partition?
-    async fn time_for_new_image_layer(
-        &self,
-        partition: &KeySpace,
-        lsn: Lsn,
-    ) -> anyhow::Result<bool> {
+    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
         let threshold = self.get_image_creation_threshold();
 
         let guard = self.layers.read().await;
@@ -2974,20 +3115,20 @@ impl Timeline {
                     // but the range is already covered by image layers at more recent LSNs. Before we
                     // create a new image layer, check if the range is already covered at more recent LSNs.
                     if !layers
-                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
+                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))
                     {
                         debug!(
                             "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
                             img_range.start, img_range.end, cutoff_lsn, lsn
                         );
-                        return Ok(true);
+                        return true;
                     }
                 }
             }
         }
 
         for part_range in &partition.ranges {
-            let image_coverage = layers.image_coverage(part_range, lsn)?;
+            let image_coverage = layers.image_coverage(part_range, lsn);
             for (img_range, last_img) in image_coverage {
                 let img_lsn = if let Some(last_img) = last_img {
                     last_img.get_lsn_range().end
@@ -3008,7 +3149,7 @@ impl Timeline {
                 // after we read last_record_lsn, which is passed here in the 'lsn' argument.
                 if img_lsn < lsn {
                     let num_deltas =
-                        layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
+                        layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold));
 
                     max_deltas = max_deltas.max(num_deltas);
                     if num_deltas >= threshold {
@@ -3016,7 +3157,7 @@ impl Timeline {
                             "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
                             img_range.start, img_range.end, num_deltas, img_lsn, lsn
                         );
-                        return Ok(true);
+                        return true;
                     }
                 }
             }
@@ -3026,7 +3167,7 @@ impl Timeline {
             max_deltas,
             "none of the partitioned ranges had >= {threshold} deltas"
         );
-        Ok(false)
+        false
     }
 
     #[tracing::instrument(skip_all, fields(%lsn, %force))]
@@ -3036,7 +3177,7 @@ impl Timeline {
         lsn: Lsn,
         force: bool,
         ctx: &RequestContext,
-    ) -> Result<Vec<ResidentLayer>, PageReconstructError> {
+    ) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
         let timer = self.metrics.create_images_time_histo.start_timer();
         let mut image_layers = Vec::new();
 
@@ -3054,7 +3195,7 @@ impl Timeline {
         for partition in partitioning.parts.iter() {
             let img_range = start..partition.ranges.last().unwrap().end;
             start = img_range.end;
-            if force || self.time_for_new_image_layer(partition, lsn).await? {
+            if force || self.time_for_new_image_layer(partition, lsn).await {
                 let mut image_layer_writer = ImageLayerWriter::new(
                     self.conf,
                     self.timeline_id,
@@ -3065,10 +3206,12 @@ impl Timeline {
                 .await?;
 
                 fail_point!("image-layer-writer-fail-before-finish", |_| {
-                    Err(PageReconstructError::Other(anyhow::anyhow!(
+                    Err(CreateImageLayersError::Other(anyhow::anyhow!(
                         "failpoint image-layer-writer-fail-before-finish"
                     )))
                 });
+
+                let mut key_request_accum = KeySpaceAccum::new();
                 for range in &partition.ranges {
                     let mut key = range.start;
                     while key < range.end {
@@ -3081,34 +3224,55 @@ impl Timeline {
                             key = key.next();
                             continue;
                         }
-                        let img = match self.get(key, lsn, ctx).await {
-                            Ok(img) => img,
-                            Err(err) => {
-                                // If we fail to reconstruct a VM or FSM page, we can zero the
-                                // page without losing any actual user data. That seems better
-                                // than failing repeatedly and getting stuck.
-                                //
-                                // We had a bug at one point, where we truncated the FSM and VM
-                                // in the pageserver, but the Postgres didn't know about that
-                                // and continued to generate incremental WAL records for pages
-                                // that didn't exist in the pageserver. Trying to replay those
-                                // WAL records failed to find the previous image of the page.
-                                // This special case allows us to recover from that situation.
-                                // See https://github.com/neondatabase/neon/issues/2601.
-                                //
-                                // Unfortunately we cannot do this for the main fork, or for
-                                // any metadata keys, keys, as that would lead to actual data
-                                // loss.
-                                if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
-                                    warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
-                                    ZERO_PAGE.clone()
-                                } else {
-                                    return Err(err);
-                                }
-                            }
-                        };
 
-                        image_layer_writer.put_image(key, &img).await?;
+                        key_request_accum.add_key(key);
+                        if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
+                            || key.next() == range.end
+                        {
+                            let results = self
+                                .get_vectored(
+                                    &key_request_accum.consume_keyspace().ranges,
+                                    lsn,
+                                    ctx,
+                                )
+                                .await?;
+
+                            for (img_key, img) in results {
+                                let img = match img {
+                                    Ok(img) => img,
+                                    Err(err) => {
+                                        // If we fail to reconstruct a VM or FSM page, we can zero the
+                                        // page without losing any actual user data. That seems better
+                                        // than failing repeatedly and getting stuck.
+                                        //
+                                        // We had a bug at one point, where we truncated the FSM and VM
+                                        // in the pageserver, but the Postgres didn't know about that
+                                        // and continued to generate incremental WAL records for pages
+                                        // that didn't exist in the pageserver. Trying to replay those
+                                        // WAL records failed to find the previous image of the page.
+                                        // This special case allows us to recover from that situation.
+                                        // See https://github.com/neondatabase/neon/issues/2601.
+                                        //
+                                        // Unfortunately we cannot do this for the main fork, or for
+                                        // any metadata keys, as that would lead to actual data
+                                        // loss.
+                                        if is_rel_fsm_block_key(img_key)
+                                            || is_rel_vm_block_key(img_key)
+                                        {
+                                            warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
+                                            ZERO_PAGE.clone()
+                                        } else {
+                                            return Err(
+                                                CreateImageLayersError::PageReconstructError(err),
+                                            );
+                                        }
+                                    }
+                                };
+
+                                image_layer_writer.put_image(img_key, &img).await?;
+                            }
+                        }
+
                         key = key.next();
                     }
                 }
@@ -3484,7 +3648,7 @@ impl Timeline {
                     // has not so much sense, because largest holes will corresponds field1/field2 changes.
                     // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
                     // That is why it is better to measure size of hole as number of covering image layers.
-                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
+                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
                     if coverage_size >= min_hole_coverage_size {
                         heap.push(Hole {
                             key_range,
@@ -4110,7 +4274,7 @@ impl Timeline {
             // we cannot remove C, even though it's older than 2500, because
             // the delta layer 2000-3000 depends on it.
             if !layers
-                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
+                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
             {
                 debug!("keeping {} because it is the latest layer", l.filename());
                 // Collect delta key ranges that need image layers to allow garbage
@@ -4240,7 +4404,7 @@ impl Timeline {
                     .walredo_mgr
                     .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
                     .await
-                    .context("Failed to reconstruct a page image:")
+                    .context("reconstruct a page image")
                 {
                     Ok(img) => img,
                     Err(e) => return Err(PageReconstructError::WalRedo(e)),
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 32f14f40c5..0b61bc0a10 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -126,6 +126,27 @@ pub(super) struct UploadQueueStopped {
     pub(super) deleted_at: SetDeletedFlagProgress,
 }
 
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum NotInitialized {
+    #[error("queue is in state Uninitialized")]
+    Uninitialized,
+    #[error("queue is in state Stopping")]
+    Stopped,
+    #[error("queue is shutting down")]
+    ShuttingDown,
+}
+
+impl NotInitialized {
+    pub(crate) fn is_stopping(&self) -> bool {
+        use NotInitialized::*;
+        match self {
+            Uninitialized => false,
+            Stopped => true,
+            ShuttingDown => true,
+        }
+    }
+}
+
 impl UploadQueue {
     pub(crate) fn initialize_empty_remote(
         &mut self,
@@ -214,17 +235,17 @@ impl UploadQueue {
     }
 
     pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
+        use UploadQueue::*;
         match self {
-            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Initialized(x) => {
-                if !x.shutting_down {
-                    Ok(x)
+            Uninitialized => Err(NotInitialized::Uninitialized.into()),
+            Initialized(x) => {
+                if x.shutting_down {
+                    Err(NotInitialized::ShuttingDown.into())
                 } else {
-                    anyhow::bail!("queue is shutting down")
+                    Ok(x)
                 }
             }
+            Stopped(_) => Err(NotInitialized::Stopped.into()),
         }
     }
 
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index 9feefd8a32..066f06c88f 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -11,18 +11,28 @@
 //! src/backend/storage/file/fd.c
 //!
 use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
+
+use crate::page_cache::PageWriteGuard;
 use crate::tenant::TENANTS_SEGMENT_NAME;
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
-use std::fs::{self, File, OpenOptions};
+use std::fs::{self, File};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
+use tokio_epoll_uring::IoBufMut;
+
+use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
 use utils::fs_ext;
 
+mod io_engine;
+mod open_options;
+pub use io_engine::IoEngineKind;
+pub(crate) use open_options::*;
+
 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
 /// the underlying file is closed if the system is low on file descriptors,
@@ -106,7 +116,38 @@ struct SlotInner {
     tag: u64,
 
     /// the underlying file
-    file: Option<File>,
+    file: Option<OwnedFd>,
+}
+
+/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`].
+struct PageWriteGuardBuf {
+    page: PageWriteGuard<'static>,
+    init_up_to: usize,
+}
+// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot,
+// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved.
+unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf {
+    fn stable_ptr(&self) -> *const u8 {
+        self.page.as_ptr()
+    }
+    fn bytes_init(&self) -> usize {
+        self.init_up_to
+    }
+    fn bytes_total(&self) -> usize {
+        self.page.len()
+    }
+}
+// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access,
+// hence it's safe to hand out the `stable_mut_ptr()`.
+unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf {
+    fn stable_mut_ptr(&mut self) -> *mut u8 {
+        self.page.as_mut_ptr()
+    }
+
+    unsafe fn set_init(&mut self, pos: usize) {
+        assert!(pos <= self.page.len());
+        self.init_up_to = pos;
+    }
 }
 
 impl OpenFiles {
@@ -274,6 +315,10 @@ macro_rules! with_file {
         let $ident = $this.lock_file().await?;
         observe_duration!($op, $($body)*)
     }};
+    ($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{
+        let mut $ident = $this.lock_file().await?;
+        observe_duration!($op, $($body)*)
+    }};
 }
 
 impl VirtualFile {
@@ -326,7 +371,9 @@ impl VirtualFile {
         // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
         // where our caller doesn't get to use the returned VirtualFile before its
         // slot gets re-used by someone else.
-        let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;
+        let file = observe_duration!(StorageIoOperation::Open, {
+            open_options.open(path.as_std_path()).await?
+        });
 
         // Strip all options other than read and write.
         //
@@ -400,15 +447,13 @@ impl VirtualFile {
 
     /// Call File::sync_all() on the underlying File.
     pub async fn sync_all(&self) -> Result<(), Error> {
-        with_file!(self, StorageIoOperation::Fsync, |file| file
-            .as_ref()
-            .sync_all())
+        with_file!(self, StorageIoOperation::Fsync, |file_guard| file_guard
+            .with_std_file(|std_file| std_file.sync_all()))
     }
 
     pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        with_file!(self, StorageIoOperation::Metadata, |file| file
-            .as_ref()
-            .metadata())
+        with_file!(self, StorageIoOperation::Metadata, |file_guard| file_guard
+            .with_std_file(|std_file| std_file.metadata()))
     }
 
     /// Helper function internal to `VirtualFile` that looks up the underlying File,
@@ -417,7 +462,7 @@ impl VirtualFile {
     ///
     /// We are doing it via a macro as Rust doesn't support async closures that
     /// take on parameters with lifetimes.
-    async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
+    async fn lock_file(&self) -> Result<FileGuard, Error> {
         let open_files = get_open_files();
 
         let mut handle_guard = {
@@ -463,10 +508,9 @@ impl VirtualFile {
         // NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this
         // case from StorageIoOperation::Open. This helps with identifying thrashing
         // of the virtual file descriptor cache.
-        let file = observe_duration!(
-            StorageIoOperation::OpenAfterReplace,
-            self.open_options.open(&self.path)
-        )?;
+        let file = observe_duration!(StorageIoOperation::OpenAfterReplace, {
+            self.open_options.open(self.path.as_std_path()).await?
+        });
 
         // Store the File in the slot and update the handle in the VirtualFile
         // to point to it.
@@ -491,9 +535,8 @@ impl VirtualFile {
                 self.pos = offset;
             }
             SeekFrom::End(offset) => {
-                self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
-                    .as_ref()
-                    .seek(SeekFrom::End(offset)))?
+                self.pos = with_file!(self, StorageIoOperation::Seek, |mut file_guard| file_guard
+                    .with_std_file_mut(|std_file| std_file.seek(SeekFrom::End(offset))))?
             }
             SeekFrom::Current(offset) => {
                 let pos = self.pos as i128 + offset as i128;
@@ -512,25 +555,28 @@ impl VirtualFile {
         Ok(self.pos)
     }
 
-    // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
-    pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> {
-        while !buf.is_empty() {
-            match self.read_at(buf, offset).await {
-                Ok(0) => {
-                    return Err(Error::new(
-                        std::io::ErrorKind::UnexpectedEof,
-                        "failed to fill whole buffer",
-                    ))
-                }
-                Ok(n) => {
-                    buf = &mut buf[n..];
-                    offset += n as u64;
-                }
-                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
-            }
-        }
-        Ok(())
+    pub async fn read_exact_at<B>(&self, buf: B, offset: u64) -> Result<B, Error>
+    where
+        B: IoBufMut + Send,
+    {
+        let (buf, res) =
+            read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await;
+        res.map(|()| buf)
+    }
+
+    /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`].
+    pub async fn read_exact_at_page(
+        &self,
+        page: PageWriteGuard<'static>,
+        offset: u64,
+    ) -> Result<PageWriteGuard<'static>, Error> {
+        let buf = PageWriteGuardBuf {
+            page,
+            init_up_to: 0,
+        };
+        let res = self.read_exact_at(buf, offset).await;
+        res.map(|PageWriteGuardBuf { page, .. }| page)
+            .map_err(|e| Error::new(ErrorKind::Other, e))
     }
 
     // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
@@ -580,22 +626,35 @@ impl VirtualFile {
         Ok(n)
     }
 
-    pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Read, |file| file
-            .as_ref()
-            .read_at(buf, offset));
-        if let Ok(size) = result {
-            STORAGE_IO_SIZE
-                .with_label_values(&["read", &self.tenant_id, &self.shard_id, &self.timeline_id])
-                .add(size as i64);
-        }
-        result
+    pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
+    where
+        B: tokio_epoll_uring::BoundedBufMut + Send,
+    {
+        let file_guard = match self.lock_file().await {
+            Ok(file_guard) => file_guard,
+            Err(e) => return (buf, Err(e)),
+        };
+
+        observe_duration!(StorageIoOperation::Read, {
+            let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
+            if let Ok(size) = res {
+                STORAGE_IO_SIZE
+                    .with_label_values(&[
+                        "read",
+                        &self.tenant_id,
+                        &self.shard_id,
+                        &self.timeline_id,
+                    ])
+                    .add(size as i64);
+            }
+            (buf, res)
+        })
     }
 
     async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
-        let result = with_file!(self, StorageIoOperation::Write, |file| file
-            .as_ref()
-            .write_at(buf, offset));
+        let result = with_file!(self, StorageIoOperation::Write, |file_guard| {
+            file_guard.with_std_file(|std_file| std_file.write_at(buf, offset))
+        });
         if let Ok(size) = result {
             STORAGE_IO_SIZE
                 .with_label_values(&["write", &self.tenant_id, &self.shard_id, &self.timeline_id])
@@ -605,18 +664,241 @@ impl VirtualFile {
     }
 }
 
-struct FileGuard<'a> {
-    slot_guard: RwLockReadGuard<'a, SlotInner>,
+// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
+pub async fn read_exact_at_impl<B, F, Fut>(
+    buf: B,
+    mut offset: u64,
+    mut read_at: F,
+) -> (B, std::io::Result<()>)
+where
+    B: IoBufMut + Send,
+    F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
+    Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
+{
+    use tokio_epoll_uring::BoundedBuf;
+    let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
+    while buf.bytes_total() != 0 {
+        let res;
+        (buf, res) = read_at(buf, offset).await;
+        match res {
+            Ok(0) => break, // EOF: leave the loop; the `bytes_total()` check below reports it
+            Ok(n) => {
+                buf = buf.slice(n..);
+                offset += n as u64;
+            }
+            Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} // retry the read
+            Err(e) => return (buf.into_inner(), Err(e)),
+        }
+    }
+    // NB: don't use `buf.is_empty()` here; it is from the
+    // `impl Deref for Slice { Target = [u8] }`; the &[u8]
+    // returned by it only covers the initialized portion of `buf`.
+    // Whereas we're interested in ensuring that we filled the entire
+    // buffer that the user passed in.
+    if buf.bytes_total() != 0 {
+        (
+            buf.into_inner(),
+            Err(std::io::Error::new(
+                std::io::ErrorKind::UnexpectedEof,
+                "failed to fill whole buffer",
+            )),
+        )
+    } else {
+        assert_eq!(buf.len(), buf.bytes_total());
+        (buf.into_inner(), Ok(()))
+    }
 }
 
-impl<'a> AsRef<File> for FileGuard<'a> {
-    fn as_ref(&self) -> &File {
+#[cfg(test)]
+mod test_read_exact_at_impl {
+
+    use std::{collections::VecDeque, sync::Arc};
+
+    use tokio_epoll_uring::{BoundedBuf, BoundedBufMut};
+
+    use super::read_exact_at_impl;
+
+    struct Expectation {
+        offset: u64,
+        bytes_total: usize,
+        result: std::io::Result<Vec<u8>>,
+    }
+    struct MockReadAt {
+        expectations: VecDeque<Expectation>,
+    }
+
+    impl MockReadAt {
+        async fn read_at(
+            &mut self,
+            mut buf: tokio_epoll_uring::Slice<Vec<u8>>,
+            offset: u64,
+        ) -> (tokio_epoll_uring::Slice<Vec<u8>>, std::io::Result<usize>) {
+            let exp = self
+                .expectations
+                .pop_front()
+                .expect("read_at called but we have no expectations left");
+            assert_eq!(exp.offset, offset);
+            assert_eq!(exp.bytes_total, buf.bytes_total());
+            match exp.result {
+                Ok(bytes) => {
+                    assert!(bytes.len() <= buf.bytes_total());
+                    buf.put_slice(&bytes);
+                    (buf, Ok(bytes.len()))
+                }
+                Err(e) => (buf, Err(e)),
+            }
+        }
+    }
+
+    impl Drop for MockReadAt {
+        fn drop(&mut self) {
+            assert_eq!(self.expectations.len(), 0);
+        }
+    }
+
+    #[tokio::test]
+    async fn test_basic() {
+        let buf = Vec::with_capacity(5);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![Expectation {
+                offset: 0,
+                bytes_total: 5,
+                result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
+            }]),
+        }));
+        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+        assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
+    }
+
+    #[tokio::test]
+    async fn test_empty_buf_issues_no_syscall() {
+        let buf = Vec::new();
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::new(),
+        }));
+        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+    }
+
+    #[tokio::test]
+    async fn test_two_read_at_calls_needed_until_buf_filled() {
+        let buf = Vec::with_capacity(4);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![
+                Expectation {
+                    offset: 0,
+                    bytes_total: 4,
+                    result: Ok(vec![b'a', b'b']),
+                },
+                Expectation {
+                    offset: 2,
+                    bytes_total: 2,
+                    result: Ok(vec![b'c', b'd']),
+                },
+            ]),
+        }));
+        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        assert!(res.is_ok());
+        assert_eq!(buf, vec![b'a', b'b', b'c', b'd']);
+    }
+
+    #[tokio::test]
+    async fn test_eof_before_buffer_full() {
+        let buf = Vec::with_capacity(3);
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![
+                Expectation {
+                    offset: 0,
+                    bytes_total: 3,
+                    result: Ok(vec![b'a']),
+                },
+                Expectation {
+                    offset: 1,
+                    bytes_total: 2,
+                    result: Ok(vec![b'b']),
+                },
+                Expectation {
+                    offset: 2,
+                    bytes_total: 1,
+                    result: Ok(vec![]),
+                },
+            ]),
+        }));
+        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+            let mock_read_at = Arc::clone(&mock_read_at);
+            async move { mock_read_at.lock().await.read_at(buf, offset).await }
+        })
+        .await;
+        let Err(err) = res else {
+            panic!("should return an error");
+        };
+        assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof);
+        assert_eq!(format!("{err}"), "failed to fill whole buffer");
+        // buffer contents on error are unspecified
+    }
+}
+
+struct FileGuard {
+    slot_guard: RwLockReadGuard<'static, SlotInner>,
+}
+
+impl AsRef<OwnedFd> for FileGuard {
+    fn as_ref(&self) -> &OwnedFd {
         // This unwrap is safe because we only create `FileGuard`s
         // if we know that the file is Some.
         self.slot_guard.file.as_ref().unwrap()
     }
 }
 
+impl FileGuard {
+    /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
+    fn with_std_file<F, R>(&self, with: F) -> R
+    where
+        F: FnOnce(&File) -> R,
+    {
+        // SAFETY:
+        // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
+        // - `&` usage below: `self` is `&`, hence the Rust type system guarantees there is no `&mut`
+        let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
+        let res = with(&file);
+        let _ = file.into_raw_fd(); // give up ownership without closing: the OwnedFd in `self` still owns the fd
+        res
+    }
+    /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually.
+    fn with_std_file_mut<F, R>(&mut self, with: F) -> R
+    where
+        F: FnOnce(&mut File) -> R,
+    {
+        // SAFETY:
+        // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`.
+        // - &mut usage below: `self` is `&mut`, hence this call is the only task/thread that has control over the underlying fd
+        let mut file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) };
+        let res = with(&mut file);
+        let _ = file.into_raw_fd(); // give up ownership without closing: the OwnedFd in `self` still owns the fd
+        res
+    }
+}
+
+impl tokio_epoll_uring::IoFd for FileGuard {
+    unsafe fn as_fd(&self) -> RawFd {
+        let owned_fd: &OwnedFd = self.as_ref();
+        owned_fd.as_raw_fd()
+    }
+}
+
 #[cfg(test)]
 impl VirtualFile {
     pub(crate) async fn read_blk(
@@ -624,16 +906,19 @@ impl VirtualFile {
         blknum: u32,
     ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
         use crate::page_cache::PAGE_SZ;
-        let mut buf = [0; PAGE_SZ];
-        self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64))
+        let buf = vec![0; PAGE_SZ];
+        let buf = self
+            .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64))
             .await?;
-        Ok(std::sync::Arc::new(buf).into())
+        Ok(crate::tenant::block_io::BlockLease::Vec(buf))
     }
 
     async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
+        let mut tmp = vec![0; 128];
         loop {
-            let mut tmp = [0; 128];
-            match self.read_at(&mut tmp, self.pos).await {
+            let res;
+            (tmp, res) = self.read_at(tmp, self.pos).await;
+            match res {
                 Ok(0) => return Ok(()),
                 Ok(n) => {
                     self.pos += n as u64;
@@ -709,10 +994,12 @@ impl OpenFiles {
 /// Initialize the virtual file module. This must be called once at page
 /// server startup.
 ///
-pub fn init(num_slots: usize) {
+#[cfg(not(test))]
+pub fn init(num_slots: usize, engine: IoEngineKind) {
     if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
         panic!("virtual_file::init called twice");
     }
+    io_engine::init(engine);
     crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }
 
@@ -757,10 +1044,10 @@ mod tests {
     }
 
     impl MaybeVirtualFile {
-        async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
+        async fn read_exact_at(&self, mut buf: Vec<u8>, offset: u64) -> Result<Vec<u8>, Error> {
             match self {
                 MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
-                MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset),
+                MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
             }
         }
         async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
@@ -802,14 +1089,14 @@ mod tests {
 
         // Helper function to slurp a portion of a file into a string
         async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
-            let mut buf = vec![0; len];
-            self.read_exact_at(&mut buf, pos).await?;
+            let buf = vec![0; len];
+            let buf = self.read_exact_at(buf, pos).await?;
             Ok(String::from_utf8(buf).unwrap())
         }
     }
 
     #[tokio::test]
-    async fn test_virtual_files() -> Result<(), Error> {
+    async fn test_virtual_files() -> anyhow::Result<()> {
         // The real work is done in the test_files() helper function. This
         // allows us to run the same set of tests against a native File, and
         // VirtualFile. We trust the native Files and wouldn't need to test them,
@@ -825,14 +1112,17 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_physical_files() -> Result<(), Error> {
+    async fn test_physical_files() -> anyhow::Result<()> {
         test_files("physical_files", |path, open_options| async move {
-            Ok(MaybeVirtualFile::File(open_options.open(path)?))
+            Ok(MaybeVirtualFile::File({
+                let owned_fd = open_options.open(path.as_std_path()).await?;
+                File::from(owned_fd)
+            }))
         })
         .await
     }
 
-    async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> Result<(), Error>
+    async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> anyhow::Result<()>
     where
         OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
         FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
@@ -976,11 +1266,11 @@ mod tests {
         for _threadno in 0..THREADS {
             let files = files.clone();
             let hdl = rt.spawn(async move {
-                let mut buf = [0u8; SIZE];
+                let mut buf = vec![0u8; SIZE];
                 let mut rng = rand::rngs::OsRng;
                 for _ in 1..1000 {
                     let f = &files[rng.gen_range(0..files.len())];
-                    f.read_exact_at(&mut buf, 0).await.unwrap();
+                    buf = f.read_exact_at(buf, 0).await.unwrap();
                     assert!(buf == SAMPLE);
                 }
             });
diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs
new file mode 100644
index 0000000000..f7b46fe653
--- /dev/null
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -0,0 +1,114 @@
+//! [`super::VirtualFile`] supports different IO engines.
+//!
+//! The [`IoEngineKind`] enum identifies them.
+//!
+//! The choice of IO engine is global.
+//! Initialize using [`init`].
+//!
+//! Then use [`get`] and [`super::OpenOptions`].
+
+#[derive(
+    Copy,
+    Clone,
+    PartialEq,
+    Eq,
+    Hash,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+    Debug,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum IoEngineKind {
+    StdFs,
+    #[cfg(target_os = "linux")]
+    TokioEpollUring,
+}
+
+static IO_ENGINE: once_cell::sync::OnceCell<IoEngineKind> = once_cell::sync::OnceCell::new();
+
+/// Records `engine` as the process-wide IO engine; panics on a second call.
+#[cfg(not(test))]
+pub(super) fn init(engine: IoEngineKind) {
+    assert!(IO_ENGINE.set(engine).is_ok(), "called twice");
+    // Publish the chosen engine through the labelled metric.
+    let kind_label = engine.to_string();
+    let gauge = crate::metrics::virtual_file_io_engine::KIND.with_label_values(&[&kind_label]);
+    gauge.set(1);
+}
+
+pub(super) fn get() -> &'static IoEngineKind {
+    #[cfg(test)]
+    {
+        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE";
+        IO_ENGINE.get_or_init(|| match std::env::var(env_var_name) {
+            Ok(v) => match v.parse::<IoEngineKind>() {
+                Ok(engine_kind) => engine_kind,
+                Err(e) => {
+                    panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}")
+                }
+            },
+            Err(std::env::VarError::NotPresent) => {
+                crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE
+                    .parse()
+                    .unwrap()
+            }
+            Err(std::env::VarError::NotUnicode(_)) => {
+                panic!("env var {env_var_name} is not unicode");
+            }
+        })
+    }
+    #[cfg(not(test))]
+    IO_ENGINE.get().unwrap()
+}
+
+use std::os::unix::prelude::FileExt;
+
+use super::FileGuard;
+
+impl IoEngineKind {
+    pub(super) async fn read_at<B>(
+        &self,
+        file_guard: FileGuard,
+        offset: u64,
+        mut buf: B,
+    ) -> ((FileGuard, B), std::io::Result<usize>)
+    where
+        B: tokio_epoll_uring::BoundedBufMut + Send,
+    {
+        match self {
+            IoEngineKind::StdFs => {
+                // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory.
+                let dst = unsafe {
+                    std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total())
+                };
+                let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset));
+                if let Ok(nbytes) = &res {
+                    assert!(*nbytes <= buf.bytes_total());
+                    // SAFETY: see above assertion
+                    unsafe {
+                        buf.set_init(*nbytes);
+                    }
+                }
+                #[allow(dropping_references)]
+                drop(dst);
+                ((file_guard, buf), res)
+            }
+            #[cfg(target_os = "linux")]
+            IoEngineKind::TokioEpollUring => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                let (resources, res) = system.read(file_guard, offset, buf).await;
+                (
+                    resources,
+                    res.map_err(|e| match e {
+                        tokio_epoll_uring::Error::Op(e) => e,
+                        tokio_epoll_uring::Error::System(system) => {
+                            std::io::Error::new(std::io::ErrorKind::Other, system)
+                        }
+                    }),
+                )
+            }
+        }
+    }
+}
diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs
new file mode 100644
index 0000000000..1e5ffe15cc
--- /dev/null
+++ b/pageserver/src/virtual_file/open_options.rs
@@ -0,0 +1,138 @@
+//! Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`].
+
+use super::IoEngineKind;
+use std::{os::fd::OwnedFd, path::Path};
+
+#[derive(Debug, Clone)]
+pub enum OpenOptions {
+    StdFs(std::fs::OpenOptions),
+    #[cfg(target_os = "linux")]
+    TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions),
+}
+
+impl Default for OpenOptions {
+    fn default() -> Self {
+        match super::io_engine::get() {
+            IoEngineKind::StdFs => Self::StdFs(std::fs::OpenOptions::new()),
+            #[cfg(target_os = "linux")]
+            IoEngineKind::TokioEpollUring => {
+                Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new())
+            }
+        }
+    }
+}
+
+impl OpenOptions {
+    pub fn new() -> OpenOptions {
+        Self::default()
+    }
+
+    pub fn read(&mut self, read: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.read(read);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.read(read);
+            }
+        }
+        self
+    }
+
+    pub fn write(&mut self, write: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.write(write);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.write(write);
+            }
+        }
+        self
+    }
+
+    pub fn create(&mut self, create: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.create(create);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.create(create);
+            }
+        }
+        self
+    }
+
+    pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.create_new(create_new);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.create_new(create_new);
+            }
+        }
+        self
+    }
+
+    pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.truncate(truncate);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.truncate(truncate);
+            }
+        }
+        self
+    }
+
+    pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result<OwnedFd> {
+        match self {
+            OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()),
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let system = tokio_epoll_uring::thread_local_system().await;
+                system.open(path, x).await.map_err(|e| match e {
+                    tokio_epoll_uring::Error::Op(e) => e,
+                    tokio_epoll_uring::Error::System(system) => {
+                        std::io::Error::new(std::io::ErrorKind::Other, system)
+                    }
+                })
+            }
+        }
+    }
+}
+
+impl std::os::unix::prelude::OpenOptionsExt for OpenOptions {
+    fn mode(&mut self, mode: u32) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.mode(mode);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.mode(mode);
+            }
+        }
+        self
+    }
+
+    fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions {
+        match self {
+            OpenOptions::StdFs(x) => {
+                let _ = x.custom_flags(flags);
+            }
+            #[cfg(target_os = "linux")]
+            OpenOptions::TokioEpollUring(x) => {
+                let _ = x.custom_flags(flags);
+            }
+        }
+        self
+    }
+}
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index f2c35436db..93d1dcab35 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -33,11 +33,12 @@ use utils::failpoint_support;
 
 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
-use crate::pgdatadir_mapping::*;
+use crate::pgdatadir_mapping::{DatadirModification, Version};
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::walrecord::*;
 use crate::ZERO_PAGE;
+use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
@@ -1032,7 +1033,23 @@ impl WalIngest {
             // Copy content
             debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks);
             for blknum in 0..nblocks {
-                debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
+                // Sharding:
+                //  - src and dst are always on the same shard, because they differ only by dbNode, and
+                //    dbNode is not included in the hash inputs for sharding.
+                //  - This WAL command is replayed on all shards, but each shard only copies the blocks
+                //    that belong to it.
+                let src_key = rel_block_to_key(src_rel, blknum);
+                if !self.shard.is_key_local(&src_key) {
+                    debug!(
+                        "Skipping non-local key {} during XLOG_DBASE_CREATE",
+                        src_key
+                    );
+                    continue;
+                }
+                debug!(
+                    "copying block {} from {} ({}) to {}",
+                    blknum, src_rel, src_key, dst_rel
+                );
 
                 let content = modification
                     .tline
@@ -1346,16 +1363,22 @@ impl WalIngest {
             self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
             self.checkpoint_modified = true;
         }
-        let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
-            if mbr.xid.wrapping_sub(acc) as i32 > 0 {
-                mbr.xid
+        let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| {
+            if let Some(max_xid) = acc {
+                if mbr.xid.wrapping_sub(max_xid) as i32 > 0 {
+                    Some(mbr.xid)
+                } else {
+                    acc
+                }
             } else {
-                acc
+                Some(mbr.xid)
             }
         });
 
-        if self.checkpoint.update_next_xid(max_mbr_xid) {
-            self.checkpoint_modified = true;
+        if let Some(max_xid) = max_mbr_xid {
+            if self.checkpoint.update_next_xid(max_xid) {
+                self.checkpoint_modified = true;
+            }
         }
         Ok(())
     }
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index b4aadb2a8c..cfb8052cf1 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -47,11 +47,10 @@ use crate::metrics::{
     WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
     WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
 };
-use crate::pgdatadir_mapping::key_to_slru_block;
 use crate::repository::Key;
 use crate::walrecord::NeonWalRecord;
 
-use pageserver_api::key::key_to_rel_block;
+use pageserver_api::key::{key_to_rel_block, key_to_slru_block};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
@@ -837,9 +836,8 @@ impl WalRedoProcess {
         let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
         let mut nwrite = 0usize;
 
-        let mut stdin_pollfds = [PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT)];
-
         while nwrite < writebuf.len() {
+            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
             let n = loop {
                 match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
                     Err(nix::errno::Errno::EINTR) => continue,
@@ -878,7 +876,6 @@ impl WalRedoProcess {
         // advancing processed responses number.
 
         let mut output = self.stdout.lock().unwrap();
-        let mut stdout_pollfds = [PollFd::new(output.stdout.as_raw_fd(), PollFlags::POLLIN)];
         let n_processed_responses = output.n_processed_responses;
         while n_processed_responses + output.pending_responses.len() <= request_no {
             // We expect the WAL redo process to respond with an 8k page image. We read it
@@ -886,6 +883,7 @@ impl WalRedoProcess {
             let mut resultbuf = vec![0; BLCKSZ.into()];
             let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
             while nresult < BLCKSZ.into() {
+                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
                 // We do two things simultaneously: reading response from stdout
                 // and forward any logging information that the child writes to its stderr to the page server's log.
                 let n = loop {
diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c
index e467a9c43a..f6f006cba4 100644
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -637,7 +637,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
 	ListCell   *option;
 	const char *role_name = stmt->role->rolename;
 
-	if (RoleIsNeonSuperuser(role_name))
+	if (RoleIsNeonSuperuser(role_name) && !superuser())
 		elog(ERROR, "can't ALTER neon_superuser");
 
 	foreach(option, stmt->options)
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 3a7c0f1bb6..0eb1acbfb0 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -15,6 +15,7 @@
 #include "postgres.h"
 
 #include "access/xlog.h"
+#include "common/hashfn.h"
 #include "fmgr.h"
 #include "libpq-fe.h"
 #include "libpq/libpq.h"
@@ -38,17 +39,6 @@
 #define MIN_RECONNECT_INTERVAL_USEC 1000
 #define MAX_RECONNECT_INTERVAL_USEC 1000000
 
-bool		connected = false;
-PGconn	   *pageserver_conn = NULL;
-
-/*
- * WaitEventSet containing:
- * - WL_SOCKET_READABLE on pageserver_conn,
- * - WL_LATCH_SET on MyLatch, and
- * - WL_EXIT_ON_PM_DEATH.
- */
-WaitEventSet *pageserver_conn_wes = NULL;
-
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
@@ -59,16 +49,40 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;
 
-static int n_reconnect_attempts = 0;
-static int max_reconnect_attempts = 60;
-
-#define MAX_PAGESERVER_CONNSTRING_SIZE 256
+static int	n_reconnect_attempts = 0;
+static int	max_reconnect_attempts = 60;
+static int	stripe_size;
 
 typedef struct
 {
-	LWLockId	lock;
-	pg_atomic_uint64 update_counter;
-	char		pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
+	char		connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
+	size_t		num_shards;
+} ShardMap;
+
+/*
+ * PagestoreShmemState is kept in shared memory. It contains the connection
+ * strings for each shard.
+ *
+ * The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option,
+ * allowing it to be changed using pg_reload_conf(). The control plane can
+ * update the connection string if the pageserver crashes, is relocated, or
+ * new shards are added. A parsed copy of the current value of the GUC is kept
+ * in shared memory, updated by the postmaster, because regular backends don't
+ * reload the config during query execution, but we might need to re-establish
+ * the pageserver connection with the new connection string even in the middle
+ * of a query.
+ *
+ * The shared memory copy is protected by a lockless algorithm using two
+ * atomic counters. The counters allow a backend to quickly check if the value
+ * has changed since last access, and to detect and retry copying the value if
+ * the postmaster changes the value concurrently. (Postmaster doesn't have a
+ * PGPROC entry and therefore cannot use LWLocks.)
+ */
+typedef struct
+{
+	pg_atomic_uint64 begin_update_counter;
+	pg_atomic_uint64 end_update_counter;
+	ShardMap	shard_map;
 } PagestoreShmemState;
 
 #if PG_VERSION_NUM >= 150000
@@ -78,76 +92,242 @@ static void walproposer_shmem_request(void);
 static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;
-static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
 
-static bool pageserver_flush(void);
-static void pageserver_disconnect(void);
+/* This backend's per-shard connections */
+typedef struct
+{
+	PGconn	   *conn;
+
+	/*---
+	 * WaitEventSet containing:
+	 * - WL_SOCKET_READABLE on 'conn'
+	 * - WL_LATCH_SET on MyLatch, and
+	 * - WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet *wes;
+} PageServer;
+
+static PageServer page_servers[MAX_SHARDS];
+
+static bool pageserver_flush(shardno_t shard_no);
+static void pageserver_disconnect(shardno_t shard_no);
 
 static bool
-PagestoreShmemIsValid()
+PagestoreShmemIsValid(void)
 {
 	return pagestore_shared && UsedShmemSegAddr;
 }
 
+/*
+ * Parse a comma-separated list of connection strings into a ShardMap.
+ *
+ * If 'result' is NULL, just checks that the input is valid. If the input is
+ * not valid, returns false. The contents of *result are undefined in
+ * that case, and must not be relied on.
+ */
+static bool
+ParseShardMap(const char *connstr, ShardMap *result)
+{
+	const char *p;
+	int			nshards = 0;
+
+	if (result)
+		memset(result, 0, sizeof(ShardMap));
+
+	p = connstr;
+	nshards = 0;
+	for (;;)
+	{
+		const char *sep;
+		size_t		connstr_len;
+
+		sep = strchr(p, ',');
+		connstr_len = sep != NULL ? sep - p : strlen(p);
+
+		if (connstr_len == 0 && sep == NULL)
+			break;				/* ignore trailing comma */
+
+		if (nshards >= MAX_SHARDS)
+		{
+			neon_log(LOG, "Too many shards");
+			return false;
+		}
+		if (connstr_len >= MAX_PAGESERVER_CONNSTRING_SIZE)
+		{
+			neon_log(LOG, "Connection string too long");
+			return false;
+		}
+		if (result)
+		{
+			memcpy(result->connstring[nshards], p, connstr_len);
+			result->connstring[nshards][connstr_len] = '\0';
+		}
+		nshards++;
+
+		if (sep == NULL)
+			break;
+		p = sep + 1;
+	}
+	if (result)
+		result->num_shards = nshards;
+
+	return true;
+}
+
 static bool
 CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 {
-	return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
+	char	   *p = *newval;
+
+	return ParseShardMap(p, NULL);
 }
 
 static void
 AssignPageserverConnstring(const char *newval, void *extra)
 {
-	if (!PagestoreShmemIsValid())
+	ShardMap	shard_map;
+
+	/*
+	 * Only postmaster updates the copy in shared memory.
+	 */
+	if (!PagestoreShmemIsValid() || IsUnderPostmaster)
 		return;
-	LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
-	strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
-	pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-	LWLockRelease(pagestore_shared->lock);
-}
-
-static bool
-CheckConnstringUpdated()
-{
-	if (!PagestoreShmemIsValid())
-		return false;
-	return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
+
+	if (!ParseShardMap(newval, &shard_map))
+	{
+		/*
+		 * shouldn't happen, because we already checked the value in
+		 * CheckPageserverConnstring
+		 */
+		elog(ERROR, "could not parse shard map");
+	}
+
+	if (memcmp(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap)) != 0)
+	{
+		pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
+		pg_write_barrier();
+		memcpy(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap));
+		pg_write_barrier();
+		pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
+	}
+	else
+	{
+		/* no change */
+	}
 }
 
+/*
+ * Get the current number of shards, and/or the connection string for a
+ * particular shard from the shard map in shared memory.
+ *
+ * If num_shards_p is not NULL, it is set to the current number of shards.
+ *
+ * If connstr_p is not NULL, the connection string for 'shard_no' is copied to
+ * it. It must point to a buffer at least MAX_PAGESERVER_CONNSTRING_SIZE bytes
+ * long.
+ *
+ * As a side-effect, if the shard map in shared memory had changed since the
+ * last call, terminates all existing connections to all pageservers.
+ */
 static void
-ReloadConnstring()
+load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
 {
-	if (!PagestoreShmemIsValid())
-		return;
-	LWLockAcquire(pagestore_shared->lock, LW_SHARED);
-	strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
-	pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-	LWLockRelease(pagestore_shared->lock);
+	uint64		begin_update_counter;
+	uint64		end_update_counter;
+	ShardMap   *shard_map = &pagestore_shared->shard_map;
+	shardno_t	num_shards;
+
+	/*
+	 * Postmaster can update the shared memory values concurrently, in which
+	 * case we would copy a garbled mix of old and new values. We detect that
+	 * because the counters won't match, and retry. So nothing inside the
+	 * retry loop may depend on the string having valid contents.
+	 */
+	do
+	{
+		begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
+		end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
+		pg_read_barrier();		/* counters must be read before the data */
+
+		num_shards = shard_map->num_shards;
+		if (connstr_p && shard_no < MAX_SHARDS)
+			strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE);
+		pg_memory_barrier();
+	}
+	while (begin_update_counter != end_update_counter
+		   || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
+		   || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
+
+	if (connstr_p && shard_no >= num_shards)
+		neon_log(ERROR, "Shard %d is greater or equal than number of shards %d",
+				 shard_no, num_shards);
+
+	/*
+	 * If any of the connection strings changed, reset all connections.
+	 */
+	if (pagestore_local_counter != end_update_counter)
+	{
+		for (shardno_t i = 0; i < MAX_SHARDS; i++)
+		{
+			if (page_servers[i].conn)
+				pageserver_disconnect(i);
+		}
+		pagestore_local_counter = end_update_counter;
+	}
+
+	if (num_shards_p)
+		*num_shards_p = num_shards;
+}
+
+#define MB (1024*1024)
+
+/* Shard of a page = hash(relation number, stripe index) % number of shards. */
+shardno_t
+get_shard_number(BufferTag *tag)
+{
+	shardno_t	n_shards;
+	uint32		hash;
+
+	load_shard_map(0, NULL, &n_shards);
+	if (n_shards == 0)			/* empty shard map: "% n_shards" would trap */
+		neon_log(ERROR, "no pageserver shards are configured");
+#if PG_MAJORVERSION_NUM < 16
+	hash = murmurhash32(tag->rnode.relNode);
+#else
+	hash = murmurhash32(tag->relNumber);
+#endif
+	hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
+	return hash % n_shards;
 }
 
 static bool
-pageserver_connect(int elevel)
+pageserver_connect(shardno_t shard_no, int elevel)
 {
 	char	   *query;
 	int			ret;
 	const char *keywords[3];
 	const char *values[3];
 	int			n;
+	PGconn	   *conn;
+	WaitEventSet *wes;
+	char		connstr[MAX_PAGESERVER_CONNSTRING_SIZE];
 
 	static TimestampTz last_connect_time = 0;
 	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
 	TimestampTz now;
-        uint64_t us_since_last_connect;
+	uint64_t	us_since_last_connect;
 
-	Assert(!connected);
+	Assert(page_servers[shard_no].conn == NULL);
 
-	if (CheckConnstringUpdated())
-	{
-		ReloadConnstring();
-	}
+	/*
+	 * Get the connection string for this shard. If the shard map has been
+	 * updated since we last looked, this will also disconnect any existing
+	 * pageserver connections as a side effect.
+	 */
+	load_shard_map(shard_no, connstr, NULL);
 
 	now = GetCurrentTimestamp();
-        us_since_last_connect = now - last_connect_time;
+	us_since_last_connect = now - last_connect_time;
 	if (us_since_last_connect < delay_us)
 	{
 		pg_usleep(delay_us - us_since_last_connect);
@@ -180,76 +360,100 @@ pageserver_connect(int elevel)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = local_pageserver_connstring;
+	values[n] = connstr;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
 	n++;
-	pageserver_conn = PQconnectdbParams(keywords, values, 1);
+	conn = PQconnectdbParams(keywords, values, 1);
 
-	if (PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (PQstatus(conn) == CONNECTION_BAD)
 	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		char	   *msg = pchomp(PQerrorMessage(conn));
 
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
+		PQfinish(conn);
 
 		ereport(elevel,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-				 errmsg(NEON_TAG "could not establish connection to pageserver"),
+				 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
 				 errdetail_internal("%s", msg)));
+		pfree(msg);
 		return false;
 	}
-
 	query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
-	ret = PQsendQuery(pageserver_conn, query);
+	ret = PQsendQuery(conn, query);
+	pfree(query);
 	if (ret != 1)
 	{
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		neon_log(elevel, "could not send pagestream command to pageserver");
+		PQfinish(conn);
+		neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
 		return false;
 	}
 
-	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
+	wes = CreateWaitEventSet(TopMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
 
-	while (PQisBusy(pageserver_conn))
+	PG_TRY();
 	{
-		WaitEvent	event;
-
-		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
-		ResetLatch(MyLatch);
-
-		CHECK_FOR_INTERRUPTS();
-
-		/* Data available in socket? */
-		if (event.events & WL_SOCKET_READABLE)
+		while (PQisBusy(conn))
 		{
-			if (!PQconsumeInput(pageserver_conn))
+			WaitEvent	event;
+
+			/* Sleep until there's something to do */
+			(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+			ResetLatch(MyLatch);
+
+			CHECK_FOR_INTERRUPTS();
+
+			/* Data available in socket? */
+			if (event.events & WL_SOCKET_READABLE)
 			{
-				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
-				PQfinish(pageserver_conn);
-				pageserver_conn = NULL;
-				FreeWaitEventSet(pageserver_conn_wes);
-				pageserver_conn_wes = NULL;
-
-				neon_log(elevel, "could not complete handshake with pageserver: %s",
-						 msg);
-				return false;
+				/*
+				 * Never "return" from inside PG_TRY: that would leave
+				 * PG_exception_stack pointing into this function's dead
+				 * stack frame. Record the failure in 'ret' (still 1 from
+				 * the check above) and just leave the wait loop.
+				 */
+				ret = PQconsumeInput(conn);
+				if (ret == 0)
+					break;
 			}
 		}
 	}
+	PG_CATCH();
+	{
+		PQfinish(conn);
+		FreeWaitEventSet(wes);
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	if (ret == 0)
+	{
+		char	   *msg = pchomp(PQerrorMessage(conn));
+
+		/*
+		 * Release the connection before reporting. With elevel == ERROR the
+		 * report does not return; doing this outside PG_TRY guarantees that
+		 * PG_CATCH cannot free the same objects a second time.
+		 */
+		PQfinish(conn);
+		FreeWaitEventSet(wes);
+
+		neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
+					   msg);
+		pfree(msg);
+		return false;
+	}
 
-	neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
+	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
+	page_servers[shard_no].conn = conn;
+	page_servers[shard_no].wes = wes;
 
-	connected = true;
 	return true;
 }
 
@@ -257,9 +445,10 @@ pageserver_connect(int elevel)
  * A wrapper around PQgetCopyData that checks for interrupts while sleeping.
  */
 static int
-call_PQgetCopyData(char **buffer)
+call_PQgetCopyData(shardno_t shard_no, char **buffer)
 {
 	int			ret;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
 
 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -269,7 +458,7 @@ retry:
 		WaitEvent	event;
 
 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);
 
 		CHECK_FOR_INTERRUPTS();
@@ -281,7 +470,7 @@ retry:
 			{
 				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
-				neon_log(LOG, "could not get response from pageserver: %s", msg);
+				neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg);
 				pfree(msg);
 				return -1;
 			}
@@ -295,7 +484,7 @@ retry:
 
 
 static void
-pageserver_disconnect(void)
+pageserver_disconnect(shardno_t shard_no)
 {
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
@@ -304,38 +493,38 @@ pageserver_disconnect(void)
 	 * time later after we have already sent a new unrelated request. Close
 	 * the connection to avoid getting confused.
 	 */
-	if (connected)
+	if (page_servers[shard_no].conn)
 	{
-		neon_log(LOG, "dropping connection to page server due to error");
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		connected = false;
+		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
+		PQfinish(page_servers[shard_no].conn);
+		page_servers[shard_no].conn = NULL;
 
+		/*
+		 * If the connection to any pageserver is lost, we throw away the
+		 * whole prefetch queue, even for other pageservers. It should not
+		 * cause big problems, because connection loss is supposed to be a
+		 * rare event.
+		 */
 		prefetch_on_ps_disconnect();
 	}
-	if (pageserver_conn_wes != NULL)
+	if (page_servers[shard_no].wes != NULL)
 	{
-		FreeWaitEventSet(pageserver_conn_wes);
-		pageserver_conn_wes = NULL;
+		FreeWaitEventSet(page_servers[shard_no].wes);
+		page_servers[shard_no].wes = NULL;
 	}
 }
 
 static bool
-pageserver_send(NeonRequest *request)
+pageserver_send(shardno_t shard_no, NeonRequest *request)
 {
 	StringInfoData req_buff;
-
-	if (CheckConnstringUpdated())
-	{
-		pageserver_disconnect();
-		ReloadConnstring();
-	}
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
 
 	/* If the connection was lost for some reason, reconnect */
-	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
-		neon_log(LOG, "pageserver_send disconnect bad connection");
-		pageserver_disconnect();
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
+		pageserver_disconnect(shard_no);
 	}
 
 	req_buff = nm_pack_request(request);
@@ -349,9 +538,9 @@ pageserver_send(NeonRequest *request)
 	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
 	 * connection in case of failure.
 	 */
-	if (!connected)
+	if (!page_servers[shard_no].conn)
 	{
-		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
 			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
@@ -359,6 +548,8 @@ pageserver_send(NeonRequest *request)
 		n_reconnect_attempts = 0;
 	}
 
+	pageserver_conn = page_servers[shard_no].conn;
+
 	/*
 	 * Send request.
 	 *
@@ -371,8 +562,8 @@ pageserver_send(NeonRequest *request)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
-		pageserver_disconnect();
-		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -384,19 +575,20 @@ pageserver_send(NeonRequest *request)
 	{
 		char	   *msg = nm_to_string((NeonMessage *) request);
 
-		neon_log(PageStoreTrace, "sent request: %s", msg);
+		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
 	return true;
 }
 
 static NeonResponse *
-pageserver_receive(void)
+pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
 
-	if (!connected)
+	if (!pageserver_conn)
 		return NULL;
 
 	PG_TRY();
@@ -404,7 +596,7 @@ pageserver_receive(void)
 		/* read response */
 		int			rc;
 
-		rc = call_PQgetCopyData(&resp_buff.data);
+		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
 		if (rc >= 0)
 		{
 			resp_buff.len = rc;
@@ -416,33 +608,33 @@ pageserver_receive(void)
 			{
 				char	   *msg = nm_to_string((NeonMessage *) resp);
 
-				neon_log(PageStoreTrace, "got response: %s", msg);
+				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
 				pfree(msg);
 			}
 		}
 		else if (rc == -1)
 		{
-			neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
-			pageserver_disconnect();
+			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
+			pageserver_disconnect(shard_no);
 			resp = NULL;
 		}
 		else if (rc == -2)
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
 		}
 		else
 		{
-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
 		}
 	}
 	PG_CATCH();
 	{
-		neon_log(LOG, "pageserver_receive disconnect due to caught exception");
-		pageserver_disconnect();
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
+		pageserver_disconnect(shard_no);
 		PG_RE_THROW();
 	}
 	PG_END_TRY();
@@ -452,11 +644,13 @@ pageserver_receive(void)
 
 
 static bool
-pageserver_flush(void)
+pageserver_flush(shardno_t shard_no)
 {
-	if (!connected)
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+
+	if (!pageserver_conn)
 	{
-		neon_log(WARNING, "Tried to flush while disconnected");
+		neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
 	}
 	else
 	{
@@ -464,8 +658,8 @@ pageserver_flush(void)
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
-			pageserver_disconnect();
-			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
 			pfree(msg);
 			return false;
 		}
@@ -505,8 +699,9 @@ PagestoreShmemInit(void)
 									   &found);
 	if (!found)
 	{
-		pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
-		pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
+		pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
+		pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
+		memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
 		AssignPageserverConnstring(page_server_connstring, NULL);
 	}
 	LWLockRelease(AddinShmemInitLock);
@@ -531,7 +726,6 @@ pagestore_shmem_request(void)
 #endif
 
 	RequestAddinShmemSpace(PagestoreShmemSize());
-	RequestNamedLWLockTranche("neon_libpagestore", 1);
 }
 
 static void
@@ -582,6 +776,15 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);
 
+	DefineCustomIntVariable("neon.stripe_size",
+							"sharding stripe size",
+							NULL,
+							&stripe_size,
+							32768, 1, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_BLOCKS,
+							NULL, NULL, NULL);
+
 	DefineCustomIntVariable("neon.max_cluster_size",
 							"cluster size limit",
 							NULL,
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 3fcaab0bee..8c02f357bc 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -20,9 +20,13 @@
 #include "lib/stringinfo.h"
 #include "libpq/pqformat.h"
 #include "storage/block.h"
+#include "storage/buf_internals.h"
 #include "storage/smgr.h"
 #include "utils/memutils.h"
 
+#define MAX_SHARDS 128
+#define MAX_PAGESERVER_CONNSTRING_SIZE 256
+
 typedef enum
 {
 	/* pagestore_client -> pagestore */
@@ -51,6 +55,9 @@ typedef struct
 #define neon_log(tag, fmt, ...) ereport(tag,                                  \
 										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
 										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+/* Like neon_log(), but prefixes the message with "[shard N]"; shardno_t is unsigned, hence %u. */
+#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag, (errmsg(NEON_TAG "[shard %u] " fmt, shard_no, ##__VA_ARGS__), \
+														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
 
 /*
  * supertype of all the Neon*Request structs below
@@ -141,11 +148,13 @@ extern char *nm_to_string(NeonMessage *msg);
  * API
  */
 
+typedef unsigned shardno_t;
+
 typedef struct
 {
-	bool		(*send) (NeonRequest *request);
-	NeonResponse *(*receive) (void);
-	bool		(*flush) (void);
+	bool		(*send) (shardno_t  shard_no, NeonRequest * request);
+	NeonResponse *(*receive) (shardno_t shard_no);
+	bool		(*flush) (shardno_t shard_no);
 } page_server_api;
 
 extern void prefetch_on_ps_disconnect(void);
@@ -159,6 +168,8 @@ extern char *neon_timeline;
 extern char *neon_tenant;
 extern int32 max_cluster_size;
 
+extern shardno_t get_shard_number(BufferTag* tag);
+
 extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 0db093e5a7..1fa802e6f4 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -172,6 +172,7 @@ typedef struct PrefetchRequest
 	XLogRecPtr	actual_request_lsn;
 	NeonResponse *response;		/* may be null */
 	PrefetchStatus status;
+	shardno_t   shard_no;
 	uint64		my_ring_index;
 } PrefetchRequest;
 
@@ -239,10 +240,17 @@ typedef struct PrefetchState
 								 * also unused */
 
 	/* the buffers */
-	prfh_hash  *prf_hash;
+	prfh_hash	*prf_hash;
+	int			max_shard_no;
+	/* Mark shards involved in prefetch */
+	uint8		shard_bitmap[(MAX_SHARDS + 7)/8];
 	PrefetchRequest prf_buffer[];	/* prefetch buffers */
 } PrefetchState;
 
+#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))	/* nonzero iff bit is set */
+#define BITMAP_SET(bm, bit) ((bm)[(bit) >> 3] |= (1 << ((bit) & 7)))	/* turn bit on */
+#define BITMAP_CLR(bm, bit) ((bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)))	/* turn bit off */
+
 static PrefetchState *MyPState;
 
 #define GetPrfSlot(ring_index) ( \
@@ -327,6 +335,7 @@ compact_prefetch_buffers(void)
 		Assert(target_slot->status == PRFS_UNUSED);
 
 		target_slot->buftag = source_slot->buftag;
+		target_slot->shard_no = source_slot->shard_no;
 		target_slot->status = source_slot->status;
 		target_slot->response = source_slot->response;
 		target_slot->effective_request_lsn = source_slot->effective_request_lsn;
@@ -494,6 +503,23 @@ prefetch_cleanup_trailing_unused(void)
 	}
 }
 
+
+/* Call page_server->flush() for each shard marked in shard_bitmap; false on the first failure. */
+static bool
+prefetch_flush_requests(void)
+{
+	for (shardno_t shard = 0; shard < MyPState->max_shard_no; shard++)
+	{
+		if (!BITMAP_ISSET(MyPState->shard_bitmap, shard))
+			continue;			/* shard is not marked in the bitmap */
+		if (!page_server->flush(shard))
+			return false;		/* this and later marks stay set */
+		BITMAP_CLR(MyPState->shard_bitmap, shard);	/* flushed: unmark */
+	}
+	MyPState->max_shard_no = 0; /* every marked shard below the old bound was flushed */
+	return true;
+}
+
 /*
  * Wait for slot of ring_index to have received its response.
  * The caller is responsible for making sure the request buffer is flushed.
@@ -509,7 +535,7 @@ prefetch_wait_for(uint64 ring_index)
 	if (MyPState->ring_flush <= ring_index &&
 		MyPState->ring_unused > MyPState->ring_flush)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 			return false;
 		MyPState->ring_flush = MyPState->ring_unused;
 	}
@@ -547,7 +573,7 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->my_ring_index == MyPState->ring_receive);
 
 	old = MemoryContextSwitchTo(MyPState->errctx);
-	response = (NeonResponse *) page_server->receive();
+	response = (NeonResponse *) page_server->receive(slot->shard_no);
 	MemoryContextSwitchTo(old);
 	if (response)
 	{
@@ -704,12 +730,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);
 
-	while (!page_server->send((NeonRequest *) &request));
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
 
 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
 	MyPState->n_unused -= 1;
 	MyPState->ring_unused += 1;
+	BITMAP_SET(MyPState->shard_bitmap, slot->shard_no);
+	MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);
 
 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
@@ -880,6 +908,7 @@ Retry:
 	 * function reads the buffer tag from the slot.
 	 */
 	slot->buftag = tag;
+	slot->shard_no = get_shard_number(&tag);
 	slot->my_ring_index = ring_index;
 
 	prefetch_do_request(slot, force_latest, force_lsn);
@@ -890,7 +919,7 @@ Retry:
 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 		{
 			/*
 			 * Prefetch set is reset in case of error, so we should try to
@@ -908,13 +937,44 @@ static NeonResponse *
 page_server_request(void const *req)
 {
 	NeonResponse *resp;
+	BufferTag tag = {0};
+	shardno_t shard_no;
+
+	switch (((NeonRequest *) req)->tag)
+	{
+		case T_NeonExistsRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
+			break;
+		case T_NeonNblocksRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
+			break;
+		case T_NeonDbSizeRequest:
+			NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
+			break;
+		case T_NeonGetPageRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
+			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
+			break;
+		default:
+			neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
+	}
+	shard_no = get_shard_number(&tag);
+
+
+	/*
+	 * Current sharding model assumes that all metadata is present only at shard 0.
+	 * We still need to call get_shard_number() to check if shard map is up-to-date.
+	 */
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
+	{
+		shard_no = 0;
+	}
 
 	do
 	{
-		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
-		MyPState->ring_flush = MyPState->ring_unused;
+		while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
 		consume_prefetch_responses();
-		resp = page_server->receive();
+		resp = page_server->receive(shard_no);
 	} while (resp == NULL);
 	return resp;
 
@@ -2098,8 +2158,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		case T_NeonErrorResponse:
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
-					 errmsg(NEON_TAG "could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
-							blkno,
+					 errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							slot->shard_no, blkno,
 							RelFileInfoFmt(rinfo),
 							forkNum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index 0707c1331f..8d1b861a66 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -4,7 +4,9 @@ pub mod backend;
 pub use backend::BackendType;
 
 mod credentials;
-pub use credentials::{check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint};
+pub use credentials::{
+    check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, IpPattern,
+};
 
 mod password_hack;
 pub use password_hack::parse_endpoint_param;
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 34171d4d3f..b1634906c9 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -3,7 +3,6 @@ mod hacks;
 mod link;
 
 pub use link::LinkAuthError;
-use smol_str::SmolStr;
 use tokio_postgres::config::AuthKeys;
 
 use crate::auth::credentials::check_peer_addr_is_in_list;
@@ -16,7 +15,6 @@ use crate::context::RequestMonitoring;
 use crate::proxy::connect_compute::handle_try_wake;
 use crate::proxy::retry::retry_after;
 use crate::proxy::NeonOptions;
-use crate::scram;
 use crate::stream::Stream;
 use crate::{
     auth::{self, ComputeUserInfoMaybeEndpoint},
@@ -28,6 +26,7 @@ use crate::{
     },
     stream, url,
 };
+use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
 use futures::TryFutureExt;
 use std::borrow::Cow;
 use std::ops::ControlFlow;
@@ -35,6 +34,8 @@ use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{error, info, warn};
 
+use super::IpPattern;
+
 /// This type serves two purposes:
 ///
 /// * When `T` is `()`, it's just a regular auth backend selector
@@ -55,7 +56,7 @@ pub enum BackendType<'a, T> {
 
 pub trait TestBackend: Send + Sync + 'static {
     fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
-    fn get_allowed_ips(&self) -> Result<Vec<SmolStr>, console::errors::GetAuthInfoError>;
+    fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError>;
 }
 
 impl std::fmt::Display for BackendType<'_, ()> {
@@ -128,19 +129,19 @@ pub struct ComputeCredentials<T> {
 
 #[derive(Debug, Clone)]
 pub struct ComputeUserInfoNoEndpoint {
-    pub user: SmolStr,
+    pub user: RoleName,
     pub options: NeonOptions,
 }
 
 #[derive(Debug, Clone)]
 pub struct ComputeUserInfo {
-    pub endpoint: SmolStr,
-    pub user: SmolStr,
+    pub endpoint: EndpointId,
+    pub user: RoleName,
     pub options: NeonOptions,
 }
 
 impl ComputeUserInfo {
-    pub fn endpoint_cache_key(&self) -> SmolStr {
+    pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
         self.options.get_cache_key(&self.endpoint)
     }
 }
@@ -156,7 +157,7 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
     type Error = ComputeUserInfoNoEndpoint;
 
     fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result<Self, Self::Error> {
-        match user_info.project {
+        match user_info.endpoint_id {
             None => Err(ComputeUserInfoNoEndpoint {
                 user: user_info.user,
                 options: user_info.options,
@@ -202,21 +203,18 @@ async fn auth_quirks(
     if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
         return Err(auth::AuthError::ip_address_not_allowed());
     }
-    let maybe_secret = api.get_role_secret(ctx, &info).await?;
+    let cached_secret = api.get_role_secret(ctx, &info).await?;
 
-    let cached_secret = maybe_secret.unwrap_or_else(|| {
+    let secret = cached_secret.value.clone().unwrap_or_else(|| {
         // If we don't have an authentication secret, we mock one to
         // prevent malicious probing (possible due to missing protocol steps).
         // This mocked secret will never lead to successful authentication.
         info!("authentication info not found, mocking it");
-        Cached::new_uncached(AuthSecret::Scram(scram::ServerSecret::mock(
-            &info.user,
-            rand::random(),
-        )))
+        AuthSecret::Scram(scram::ServerSecret::mock(&info.user, rand::random()))
     });
     match authenticate_with_secret(
         ctx,
-        cached_secret.value.clone(),
+        secret,
         info,
         client,
         unauthenticated_password,
@@ -318,11 +316,11 @@ async fn auth_and_wake_compute(
 
 impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
     /// Get compute endpoint name from the credentials.
-    pub fn get_endpoint(&self) -> Option<SmolStr> {
+    pub fn get_endpoint(&self) -> Option<EndpointId> {
         use BackendType::*;
 
         match self {
-            Console(_, user_info) => user_info.project.clone(),
+            Console(_, user_info) => user_info.endpoint_id.clone(),
             Link(_) => Some("link".into()),
             #[cfg(test)]
             Test(_) => Some("test".into()),
@@ -356,7 +354,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
             Console(api, user_info) => {
                 info!(
                     user = &*user_info.user,
-                    project = user_info.project(),
+                    project = user_info.endpoint(),
                     "performing authentication using the console"
                 );
 
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index ada7f3614c..5bf7667a1f 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -2,12 +2,13 @@
 
 use crate::{
     auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
-    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI,
+    EndpointId, RoleName,
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
-use std::{collections::HashSet, net::IpAddr};
+use std::{collections::HashSet, net::IpAddr, str::FromStr};
 use thiserror::Error;
 use tracing::{info, warn};
 
@@ -21,7 +22,10 @@ pub enum ComputeUserInfoParseError {
          SNI ('{}') and project option ('{}').",
         .domain, .option,
     )]
-    InconsistentProjectNames { domain: SmolStr, option: SmolStr },
+    InconsistentProjectNames {
+        domain: EndpointId,
+        option: EndpointId,
+    },
 
     #[error(
         "Common name inferred from SNI ('{}') is not known",
@@ -30,7 +34,7 @@ pub enum ComputeUserInfoParseError {
     UnknownCommonName { cn: String },
 
     #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
-    MalformedProjectName(SmolStr),
+    MalformedProjectName(EndpointId),
 }
 
 impl UserFacingError for ComputeUserInfoParseError {}
@@ -39,24 +43,22 @@ impl UserFacingError for ComputeUserInfoParseError {}
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ComputeUserInfoMaybeEndpoint {
-    pub user: SmolStr,
-    // TODO: this is a severe misnomer! We should think of a new name ASAP.
-    pub project: Option<SmolStr>,
-
+    pub user: RoleName,
+    pub endpoint_id: Option<EndpointId>,
     pub options: NeonOptions,
 }
 
 impl ComputeUserInfoMaybeEndpoint {
     #[inline]
-    pub fn project(&self) -> Option<&str> {
-        self.project.as_deref()
+    pub fn endpoint(&self) -> Option<&str> {
+        self.endpoint_id.as_deref()
     }
 }
 
-pub fn endpoint_sni<'a>(
-    sni: &'a str,
+pub fn endpoint_sni(
+    sni: &str,
     common_names: &HashSet<String>,
-) -> Result<&'a str, ComputeUserInfoParseError> {
+) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
     let Some((subdomain, common_name)) = sni.split_once('.') else {
         return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
     };
@@ -65,7 +67,10 @@ pub fn endpoint_sni<'a>(
             cn: common_name.into(),
         });
     }
-    Ok(subdomain)
+    if subdomain == SERVERLESS_DRIVER_SNI {
+        return Ok(None);
+    }
+    Ok(Some(EndpointId::from(subdomain)))
 }
 
 impl ComputeUserInfoMaybeEndpoint {
@@ -79,15 +84,14 @@ impl ComputeUserInfoMaybeEndpoint {
 
         // Some parameters are stored in the startup message.
         let get_param = |key| params.get(key).ok_or(MissingKey(key));
-        let user: SmolStr = get_param("user")?.into();
+        let user: RoleName = get_param("user")?.into();
 
         // record the values if we have them
         ctx.set_application(params.get("application_name").map(SmolStr::from));
         ctx.set_user(user.clone());
-        ctx.set_endpoint_id(sni.map(SmolStr::from));
 
         // Project name might be passed via PG's command-line options.
-        let project_option = params
+        let endpoint_option = params
             .options_raw()
             .and_then(|options| {
                 // We support both `project` (deprecated) and `endpoint` options for backward compatibility.
@@ -100,9 +104,9 @@ impl ComputeUserInfoMaybeEndpoint {
             })
             .map(|name| name.into());
 
-        let project_from_domain = if let Some(sni_str) = sni {
+        let endpoint_from_domain = if let Some(sni_str) = sni {
             if let Some(cn) = common_names {
-                Some(SmolStr::from(endpoint_sni(sni_str, cn)?))
+                endpoint_sni(sni_str, cn)?
             } else {
                 None
             }
@@ -110,26 +114,27 @@ impl ComputeUserInfoMaybeEndpoint {
             None
         };
 
-        let project = match (project_option, project_from_domain) {
+        let endpoint = match (endpoint_option, endpoint_from_domain) {
             // Invariant: if we have both project name variants, they should match.
             (Some(option), Some(domain)) if option != domain => {
                 Some(Err(InconsistentProjectNames { domain, option }))
             }
             // Invariant: project name may not contain certain characters.
-            (a, b) => a.or(b).map(|name| match project_name_valid(&name) {
+            (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
                 false => Err(MalformedProjectName(name)),
                 true => Ok(name),
             }),
         }
         .transpose()?;
+        ctx.set_endpoint_id(endpoint.clone());
 
-        info!(%user, project = project.as_deref(), "credentials");
+        info!(%user, project = endpoint.as_deref(), "credentials");
         if sni.is_some() {
             info!("Connection with sni");
             NUM_CONNECTION_ACCEPTED_BY_SNI
                 .with_label_values(&["sni"])
                 .inc();
-        } else if project.is_some() {
+        } else if endpoint.is_some() {
             NUM_CONNECTION_ACCEPTED_BY_SNI
                 .with_label_values(&["no_sni"])
                 .inc();
@@ -145,36 +150,57 @@ impl ComputeUserInfoMaybeEndpoint {
 
         Ok(Self {
             user,
-            project,
+            endpoint_id: endpoint.map(EndpointId::from),
             options,
         })
     }
 }
 
-pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec<SmolStr>) -> bool {
-    if ip_list.is_empty() {
-        return true;
-    }
-    for ip in ip_list {
-        // We expect that all ip addresses from control plane are correct.
-        // However, if some of them are broken, we still can check the others.
-        match parse_ip_pattern(ip) {
-            Ok(pattern) => {
-                if check_ip(peer_addr, &pattern) {
-                    return true;
-                }
-            }
-            Err(err) => warn!("Cannot parse ip: {}; err: {}", ip, err),
-        }
-    }
-    false
+pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool {
+    ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern))
 }
 
 #[derive(Debug, Clone, Eq, PartialEq)]
-enum IpPattern {
+pub enum IpPattern {
     Subnet(ipnet::IpNet),
     Range(IpAddr, IpAddr),
     Single(IpAddr),
+    None,
+}
+
+/// Deserializes one pattern from a string via `parse_ip_pattern`.
+/// An unparsable string is logged and becomes `IpPattern::None` instead of an error.
+impl<'de> serde::de::Deserialize<'de> for IpPattern {
+    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        /// Visitor that only implements `visit_str`.
+        struct StrVisitor;
+        impl<'de> serde::de::Visitor<'de> for StrVisitor {
+            type Value = IpPattern;
+
+            // Describes the expected input for serde's error messages. Each value is a single
+            // pattern string; a list of patterns is handled by the surrounding `Vec<IpPattern>`.
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                write!(formatter, "a string containing an ip address, ip address range, or ip address subnet mask")
+            }
+
+            // Never returns an error: a bad pattern is replaced by `IpPattern::None`.
+            fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<Self::Value, E> {
+                Ok(parse_ip_pattern(v).unwrap_or_else(|e| {
+                    warn!("Cannot parse ip pattern {v}: {e}");
+                    IpPattern::None
+                }))
+            }
+        }
+        deserializer.deserialize_str(StrVisitor)
+    }
+}
+
+impl FromStr for IpPattern {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        parse_ip_pattern(s)
+    }
 }
 
 fn parse_ip_pattern(pattern: &str) -> anyhow::Result<IpPattern> {
@@ -196,6 +222,7 @@ fn check_ip(ip: &IpAddr, pattern: &IpPattern) -> bool {
         IpPattern::Subnet(subnet) => subnet.contains(ip),
         IpPattern::Range(start, end) => start <= ip && ip <= end,
         IpPattern::Single(addr) => addr == ip,
+        IpPattern::None => false,
     }
 }
 
@@ -206,6 +233,7 @@ fn project_name_valid(name: &str) -> bool {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use serde_json::json;
     use ComputeUserInfoParseError::*;
 
     #[test]
@@ -215,7 +243,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project, None);
+        assert_eq!(user_info.endpoint_id, None);
 
         Ok(())
     }
@@ -230,7 +258,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project, None);
+        assert_eq!(user_info.endpoint_id, None);
 
         Ok(())
     }
@@ -246,7 +274,7 @@ mod tests {
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("foo"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
         assert_eq!(user_info.options.get_cache_key("foo"), "foo");
 
         Ok(())
@@ -262,7 +290,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("bar"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
 
         Ok(())
     }
@@ -277,7 +305,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("bar"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));
 
         Ok(())
     }
@@ -295,7 +323,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert!(user_info.project.is_none());
+        assert!(user_info.endpoint_id.is_none());
 
         Ok(())
     }
@@ -310,7 +338,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
         assert_eq!(user_info.user, "john_doe");
-        assert!(user_info.project.is_none());
+        assert!(user_info.endpoint_id.is_none());
 
         Ok(())
     }
@@ -326,7 +354,7 @@ mod tests {
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
         assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("baz"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));
 
         Ok(())
     }
@@ -340,14 +368,14 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("p1"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
 
         let common_names = Some(["a.com".into(), "b.com".into()].into());
         let sni = Some("p1.b.com");
         let mut ctx = RequestMonitoring::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("p1"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));
 
         Ok(())
     }
@@ -404,7 +432,7 @@ mod tests {
         let mut ctx = RequestMonitoring::test();
         let user_info =
             ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("project"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
         assert_eq!(
             user_info.options.get_cache_key("project"),
             "project endpoint_type:read_write lsn:0/2"
@@ -415,21 +443,17 @@ mod tests {
 
     #[test]
     fn test_check_peer_addr_is_in_list() {
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        assert!(check_peer_addr_is_in_list(&peer_addr, &vec![]));
-        assert!(check_peer_addr_is_in_list(
-            &peer_addr,
-            &vec!["127.0.0.1".into()]
-        ));
-        assert!(!check_peer_addr_is_in_list(
-            &peer_addr,
-            &vec!["8.8.8.8".into()]
-        ));
+        fn check(v: serde_json::Value) -> bool {
+            let peer_addr = IpAddr::from([127, 0, 0, 1]);
+            let ip_list: Vec<IpPattern> = serde_json::from_value(v).unwrap();
+            check_peer_addr_is_in_list(&peer_addr, &ip_list)
+        }
+
+        assert!(check(json!([])));
+        assert!(check(json!(["127.0.0.1"])));
+        assert!(!check(json!(["8.8.8.8"])));
         // If there is an incorrect address, it will be skipped.
-        assert!(check_peer_addr_is_in_list(
-            &peer_addr,
-            &vec!["88.8.8".into(), "127.0.0.1".into()]
-        ));
+        assert!(check(json!(["88.8.8", "127.0.0.1"])));
     }
     #[test]
     fn test_parse_ip_v4() -> anyhow::Result<()> {
diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs
index 372b0764ee..2ddf46fe25 100644
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -4,10 +4,11 @@
 //! UPDATE (Mon Aug  8 13:20:34 UTC 2022): the payload format has been simplified.
 
 use bstr::ByteSlice;
-use smol_str::SmolStr;
+
+use crate::EndpointId;
 
 pub struct PasswordHackPayload {
-    pub endpoint: SmolStr,
+    pub endpoint: EndpointId,
     pub password: Vec<u8>,
 }
 
diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index 57d9e5289d..6f37868a8c 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -11,13 +11,16 @@ use smol_str::SmolStr;
 use tokio::time::Instant;
 use tracing::{debug, info};
 
-use crate::{config::ProjectInfoCacheOptions, console::AuthSecret};
+use crate::{
+    auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId,
+    RoleName,
+};
 
 use super::{Cache, Cached};
 
 pub trait ProjectInfoCache {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr);
-    fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr);
+    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId);
+    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName);
     fn enable_ttl(&self);
     fn disable_ttl(&self);
 }
@@ -44,8 +47,8 @@ impl<T> From<T> for Entry<T> {
 
 #[derive(Default)]
 struct EndpointInfo {
-    secret: std::collections::HashMap<SmolStr, Entry<AuthSecret>>,
-    allowed_ips: Option<Entry<Arc<Vec<SmolStr>>>>,
+    secret: std::collections::HashMap<RoleName, Entry<Option<AuthSecret>>>,
+    allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
 }
 
 impl EndpointInfo {
@@ -57,10 +60,10 @@ impl EndpointInfo {
     }
     pub fn get_role_secret(
         &self,
-        role_name: &SmolStr,
+        role_name: &RoleName,
         valid_since: Instant,
         ignore_cache_since: Option<Instant>,
-    ) -> Option<(AuthSecret, bool)> {
+    ) -> Option<(Option<AuthSecret>, bool)> {
         if let Some(secret) = self.secret.get(role_name) {
             if valid_since < secret.created_at {
                 return Some((
@@ -76,7 +79,7 @@ impl EndpointInfo {
         &self,
         valid_since: Instant,
         ignore_cache_since: Option<Instant>,
-    ) -> Option<(Arc<Vec<SmolStr>>, bool)> {
+    ) -> Option<(Arc<Vec<IpPattern>>, bool)> {
         if let Some(allowed_ips) = &self.allowed_ips {
             if valid_since < allowed_ips.created_at {
                 return Some((
@@ -90,7 +93,7 @@ impl EndpointInfo {
     pub fn invalidate_allowed_ips(&mut self) {
         self.allowed_ips = None;
     }
-    pub fn invalidate_role_secret(&mut self, role_name: &SmolStr) {
+    pub fn invalidate_role_secret(&mut self, role_name: &RoleName) {
         self.secret.remove(role_name);
     }
 }
@@ -103,9 +106,9 @@ impl EndpointInfo {
 /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
 /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
 pub struct ProjectInfoCacheImpl {
-    cache: DashMap<SmolStr, EndpointInfo>,
+    cache: DashMap<EndpointId, EndpointInfo>,
 
-    project2ep: DashMap<SmolStr, HashSet<SmolStr>>,
+    project2ep: DashMap<ProjectId, HashSet<EndpointId>>,
     config: ProjectInfoCacheOptions,
 
     start_time: Instant,
@@ -113,7 +116,7 @@ pub struct ProjectInfoCacheImpl {
 }
 
 impl ProjectInfoCache for ProjectInfoCacheImpl {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr) {
+    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) {
         info!("invalidating allowed ips for project `{}`", project_id);
         let endpoints = self
             .project2ep
@@ -126,7 +129,7 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
             }
         }
     }
-    fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr) {
+    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) {
         info!(
             "invalidating role secret for project_id `{}` and role_name `{}`",
             project_id, role_name
@@ -167,9 +170,9 @@ impl ProjectInfoCacheImpl {
 
     pub fn get_role_secret(
         &self,
-        endpoint_id: &SmolStr,
-        role_name: &SmolStr,
-    ) -> Option<Cached<&Self, AuthSecret>> {
+        endpoint_id: &EndpointId,
+        role_name: &RoleName,
+    ) -> Option<Cached<&Self, Option<AuthSecret>>> {
         let (valid_since, ignore_cache_since) = self.get_cache_times();
         let endpoint_info = self.cache.get(endpoint_id)?;
         let (value, ignore_cache) =
@@ -188,8 +191,8 @@ impl ProjectInfoCacheImpl {
     }
     pub fn get_allowed_ips(
         &self,
-        endpoint_id: &SmolStr,
-    ) -> Option<Cached<&Self, Arc<Vec<SmolStr>>>> {
+        endpoint_id: &EndpointId,
+    ) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
         let (valid_since, ignore_cache_since) = self.get_cache_times();
         let endpoint_info = self.cache.get(endpoint_id)?;
         let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since);
@@ -205,10 +208,10 @@ impl ProjectInfoCacheImpl {
     }
     pub fn insert_role_secret(
         &self,
-        project_id: &SmolStr,
-        endpoint_id: &SmolStr,
-        role_name: &SmolStr,
-        secret: AuthSecret,
+        project_id: &ProjectId,
+        endpoint_id: &EndpointId,
+        role_name: &RoleName,
+        secret: Option<AuthSecret>,
     ) {
         if self.cache.len() >= self.config.size {
             // If there are too many entries, wait until the next gc cycle.
@@ -222,9 +225,9 @@ impl ProjectInfoCacheImpl {
     }
     pub fn insert_allowed_ips(
         &self,
-        project_id: &SmolStr,
-        endpoint_id: &SmolStr,
-        allowed_ips: Arc<Vec<SmolStr>>,
+        project_id: &ProjectId,
+        endpoint_id: &EndpointId,
+        allowed_ips: Arc<Vec<IpPattern>>,
     ) {
         if self.cache.len() >= self.config.size {
             // If there are too many entries, wait until the next gc cycle.
@@ -236,7 +239,7 @@ impl ProjectInfoCacheImpl {
             .or_default()
             .allowed_ips = Some(allowed_ips.into());
     }
-    fn inser_project2endpoint(&self, project_id: &SmolStr, endpoint_id: &SmolStr) {
+    fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) {
         if let Some(mut endpoints) = self.project2ep.get_mut(project_id) {
             endpoints.insert(endpoint_id.clone());
         } else {
@@ -297,18 +300,18 @@ impl ProjectInfoCacheImpl {
 /// This is used to invalidate cache entries.
 pub struct CachedLookupInfo {
     /// Search by this key.
-    endpoint_id: SmolStr,
+    endpoint_id: EndpointId,
     lookup_type: LookupType,
 }
 
 impl CachedLookupInfo {
-    pub(self) fn new_role_secret(endpoint_id: SmolStr, role_name: SmolStr) -> Self {
+    pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self {
         Self {
             endpoint_id,
             lookup_type: LookupType::RoleSecret(role_name),
         }
     }
-    pub(self) fn new_allowed_ips(endpoint_id: SmolStr) -> Self {
+    pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self {
         Self {
             endpoint_id,
             lookup_type: LookupType::AllowedIps,
@@ -317,7 +320,7 @@ impl CachedLookupInfo {
 }
 
 enum LookupType {
-    RoleSecret(SmolStr),
+    RoleSecret(RoleName),
     AllowedIps,
 }
 
@@ -348,7 +351,6 @@ impl Cache for ProjectInfoCacheImpl {
 mod tests {
     use super::*;
     use crate::{console::AuthSecret, scram::ServerSecret};
-    use smol_str::SmolStr;
     use std::{sync::Arc, time::Duration};
 
     #[tokio::test]
@@ -362,11 +364,17 @@ mod tests {
         });
         let project_id = "project".into();
         let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
+        let secret2 = None;
+        let allowed_ips = Arc::new(vec![
+            "127.0.0.1".parse().unwrap(),
+            "127.0.0.2".parse().unwrap(),
+        ]);
         cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
         cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
         cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
@@ -379,8 +387,11 @@ mod tests {
         assert_eq!(cached.value, secret2);
 
         // Shouldn't add more than 2 roles.
-        let user3: SmolStr = "user3".into();
-        let secret3 = AuthSecret::Scram(ServerSecret::mock(user3.as_str(), [3; 32]));
+        let user3: RoleName = "user3".into();
+        let secret3 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user3.as_str(),
+            [3; 32],
+        )));
         cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
         assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
 
@@ -411,11 +422,20 @@ mod tests {
 
         let project_id = "project".into();
         let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user2.as_str(),
+            [2; 32],
+        )));
+        let allowed_ips = Arc::new(vec![
+            "127.0.0.1".parse().unwrap(),
+            "127.0.0.2".parse().unwrap(),
+        ]);
         cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
         cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
         cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
@@ -457,11 +477,20 @@ mod tests {
 
         let project_id = "project".into();
         let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user2.as_str(),
+            [2; 32],
+        )));
+        let allowed_ips = Arc::new(vec![
+            "127.0.0.1".parse().unwrap(),
+            "127.0.0.2".parse().unwrap(),
+        ]);
         cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
         cache.clone().disable_ttl();
         tokio::time::advance(Duration::from_millis(100)).await;
diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs
index c02d65668f..6ef9bcf4eb 100644
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,7 +1,10 @@
 use serde::Deserialize;
-use smol_str::SmolStr;
 use std::fmt;
 
+use crate::auth::IpPattern;
+
+use crate::{BranchId, EndpointId, ProjectId};
+
 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
 #[derive(Debug, Deserialize)]
@@ -14,8 +17,8 @@ pub struct ConsoleError {
 #[derive(Deserialize)]
 pub struct GetRoleSecret {
     pub role_secret: Box<str>,
-    pub allowed_ips: Option<Vec<Box<str>>>,
-    pub project_id: Option<Box<str>>,
+    pub allowed_ips: Option<Vec<IpPattern>>,
+    pub project_id: Option<ProjectId>,
 }
 
 // Manually implement debug to omit sensitive info.
@@ -92,9 +95,9 @@ impl fmt::Debug for DatabaseInfo {
 /// Also known as `ProxyMetricsAuxInfo` in the console.
 #[derive(Debug, Deserialize, Clone, Default)]
 pub struct MetricsAuxInfo {
-    pub endpoint_id: SmolStr,
-    pub project_id: SmolStr,
-    pub branch_id: SmolStr,
+    pub endpoint_id: EndpointId,
+    pub project_id: ProjectId,
+    pub branch_id: BranchId,
 }
 
 impl MetricsAuxInfo {
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 178a7a2f4c..a6dfbd79db 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -4,16 +4,15 @@ pub mod neon;
 
 use super::messages::MetricsAuxInfo;
 use crate::{
-    auth::backend::ComputeUserInfo,
+    auth::{backend::ComputeUserInfo, IpPattern},
     cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru},
     compute,
     config::{CacheOptions, ProjectInfoCacheOptions},
     context::RequestMonitoring,
-    scram,
+    scram, EndpointCacheKey, ProjectId,
 };
 use async_trait::async_trait;
 use dashmap::DashMap;
-use smol_str::SmolStr;
 use std::{sync::Arc, time::Duration};
 use tokio::sync::{OwnedSemaphorePermit, Semaphore};
 use tokio::time::Instant;
@@ -212,9 +211,9 @@ pub enum AuthSecret {
 pub struct AuthInfo {
     pub secret: Option<AuthSecret>,
     /// List of IP addresses allowed for the autorization.
-    pub allowed_ips: Vec<SmolStr>,
+    pub allowed_ips: Vec<IpPattern>,
     /// Project ID. This is used for cache invalidation.
-    pub project_id: Option<SmolStr>,
+    pub project_id: Option<ProjectId>,
 }
 
 /// Info for establishing a connection to a compute node.
@@ -233,10 +232,10 @@ pub struct NodeInfo {
     pub allow_self_signed_compute: bool,
 }
 
-pub type NodeInfoCache = TimedLru<SmolStr, NodeInfo>;
+pub type NodeInfoCache = TimedLru<EndpointCacheKey, NodeInfo>;
 pub type CachedNodeInfo = Cached<&'static NodeInfoCache>;
-pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, AuthSecret>;
-pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<SmolStr>>>;
+pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
+pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
 
 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
@@ -249,7 +248,7 @@ pub trait Api {
         &self,
         ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, errors::GetAuthInfoError>;
+    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
 
     async fn get_allowed_ips(
         &self,
@@ -280,7 +279,7 @@ impl Api for ConsoleBackend {
         &self,
         ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, errors::GetAuthInfoError> {
+    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
         use ConsoleBackend::*;
         match self {
             Console(api) => api.get_role_secret(ctx, user_info).await,
@@ -345,7 +344,7 @@ impl ApiCaches {
 /// Various caches for [`console`](super).
 pub struct ApiLocks {
     name: &'static str,
-    node_locks: DashMap<SmolStr, Arc<Semaphore>>,
+    node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>,
     permits: usize,
     timeout: Duration,
     registered: prometheus::IntCounter,
@@ -413,7 +412,7 @@ impl ApiLocks {
 
     pub async fn get_wake_compute_permit(
         &self,
-        key: &SmolStr,
+        key: &EndpointCacheKey,
     ) -> Result<WakeComputePermit, errors::WakeComputeError> {
         if self.permits == 0 {
             return Ok(WakeComputePermit { permit: None });
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs
index cc35a06708..55f395a403 100644
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -4,14 +4,13 @@ use super::{
     errors::{ApiError, GetAuthInfoError, WakeComputeError},
     AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
 };
-use crate::cache::Cached;
 use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
 use crate::context::RequestMonitoring;
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
+use crate::{auth::IpPattern, cache::Cached};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use smol_str::SmolStr;
-use std::sync::Arc;
+use std::{str::FromStr, sync::Arc};
 use thiserror::Error;
 use tokio_postgres::{config::SslMode, Client};
 use tracing::{error, info, info_span, warn, Instrument};
@@ -88,7 +87,9 @@ impl Api {
             {
                 Some(s) => {
                     info!("got allowed_ips: {s}");
-                    s.split(',').map(String::from).collect()
+                    s.split(',')
+                        .map(|s| IpPattern::from_str(s).unwrap())
+                        .collect()
                 }
                 None => vec![],
             };
@@ -100,7 +101,7 @@ impl Api {
         .await?;
         Ok(AuthInfo {
             secret,
-            allowed_ips: allowed_ips.iter().map(SmolStr::from).collect(),
+            allowed_ips,
             project_id: None,
         })
     }
@@ -150,12 +151,10 @@ impl super::Api for Api {
         &self,
         _ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, GetAuthInfoError> {
-        Ok(self
-            .do_get_auth_info(user_info)
-            .await?
-            .secret
-            .map(CachedRoleSecret::new_uncached))
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
+        Ok(CachedRoleSecret::new_uncached(
+            self.do_get_auth_info(user_info).await?.secret,
+        ))
     }
 
     async fn get_allowed_ips(
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index e8e36815c7..33618faed8 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -14,8 +14,6 @@ use crate::{
 };
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use itertools::Itertools;
-use smol_str::SmolStr;
 use std::sync::Arc;
 use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
@@ -86,20 +84,20 @@ impl Api {
                 },
             };
 
-            let secret = scram::ServerSecret::parse(&body.role_secret)
-                .map(AuthSecret::Scram)
-                .ok_or(GetAuthInfoError::BadSecret)?;
-            let allowed_ips = body
-                .allowed_ips
-                .into_iter()
-                .flatten()
-                .map(SmolStr::from)
-                .collect_vec();
+            let secret = if body.role_secret.is_empty() {
+                None
+            } else {
+                let secret = scram::ServerSecret::parse(&body.role_secret)
+                    .map(AuthSecret::Scram)
+                    .ok_or(GetAuthInfoError::BadSecret)?;
+                Some(secret)
+            };
+            let allowed_ips = body.allowed_ips.unwrap_or_default();
             ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64);
             Ok(AuthInfo {
-                secret: Some(secret),
+                secret,
                 allowed_ips,
-                project_id: body.project_id.map(SmolStr::from),
+                project_id: body.project_id,
             })
         }
         .map_err(crate::error::log_error)
@@ -172,19 +170,20 @@ impl super::Api for Api {
         &self,
         ctx: &mut RequestMonitoring,
         user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, GetAuthInfoError> {
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
         let ep = &user_info.endpoint;
         let user = &user_info.user;
         if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) {
-            return Ok(Some(role_secret));
+            return Ok(role_secret);
         }
         let auth_info = self.do_get_auth_info(ctx, user_info).await?;
         if let Some(project_id) = auth_info.project_id {
-            if let Some(secret) = &auth_info.secret {
-                self.caches
-                    .project_info
-                    .insert_role_secret(&project_id, ep, user, secret.clone())
-            }
+            self.caches.project_info.insert_role_secret(
+                &project_id,
+                ep,
+                user,
+                auth_info.secret.clone(),
+            );
             self.caches.project_info.insert_allowed_ips(
                 &project_id,
                 ep,
@@ -192,7 +191,7 @@ impl super::Api for Api {
             );
         }
         // When we just got a secret, we don't need to invalidate it.
-        Ok(auth_info.secret.map(Cached::new_uncached))
+        Ok(Cached::new_uncached(auth_info.secret))
     }
 
     async fn get_allowed_ips(
@@ -214,11 +213,12 @@ impl super::Api for Api {
         let allowed_ips = Arc::new(auth_info.allowed_ips);
         let user = &user_info.user;
         if let Some(project_id) = auth_info.project_id {
-            if let Some(secret) = &auth_info.secret {
-                self.caches
-                    .project_info
-                    .insert_role_secret(&project_id, ep, user, secret.clone())
-            }
+            self.caches.project_info.insert_role_secret(
+                &project_id,
+                ep,
+                user,
+                auth_info.secret.clone(),
+            );
             self.caches
                 .project_info
                 .insert_allowed_ips(&project_id, ep, allowed_ips.clone());
@@ -238,7 +238,7 @@ impl super::Api for Api {
         // for some time (highly depends on the console's scale-to-zero policy);
         // The connection info remains the same during that period of time,
         // which means that we might cache it to reduce the load and latency.
-        if let Some(cached) = self.caches.node_info.get(&*key) {
+        if let Some(cached) = self.caches.node_info.get(&key) {
             info!(key = &*key, "found cached compute node info");
             return Ok(cached);
         }
diff --git a/proxy/src/context.rs b/proxy/src/context.rs
index 8a1aa4aec9..9e2ea10031 100644
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -7,7 +7,10 @@ use std::net::IpAddr;
 use tokio::sync::mpsc;
 use uuid::Uuid;
 
-use crate::{console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer};
+use crate::{
+    console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId,
+    EndpointId, ProjectId, RoleName,
+};
 
 pub mod parquet;
 
@@ -26,10 +29,10 @@ pub struct RequestMonitoring {
     region: &'static str,
 
     // filled in as they are discovered
-    project: Option<SmolStr>,
-    branch: Option<SmolStr>,
-    endpoint_id: Option<SmolStr>,
-    user: Option<SmolStr>,
+    project: Option<ProjectId>,
+    branch: Option<BranchId>,
+    endpoint_id: Option<EndpointId>,
+    user: Option<RoleName>,
     application: Option<SmolStr>,
     error_kind: Option<ErrorKind>,
     success: bool,
@@ -86,7 +89,7 @@ impl RequestMonitoring {
         self.project = Some(x.project_id);
     }
 
-    pub fn set_endpoint_id(&mut self, endpoint_id: Option<SmolStr>) {
+    pub fn set_endpoint_id(&mut self, endpoint_id: Option<EndpointId>) {
         self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
     }
 
@@ -94,7 +97,7 @@ impl RequestMonitoring {
         self.application = app.or_else(|| self.application.clone());
     }
 
-    pub fn set_user(&mut self, user: SmolStr) {
+    pub fn set_user(&mut self, user: RoleName) {
         self.user = Some(user);
     }
 
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index a22b2459b8..a9e4a38302 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -62,3 +62,79 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallib
 pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
     r.context("join error").and_then(|x| x)
 }
+
+macro_rules! smol_str_wrapper {
+    ($name:ident) => {
+        #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
+        pub struct $name(smol_str::SmolStr);
+
+        impl $name {
+            pub fn as_str(&self) -> &str {
+                self.0.as_str()
+            }
+        }
+
+        impl std::fmt::Display for $name {
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                self.0.fmt(f)
+            }
+        }
+
+        impl<T> std::cmp::PartialEq<T> for $name
+        where
+            smol_str::SmolStr: std::cmp::PartialEq<T>,
+        {
+            fn eq(&self, other: &T) -> bool {
+                self.0.eq(other)
+            }
+        }
+
+        impl<T> From<T> for $name
+        where
+            smol_str::SmolStr: From<T>,
+        {
+            fn from(x: T) -> Self {
+                Self(x.into())
+            }
+        }
+
+        impl AsRef<str> for $name {
+            fn as_ref(&self) -> &str {
+                self.0.as_ref()
+            }
+        }
+
+        impl std::ops::Deref for $name {
+            type Target = str;
+            fn deref(&self) -> &str {
+                &*self.0
+            }
+        }
+
+        impl<'de> serde::de::Deserialize<'de> for $name {
+            fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
+                <smol_str::SmolStr as serde::de::Deserialize<'de>>::deserialize(d).map(Self)
+            }
+        }
+
+        impl serde::Serialize for $name {
+            fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
+                self.0.serialize(s)
+            }
+        }
+    };
+}
+
+// 90% of role name strings are 20 characters or less.
+smol_str_wrapper!(RoleName);
+// 50% of endpoint strings are 23 characters or less.
+smol_str_wrapper!(EndpointId);
+// 50% of branch strings are 23 characters or less.
+smol_str_wrapper!(BranchId);
+// 90% of project strings are 23 characters or less.
+smol_str_wrapper!(ProjectId);
+
+// will usually equal endpoint ID
+smol_str_wrapper!(EndpointCacheKey);
+
+smol_str_wrapper!(DbName);
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 635d157383..087cc7f7a9 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -19,6 +19,7 @@ use crate::{
     rate_limiter::EndpointRateLimiter,
     stream::{PqStream, Stream},
     usage_metrics::{Ids, USAGE_METRICS},
+    EndpointCacheKey,
 };
 use anyhow::{bail, Context};
 use futures::TryFutureExt;
@@ -26,7 +27,7 @@ use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use regex::Regex;
-use smol_str::SmolStr;
+use smol_str::{format_smolstr, SmolStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
@@ -516,20 +517,21 @@ impl NeonOptions {
         Self(options)
     }
 
-    pub fn get_cache_key(&self, prefix: &str) -> SmolStr {
+    pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
         // prefix + format!(" {k}:{v}")
         // kinda jank because SmolStr is immutable
         std::iter::once(prefix)
             .chain(self.0.iter().flat_map(|(k, v)| [" ", &**k, ":", &**v]))
-            .collect()
+            .collect::<SmolStr>()
+            .into()
     }
 
     /// <https://swagger.io/docs/specification/serialization/> DeepObject format
     /// `paramName[prop1]=value1&paramName[prop2]=value2&...`
-    pub fn to_deep_object(&self) -> Vec<(String, SmolStr)> {
+    pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> {
         self.0
             .iter()
-            .map(|(k, v)| (format!("options[{}]", k), v.clone()))
+            .map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone()))
             .collect()
     }
 }
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 73fde2d7d0..a552a857b9 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -6,13 +6,13 @@ use super::connect_compute::ConnectMechanism;
 use super::retry::ShouldRetry;
 use super::*;
 use crate::auth::backend::{ComputeUserInfo, TestBackend};
+use crate::auth::IpPattern;
 use crate::config::CertResolver;
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
 use crate::{auth, http, sasl, scram};
 use async_trait::async_trait;
 use rstest::rstest;
-use smol_str::SmolStr;
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::{MakeTlsConnect, NoTls};
 use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream};
@@ -471,7 +471,7 @@ impl TestBackend for TestConnectMechanism {
         }
     }
 
-    fn get_allowed_ips(&self) -> Result<Vec<SmolStr>, console::errors::GetAuthInfoError> {
+    fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError> {
         unimplemented!("not used in tests")
     }
 }
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index a190b2cf8f..cbae72711c 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -11,11 +11,12 @@ use anyhow::bail;
 use dashmap::DashMap;
 use itertools::Itertools;
 use rand::{rngs::StdRng, Rng, SeedableRng};
-use smol_str::SmolStr;
 use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
 use tokio::time::{timeout, Duration, Instant};
 use tracing::info;
 
+use crate::EndpointId;
+
 use super::{
     limit_algorithm::{LimitAlgorithm, Sample},
     RateLimiterConfig,
@@ -33,7 +34,7 @@ use super::{
 // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
 // I went with a more expensive way that yields user-friendlier error messages.
 pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
-    map: DashMap<SmolStr, Vec<RateBucket>, Hasher>,
+    map: DashMap<EndpointId, Vec<RateBucket>, Hasher>,
     info: &'static [RateBucketInfo],
     access_count: AtomicUsize,
     rand: Mutex<Rand>,
@@ -146,7 +147,7 @@ impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
     }
 
     /// Check that number of connections to the endpoint is below `max_rps` rps.
-    pub fn check(&self, endpoint: SmolStr) -> bool {
+    pub fn check(&self, endpoint: EndpointId) -> bool {
         // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
         // worst case memory usage is about:
         //    = 2 * 2048 * 64 * (48B + 72B)
@@ -493,11 +494,13 @@ mod tests {
     use futures::{task::noop_waker_ref, Future};
     use rand::SeedableRng;
     use rustc_hash::FxHasher;
-    use smol_str::SmolStr;
     use tokio::time;
 
     use super::{EndpointRateLimiter, Limiter, Outcome};
-    use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm};
+    use crate::{
+        rate_limiter::{RateBucketInfo, RateLimitAlgorithm},
+        EndpointId,
+    };
 
     #[tokio::test]
     async fn it_works() {
@@ -654,7 +657,7 @@ mod tests {
         RateBucketInfo::validate(&mut rates).unwrap();
         let limiter = EndpointRateLimiter::new(Vec::leak(rates));
 
-        let endpoint = SmolStr::from("ep-my-endpoint-1234");
+        let endpoint = EndpointId::from("ep-my-endpoint-1234");
 
         time::pause();
 
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index d28dcbd1a7..9cd70b109b 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -3,9 +3,8 @@ use std::{convert::Infallible, sync::Arc};
 use futures::StreamExt;
 use redis::aio::PubSub;
 use serde::Deserialize;
-use smol_str::SmolStr;
 
-use crate::cache::project_info::ProjectInfoCache;
+use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName};
 
 const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
 const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
@@ -46,12 +45,12 @@ enum Notification {
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct AllowedIpsUpdate {
-    project_id: SmolStr,
+    project_id: ProjectId,
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct PasswordUpdate {
-    project_id: SmolStr,
-    role_name: SmolStr,
+    project_id: ProjectId,
+    role_name: RoleName,
 }
 fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
 where
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 8af008394a..dfef4ccdfa 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -41,6 +41,8 @@ use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};
 
+pub const SERVERLESS_DRIVER_SNI: &str = "api";
+
 pub async fn task_main(
     config: &'static ProxyConfig,
     ws_listener: TcpListener,
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index c07cc2816e..5a7279ae63 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -31,6 +31,7 @@ use crate::{
     metrics::NUM_DB_CONNECTIONS_GAUGE,
     proxy::connect_compute::ConnectMechanism,
     usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
+    DbName, EndpointCacheKey, RoleName,
 };
 use crate::{compute, config};
 
@@ -42,17 +43,17 @@ pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http");
 #[derive(Debug, Clone)]
 pub struct ConnInfo {
     pub user_info: ComputeUserInfo,
-    pub dbname: SmolStr,
+    pub dbname: DbName,
     pub password: SmolStr,
 }
 
 impl ConnInfo {
     // hm, change to hasher to avoid cloning?
-    pub fn db_and_user(&self) -> (SmolStr, SmolStr) {
+    pub fn db_and_user(&self) -> (DbName, RoleName) {
         (self.dbname.clone(), self.user_info.user.clone())
     }
 
-    pub fn endpoint_cache_key(&self) -> SmolStr {
+    pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
         self.user_info.endpoint_cache_key()
     }
 }
@@ -79,14 +80,14 @@ struct ConnPoolEntry {
 // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
 pub struct EndpointConnPool {
-    pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
+    pools: HashMap<(DbName, RoleName), DbUserConnPool>,
     total_conns: usize,
     max_conns: usize,
     _guard: IntCounterPairGuard,
 }
 
 impl EndpointConnPool {
-    fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option<ConnPoolEntry> {
+    fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry> {
         let Self {
             pools, total_conns, ..
         } = self;
@@ -95,7 +96,7 @@ impl EndpointConnPool {
             .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
     }
 
-    fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool {
+    fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool {
         let Self {
             pools, total_conns, ..
         } = self;
@@ -196,7 +197,7 @@ pub struct GlobalConnPool {
     //
     // That should be a fairly conteded map, so return reference to the per-endpoint
     // pool as early as possible and release the lock.
-    global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
+    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool>>>,
 
     /// Number of endpoint-connection pools
     ///
@@ -440,7 +441,10 @@ impl GlobalConnPool {
         Ok(Client::new(new_client, conn_info, endpoint_pool).await)
     }
 
-    fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
+    fn get_or_create_endpoint_pool(
+        &self,
+        endpoint: &EndpointCacheKey,
+    ) -> Arc<RwLock<EndpointConnPool>> {
         // fast path
         if let Some(pool) = self.global_pool.get(endpoint) {
             return pool.clone();
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 9b32ae7f25..1e2ddaa2ff 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;
 
 use anyhow::bail;
+use anyhow::Context;
 use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
@@ -13,7 +14,6 @@ use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Map;
 use serde_json::Value;
-use smol_str::SmolStr;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
 use tokio_postgres::types::Kind;
@@ -36,9 +36,11 @@ use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
+use crate::RoleName;
 
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
+use super::SERVERLESS_DRIVER_SNI;
 
 #[derive(serde::Deserialize)]
 struct QueryData {
@@ -60,7 +62,6 @@ enum Payload {
 
 const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
 const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
-const SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART: &str = "api";
 
 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
@@ -155,7 +156,7 @@ fn get_conn_info(
         .next()
         .ok_or(anyhow::anyhow!("invalid database name"))?;
 
-    let username = SmolStr::from(connection_url.username());
+    let username = RoleName::from(connection_url.username());
     if username.is_empty() {
         return Err(anyhow::anyhow!("missing username"));
     }
@@ -187,9 +188,7 @@ fn get_conn_info(
         }
     }
 
-    let endpoint = endpoint_sni(hostname, &tls.common_names)?;
-
-    let endpoint: SmolStr = endpoint.into();
+    let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?;
     ctx.set_endpoint_id(Some(endpoint.clone()));
 
     let pairs = connection_url.query_pairs();
@@ -226,8 +225,7 @@ fn check_matches(sni_hostname: &str, hostname: &str) -> Result<bool, anyhow::Err
     let (_, hostname_rest) = hostname
         .split_once('.')
         .ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?;
-    Ok(sni_hostname_rest == hostname_rest
-        && sni_hostname_first == SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART)
+    Ok(sni_hostname_rest == hostname_rest && sni_hostname_first == SERVERLESS_DRIVER_SNI)
 }
 
 // TODO: return different http error codes
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index 789a4c680c..d75aedf89b 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -1,12 +1,11 @@
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
-use crate::{config::MetricCollectionConfig, http};
+use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use dashmap::{mapref::entry::Entry, DashMap};
 use once_cell::sync::Lazy;
 use serde::{Deserialize, Serialize};
-use smol_str::SmolStr;
 use std::{
     convert::Infallible,
     sync::{
@@ -30,8 +29,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// because we enrich the event with project_id in the control-plane endpoint.
 #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
 pub struct Ids {
-    pub endpoint_id: SmolStr,
-    pub branch_id: SmolStr,
+    pub endpoint_id: EndpointId,
+    pub branch_id: BranchId,
 }
 
 #[derive(Debug)]
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 8810252e0b..fbba2e00fc 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -110,7 +110,7 @@ pub static REMOVED_WAL_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
 pub static BACKED_UP_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "safekeeper_backed_up_segments_total",
-        "Number of WAL segments backed up to the broker"
+        "Number of WAL segments backed up to the S3"
     )
     .expect("Failed to register safekeeper_backed_up_segments_total counter")
 });
@@ -337,6 +337,7 @@ pub struct TimelineCollector {
     flushed_wal_seconds: GaugeVec,
     collect_timeline_metrics: Gauge,
     timelines_count: IntGauge,
+    active_timelines_count: IntGauge,
 }
 
 impl Default for TimelineCollector {
@@ -520,6 +521,13 @@ impl TimelineCollector {
         .unwrap();
         descs.extend(timelines_count.desc().into_iter().cloned());
 
+        let active_timelines_count = IntGauge::new(
+            "safekeeper_active_timelines",
+            "Total number of active timelines",
+        )
+        .unwrap();
+        descs.extend(active_timelines_count.desc().into_iter().cloned());
+
         TimelineCollector {
             descs,
             commit_lsn,
@@ -540,6 +548,7 @@ impl TimelineCollector {
             flushed_wal_seconds,
             collect_timeline_metrics,
             timelines_count,
+            active_timelines_count,
         }
     }
 }
@@ -572,6 +581,7 @@ impl Collector for TimelineCollector {
 
         let timelines = GlobalTimelines::get_all();
         let timelines_count = timelines.len();
+        let mut active_timelines_count = 0;
 
         // Prometheus Collector is sync, and data is stored under async lock. To
         // bridge the gap with a crutch, collect data in spawned thread with
@@ -590,6 +600,10 @@ impl Collector for TimelineCollector {
             let timeline_id = tli.ttid.timeline_id.to_string();
             let labels = &[tenant_id.as_str(), timeline_id.as_str()];
 
+            if tli.timeline_is_active {
+                active_timelines_count += 1;
+            }
+
             self.commit_lsn
                 .with_label_values(labels)
                 .set(tli.mem_state.commit_lsn.into());
@@ -681,6 +695,10 @@ impl Collector for TimelineCollector {
 
         // report total number of timelines
         self.timelines_count.set(timelines_count as i64);
+        self.active_timelines_count
+            .set(active_timelines_count as i64);
         mfs.extend(self.timelines_count.collect());
+        // the active-timelines gauge must be collected as well, otherwise it is never exported
+        mfs.extend(self.active_timelines_count.collect());
 
         mfs
diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs
index d96eedf401..9dce06a886 100644
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -7,12 +7,21 @@ use tracing::*;
 
 use crate::{GlobalTimelines, SafeKeeperConf};
 
+const ALLOW_INACTIVE_TIMELINES: bool = true;
+
 pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
     let wal_removal_interval = Duration::from_millis(5000);
     loop {
+        let now = tokio::time::Instant::now();
+        let mut active_timelines = 0;
+
         let tlis = GlobalTimelines::get_all();
         for tli in &tlis {
-            if !tli.is_active().await {
+            let is_active = tli.is_active().await;
+            if is_active {
+                active_timelines += 1;
+            }
+            if !ALLOW_INACTIVE_TIMELINES && !is_active {
                 continue;
             }
             let ttid = tli.ttid;
@@ -27,6 +36,17 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
             .instrument(info_span!("WAL removal", ttid = %ttid))
             .await;
         }
+
+        let elapsed = now.elapsed();
+        let total_timelines = tlis.len();
+
+        if elapsed > wal_removal_interval {
+            info!(
+                "WAL removal is too long, processed {} active timelines ({} total) in {:?}",
+                active_timelines, total_timelines, elapsed
+            );
+        }
+
         sleep(wal_removal_interval).await;
     }
 }
diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py
index b07e4bea9b..61a97f520d 100755
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -3,6 +3,7 @@
 import argparse
 import json
 import logging
+import os
 from collections import defaultdict
 from typing import DefaultDict, Dict
 
@@ -45,6 +46,15 @@ def main(args: argparse.Namespace):
         logging.error("cannot fetch flaky tests from the DB due to an error", exc)
         rows = []
 
+    # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not std-fs),
+    # use it to parametrize test name along with build_type and pg_version
+    #
+    # See test_runner/fixtures/parametrize.py for details
+    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
+        pageserver_virtual_file_io_engine_parameter = f"-{io_engine}"
+    else:
+        pageserver_virtual_file_io_engine_parameter = ""
+
     for row in rows:
         # We don't want to automatically rerun tests in a performance suite
         if row["parent_suite"] != "test_runner.regress":
@@ -53,10 +63,10 @@ def main(args: argparse.Namespace):
         if row["name"].endswith("]"):
             parametrized_test = row["name"].replace(
                 "[",
-                f"[{build_type}-pg{pg_version}-",
+                f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}-",
             )
         else:
-            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]"
+            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}]"
 
         res[row["parent_suite"]][row["suite"]][parametrized_test] = True
 
diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store
index 00d37e5f83..4cca3a9857 100755
--- a/scripts/ps_ec2_setup_instance_store
+++ b/scripts/ps_ec2_setup_instance_store
@@ -39,6 +39,9 @@ SETUP COMPLETE
 To run your local neon.git build on the instance store volume,
 run the following commands from the top of the neon.git checkout
 
+    # raise file descriptor limit of your shell and its child processes
+    sudo prlimit -p $$ --nofile=800000:800000
+
     # test suite run
     export TEST_OUTPUT="$TEST_OUTPUT"
     DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index d98aedf4d0..bbabfeedf6 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import abc
 import asyncio
+import concurrent.futures
 import filecmp
 import json
 import os
@@ -10,16 +11,18 @@ import shutil
 import subprocess
 import tempfile
 import textwrap
+import threading
 import time
 import uuid
 from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from datetime import datetime
+from fcntl import LOCK_EX, LOCK_UN, flock
 from functools import cached_property
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
 from urllib.parse import urlparse
 
 import asyncpg
@@ -49,7 +52,10 @@ from fixtures.pageserver.allowed_errors import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.types import IndexPartDump
-from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.pageserver.utils import (
+    wait_for_last_record_lsn,
+    wait_for_upload,
+)
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import (
@@ -424,6 +430,7 @@ class NeonEnvBuilder:
         pg_distrib_dir: Path,
         pg_version: PgVersion,
         test_name: str,
+        top_output_dir: Path,
         test_output_dir: Path,
         test_overlay_dir: Optional[Path] = None,
         pageserver_remote_storage: Optional[RemoteStorage] = None,
@@ -440,6 +447,7 @@ class NeonEnvBuilder:
         preserve_database_files: bool = False,
         initial_tenant: Optional[TenantId] = None,
         initial_timeline: Optional[TimelineId] = None,
+        pageserver_virtual_file_io_engine: Optional[str] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -473,6 +481,9 @@ class NeonEnvBuilder:
         self.test_overlay_dir = test_overlay_dir
         self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = []
         self.config_init_force: Optional[str] = None
+        self.top_output_dir = top_output_dir
+
+        self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
 
         assert test_name.startswith(
             "test_"
@@ -526,6 +537,64 @@ class NeonEnvBuilder:
 
         return env
 
+    def build_and_use_snapshot(
+        self, global_ident: str, create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv]
+    ) -> NeonEnv:
+        if os.getenv("CI", "false") == "true":
+            log.info("do not use snapshots in ephemeral CI environment")
+            env = create_env_for_snapshot(self)
+            env.stop(immediate=True, ps_assert_metric_no_errors=False)
+            return env
+
+        with shared_snapshot_dir(self.top_output_dir, global_ident) as snapshot_dir:
+            if not snapshot_dir.is_initialized():
+                self._build_and_use_snapshot_impl(snapshot_dir, create_env_for_snapshot)
+                assert snapshot_dir.is_initialized()
+
+            return self.from_repo_dir(snapshot_dir.path)
+
+    def _build_and_use_snapshot_impl(
+        self,
+        snapshot_dir: SnapshotDirLocked,
+        create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv],
+    ):
+        if snapshot_dir.path.exists():
+            shutil.rmtree(snapshot_dir.path)
+
+        if self.test_overlay_dir is not None:
+            # Make repo_dir an overlayfs mount with lowerdir being the empty snapshot_dir.
+            # When we're done filling up repo_dir, tear everything down, unmount the overlayfs, and use
+            # the upperdir as the snapshot. This is equivalent to docker `FROM scratch`.
+            assert not self.repo_dir.exists()
+            assert self.repo_dir.parent.exists()
+            snapshot_dir.path.mkdir()
+            self.overlay_mount("create-snapshot-repo-dir", snapshot_dir.path, self.repo_dir)
+            self.config_init_force = "empty-dir-ok"
+
+        env = create_env_for_snapshot(self)
+        assert self.env is not None
+        assert self.env == env
+
+        # shut down everything for snapshot
+        env.stop(immediate=True, ps_assert_metric_no_errors=True)
+
+        # TODO: all kinds of assertions to ensure the env is unused
+
+        if self.test_overlay_dir is None:
+            log.info("take snapshot by moving repo dir")
+            env.repo_dir.rename(snapshot_dir.path)
+        else:
+            log.info("take snapshot by using overlayfs upperdir")
+            self.overlay_unmount_and_move("create-snapshot-repo-dir", snapshot_dir.path)
+            log.info("remove empty repo_dir (previously mountpoint) for snapshot overlay_mount")
+            env.repo_dir.rmdir()
+            # TODO from here on, we should be able to reset / goto top where snapshot_dir.is_initialized()
+            log.info("make repo_dir an overlayfs mount of the snapshot we just created")
+        assert not env.repo_dir.exists(), "both branches above should remove it"
+        snapshot_dir.set_initialized()
+
+        self.env = None  # so that from_repo_dir works again
+
     def from_repo_dir(
         self,
         repo_dir: Path,
@@ -557,10 +626,15 @@ class NeonEnvBuilder:
             tenants_from_dir = ps_dir / "tenants"
             tenants_to_dir = self.repo_dir / ps_dir.name / "tenants"
 
-            log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}")
             if self.test_overlay_dir is None:
+                log.info(
+                    f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}"
+                )
                 shutil.copytree(tenants_from_dir, tenants_to_dir)
             else:
+                log.info(
+                    f"Creating overlayfs mount of pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}"
+                )
                 self.overlay_mount(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir)
 
         for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"):
@@ -571,10 +645,12 @@ class NeonEnvBuilder:
 
         shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True)
         if self.test_overlay_dir is None:
+            log.info("Copying local_fs_remote_storage directory from snapshot")
             shutil.copytree(
                 repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage"
             )
         else:
+            log.info("Creating overlayfs mount of local_fs_remote_storage directory from snapshot")
             self.overlay_mount(
                 "local_fs_remote_storage",
                 repo_dir / "local_fs_remote_storage",
@@ -631,6 +707,54 @@ class NeonEnvBuilder:
         )
         self.overlay_mounts_created_by_us.append((ident, dstdir))
 
+    def _overlay_umount(self, mountpoint: Path):
+        cmd = ["sudo", "umount", str(mountpoint)]
+        assert mountpoint.is_mount()
+        subprocess_capture(
+            self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+        )
+
+    def overlay_unmount_and_move(self, ident: str, dst: Path):
+        """
+        Unmount the overlayfs mount previously registered under `ident` and move its upperdir contents to `dst`.
+        If `dst` is an empty directory, it gets replaced.
+        Caller is responsible for ensuring the unmount will succeed, i.e., that there aren't any nested mounts.
+
+        Raises exception if self.test_overlay_dir is None
+        """
+        assert self.test_overlay_dir is not None
+        # not mutating state yet, make checks
+        ident_state_dir = self.test_overlay_dir / ident
+        assert ident_state_dir.is_dir()
+        upper = ident_state_dir / "upper"
+        work = ident_state_dir / "work"
+        assert upper.is_dir()
+        assert work.is_dir()
+        assert (
+            self.test_overlay_dir not in dst.parents
+        ), "otherwise workdir cleanup below wouldn't work"
+        # find index, still not mutating state
+        idxmap = {
+            existing_ident: idx
+            for idx, (existing_ident, _) in enumerate(self.overlay_mounts_created_by_us)
+        }
+        idx = idxmap.get(ident)
+        if idx is None:
+            raise RuntimeError(f"cannot find mount for ident {ident}")
+
+        if dst.is_dir():
+            dst.rmdir()  # raises exception if not empty, which is what we want
+
+        _, mountpoint = self.overlay_mounts_created_by_us.pop(idx)
+        self._overlay_umount(mountpoint)
+        upper.rename(dst)
+        # we moved the upperdir, clean up workdir and then its parent ident_state_dir
+        cmd = ["sudo", "rm", "-rf", str(work)]
+        subprocess_capture(
+            self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+        )
+        ident_state_dir.rmdir()  # should be empty since we moved `upper` out
+
     def overlay_cleanup_teardown(self):
         """
         Unmount the overlayfs mounts created by `self.overlay_mount()`.
@@ -641,13 +765,10 @@ class NeonEnvBuilder:
         while len(self.overlay_mounts_created_by_us) > 0:
             (ident, mountpoint) = self.overlay_mounts_created_by_us.pop()
             ident_state_dir = self.test_overlay_dir / ident
-            cmd = ["sudo", "umount", str(mountpoint)]
             log.info(
-                f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}: {cmd}"
-            )
-            subprocess_capture(
-                self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+                f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}"
             )
+            self._overlay_umount(mountpoint)
             log.info(
                 f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}"
             )
@@ -725,8 +846,15 @@ class NeonEnvBuilder:
         if self.preserve_database_files:
             return
 
+        overlayfs_mounts = {mountpoint for _, mountpoint in self.overlay_mounts_created_by_us}
+
         directories_to_clean: List[Path] = []
         for test_entry in Path(self.repo_dir).glob("**/*"):
+            # Skip overlayfs mountpoints themselves and any entry located below one of them.
+            if test_entry in overlayfs_mounts or any(
+                parent in overlayfs_mounts for parent in test_entry.parents
+            ):
+                continue
             if test_entry.is_file():
                 test_file = test_entry
                 if ATTACHMENT_NAME_REGEX.fullmatch(test_file.name):
@@ -775,13 +903,6 @@ class NeonEnvBuilder:
                     log.error(f"Error during remote storage scrub: {e}")
                     cleanup_error = e
 
-            try:
-                self.overlay_cleanup_teardown()
-            except Exception as e:
-                log.error(f"Error cleaning up overlay state: {e}")
-                if cleanup_error is not None:
-                    cleanup_error = e
-
             try:
                 self.cleanup_remote_storage()
             except Exception as e:
@@ -802,6 +923,13 @@ class NeonEnvBuilder:
             for pageserver in self.env.pageservers:
                 pageserver.assert_no_errors()
 
+        try:
+            self.overlay_cleanup_teardown()
+        except Exception as e:
+            log.error(f"Error cleaning up overlay state: {e}")
+            if cleanup_error is not None:
+                cleanup_error = e
+
 
 class NeonEnv:
     """
@@ -866,11 +994,18 @@ class NeonEnv:
         self.initial_timeline = config.initial_timeline
 
         attachment_service_port = self.port_distributor.get_port()
+        # Reserve the next port after attachment service for use by its postgres: this
+        # will assert out if the next port wasn't free.
+        attachment_service_pg_port = self.port_distributor.get_port()
+        assert attachment_service_pg_port == attachment_service_port + 1
+
         self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
         self.attachment_service: NeonAttachmentService = NeonAttachmentService(
             self, config.auth_enabled
         )
 
+        self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
+
         # Create a config file corresponding to the options
         cfg: Dict[str, Any] = {
             "default_tenant_id": str(self.initial_tenant),
@@ -902,6 +1037,9 @@ class NeonEnv:
                 "pg_auth_type": pg_auth_type,
                 "http_auth_type": http_auth_type,
             }
+            if self.pageserver_virtual_file_io_engine is not None:
+                ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
+
             # Create a corresponding NeonPageserver object
             self.pageservers.append(
                 NeonPageserver(
@@ -939,16 +1077,27 @@ class NeonEnv:
         self.neon_cli.init(cfg, force=config.config_init_force)
 
     def start(self):
-        # Start up broker, pageserver and all safekeepers
-        self.broker.try_start()
-
+        # Attachment service starts first, so that pageserver /re-attach calls don't
+        # bounce through retries on startup
         self.attachment_service.start()
 
-        for pageserver in self.pageservers:
-            pageserver.start()
+        # Start up broker, pageserver and all safekeepers
+        futs = []
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=2 + len(self.pageservers) + len(self.safekeepers)
+        ) as executor:
+            futs.append(
+                executor.submit(lambda: self.broker.try_start() or None)
+            )  # The `or None` is for the linter
 
-        for safekeeper in self.safekeepers:
-            safekeeper.start()
+            for pageserver in self.pageservers:
+                futs.append(executor.submit(lambda ps=pageserver: ps.start()))
+
+            for safekeeper in self.safekeepers:
+                futs.append(executor.submit(lambda sk=safekeeper: sk.start()))
+
+        for f in futs:
+            f.result()
 
     def stop(self, immediate=False, ps_assert_metric_no_errors=False):
         """
@@ -971,7 +1120,9 @@ class NeonEnv:
         assert that there is only one. Tests with multiple pageservers should always use
         get_pageserver with an explicit ID.
         """
-        assert len(self.pageservers) == 1
+        assert (
+            len(self.pageservers) == 1
+        ), "env.pageserver must only be used with single pageserver NeonEnv"
         return self.pageservers[0]
 
     def get_pageserver(self, id: Optional[int]) -> NeonPageserver:
@@ -1065,6 +1216,7 @@ def _shared_simple_env(
     neon_binpath: Path,
     pg_distrib_dir: Path,
     pg_version: PgVersion,
+    pageserver_virtual_file_io_engine: str,
 ) -> Iterator[NeonEnv]:
     """
     # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
@@ -1082,6 +1234,7 @@ def _shared_simple_env(
         shutil.rmtree(repo_dir, ignore_errors=True)
 
     with NeonEnvBuilder(
+        top_output_dir=top_output_dir,
         repo_dir=repo_dir,
         port_distributor=port_distributor,
         broker=default_broker,
@@ -1093,6 +1246,7 @@ def _shared_simple_env(
         preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
         test_name=request.node.name,
         test_output_dir=test_output_dir,
+        pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
     ) as builder:
         env = builder.init_start()
 
@@ -1130,6 +1284,8 @@ def neon_env_builder(
     run_id: uuid.UUID,
     request: FixtureRequest,
     test_overlay_dir: Path,
+    top_output_dir: Path,
+    pageserver_virtual_file_io_engine: str,
 ) -> Iterator[NeonEnvBuilder]:
     """
     Fixture to create a Neon environment for test.
@@ -1149,6 +1305,7 @@ def neon_env_builder(
 
     # Return the builder to the caller
     with NeonEnvBuilder(
+        top_output_dir=top_output_dir,
         repo_dir=Path(repo_dir),
         port_distributor=port_distributor,
         mock_s3_server=mock_s3_server,
@@ -1158,6 +1315,7 @@ def neon_env_builder(
         broker=default_broker,
         run_id=run_id,
         preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
+        pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
         test_name=request.node.name,
         test_output_dir=test_output_dir,
         test_overlay_dir=test_overlay_dir,
@@ -1511,8 +1669,10 @@ class NeonCli(AbstractNeonCli):
         id: int,
         overrides: Tuple[str, ...] = (),
         extra_env_vars: Optional[Dict[str, str]] = None,
+        register: bool = True,
     ) -> "subprocess.CompletedProcess[str]":
-        start_args = ["pageserver", "start", f"--id={id}", *overrides]
+        register_str = "true" if register else "false"
+        start_args = ["pageserver", "start", f"--id={id}", *overrides, f"--register={register_str}"]
         storage = self.env.pageserver_remote_storage
         append_pageserver_param_overrides(
             params_to_update=start_args,
@@ -1939,6 +2099,7 @@ class NeonPageserver(PgProtocol):
         self,
         overrides: Tuple[str, ...] = (),
         extra_env_vars: Optional[Dict[str, str]] = None,
+        register: bool = True,
     ) -> "NeonPageserver":
         """
         Start the page server.
@@ -1948,7 +2109,7 @@ class NeonPageserver(PgProtocol):
         assert self.running is False
 
         self.env.neon_cli.pageserver_start(
-            self.id, overrides=overrides, extra_env_vars=extra_env_vars
+            self.id, overrides=overrides, extra_env_vars=extra_env_vars, register=register
         )
         self.running = True
         return self
@@ -2914,6 +3075,7 @@ class Endpoint(PgProtocol):
 
         # Write it back updated
         with open(config_path, "w") as file:
+            log.info(json.dumps(dict(data_dict, **kwargs)))
             json.dump(dict(data_dict, **kwargs), file, indent=4)
 
     # Mock the extension part of spec passed from control plane for local testing
@@ -3486,6 +3648,10 @@ def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
     return _get_test_dir(request, top_output_dir, "overlay-")
 
 
+def get_shared_snapshot_dir_path(top_output_dir: Path, snapshot_name: str) -> Path:
+    return top_output_dir / "shared-snapshots" / snapshot_name
+
+
 def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
     return get_test_output_dir(request, top_output_dir) / "repo"
 
@@ -3532,6 +3698,75 @@ def test_output_dir(
     allure_attach_from_dir(test_dir)
 
 
+class FileAndThreadLock:
+    def __init__(self, path: Path):
+        self.path = path
+        self.thread_lock = threading.Lock()
+        self.fd: Optional[int] = None
+
+    def __enter__(self):
+        fd = os.open(self.path, os.O_CREAT | os.O_WRONLY)
+        # self.fd is shared state: only assign / flock it while holding thread_lock
+        self.thread_lock.acquire()
+        self.fd = fd
+        flock(self.fd, LOCK_EX)
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        assert self.fd is not None
+        assert self.thread_lock.locked()  # ... by us
+        flock(self.fd, LOCK_UN)
+        os.close(self.fd)  # close + reset before releasing, so the next owner can't race us
+        self.fd = None
+        self.thread_lock.release()
+
+
+class SnapshotDirLocked:
+    def __init__(self, parent: SnapshotDir):
+        self._parent = parent
+
+    def is_initialized(self):
+        # TODO: in the future, take a `tag` as argument and store it in the marker in set_initialized.
+        # Then, in this function, compare marker file contents with the tag to invalidate the snapshot if the tag changed.
+        return self._parent._marker_file_path.exists()
+
+    def set_initialized(self):
+        self._parent._marker_file_path.write_text("")
+
+    @property
+    def path(self) -> Path:
+        return self._parent._path / "snapshot"
+
+
+class SnapshotDir:
+    _path: Path
+
+    def __init__(self, path: Path):
+        self._path = path
+        assert self._path.is_dir()
+        self._lock = FileAndThreadLock(self._lock_file_path)
+
+    @property
+    def _lock_file_path(self) -> Path:
+        return self._path / "initializing.flock"
+
+    @property
+    def _marker_file_path(self) -> Path:
+        return self._path / "initialized.marker"
+
+    def __enter__(self) -> SnapshotDirLocked:
+        self._lock.__enter__()
+        return SnapshotDirLocked(self)
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        self._lock.__exit__(exc_type, exc_value, exc_traceback)
+
+
+def shared_snapshot_dir(top_output_dir, ident: str) -> SnapshotDir:
+    snapshot_dir_path = get_shared_snapshot_dir_path(top_output_dir, ident)
+    snapshot_dir_path.mkdir(exist_ok=True, parents=True)
+    return SnapshotDir(snapshot_dir_path)
+
+
 @pytest.fixture(scope="function")
 def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]:
     """
@@ -3541,7 +3776,7 @@ def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[
     The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc).
     """
 
-    if os.getenv("NEON_ENV_BUILDER_FROM_REPO_DIR_USE_OVERLAYFS") is None:
+    if os.getenv("NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS") is None:
         return None
 
     overlay_dir = get_test_overlay_dir(request, top_output_dir)
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index cfa2a2674d..340cc9e9e3 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -20,6 +20,7 @@ from fixtures.utils import Fn
 class PageserverApiException(Exception):
     def __init__(self, message, status_code: int):
         super().__init__(message)
+        self.message = message
         self.status_code = status_code
 
 
@@ -261,12 +262,18 @@ class PageserverHttpClient(requests.Session):
         )
         self.verbose_error(res)
 
-    def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
+    def tenant_detach(self, tenant_id: TenantId, detach_ignored=False, timeout_secs=None):
         params = {}
         if detach_ignored:
             params["detach_ignored"] = "true"
 
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
+        kwargs = {}
+        if timeout_secs is not None:
+            kwargs["timeout"] = timeout_secs
+
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params, **kwargs
+        )
         self.verbose_error(res)
 
     def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool):
@@ -526,6 +533,17 @@ class PageserverHttpClient(requests.Session):
         res_json = res.json()
         assert res_json is None
 
+    def timeline_preserve_initdb_archive(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ):
+        log.info(
+            f"Requesting initdb archive preservation for tenant {tenant_id} and timeline {timeline_id}"
+        )
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive",
+        )
+        self.verbose_error(res)
+
     def timeline_get_lsn_by_timestamp(
         self,
         tenant_id: Union[TenantId, TenantShardId],
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
new file mode 100644
index 0000000000..bbb4ccee5b
--- /dev/null
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -0,0 +1,85 @@
+import concurrent.futures
+import time
+from typing import Any, Callable, Dict, Tuple
+
+import fixtures.pageserver.remote_storage
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+)
+from fixtures.pageserver.utils import (
+    wait_until_tenant_state,
+)
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
+from fixtures.types import TenantId, TimelineId
+
+
+def single_timeline(
+    neon_env_builder: NeonEnvBuilder,
+    setup_template: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
+    ncopies: int,
+) -> NeonEnv:
+    """
+    Create `ncopies` duplicates of a template tenant that has a single timeline.
+    """
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+
+    env = neon_env_builder.init_start()
+
+    remote_storage = env.pageserver_remote_storage
+    assert isinstance(remote_storage, LocalFsStorage)
+
+    ps_http = env.pageserver.http_client()
+    # clean up the useless default tenant
+    ps_http.tenant_delete(env.initial_tenant)
+
+    log.info("invoking callback to create template tenant")
+    template_tenant, template_timeline, template_config = setup_template(env)
+    log.info(
+        f"template tenant is template_tenant={template_tenant} template_timeline={template_timeline}"
+    )
+
+    log.info("detach template tenant form pageserver")
+    env.pageserver.tenant_detach(template_tenant)
+    env.pageserver.allowed_errors.append(
+        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        ".*Dropped remote consistent LSN updates.*",
+    )
+
+    log.info(f"duplicating template tenant {ncopies} times in S3")
+    tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
+
+    log.info("attach duplicated tenants to pageserver")
+    # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
+    # However, on-demand downloads are quite slow ATM.
+    # => do the on-demand downloads in Python.
+    assert ps_http.tenant_list() == []
+    # make the attach fail after it created enough on-disk state to retry loading
+    # the tenant next startup, but before it can start background loops that would start download
+    ps_http.configure_failpoints(("attach-before-activate", "return"))
+    env.pageserver.allowed_errors.append(
+        ".*attach failed, setting tenant state to Broken: attach-before-activate.*"
+    )
+
+    def attach_broken(tenant):
+        env.pageserver.tenant_attach(
+            tenant,
+            config=template_config.copy(),
+        )
+        time.sleep(0.1)
+        wait_until_tenant_state(ps_http, tenant, "Broken", 10)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
+        list(executor.map(attach_broken, tenants))  # drain the iterator so worker errors are raised
+
+    env.pageserver.stop(
+        immediate=True
+    )  # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout
+    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
+    log.info("python-side on-demand download the layer files into local tenant dir")
+    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
+        env, tenant_timelines
+    )
+
+    return env
diff --git a/test_runner/fixtures/pageserver/remote_storage.py b/test_runner/fixtures/pageserver/remote_storage.py
new file mode 100644
index 0000000000..e6cd9b4614
--- /dev/null
+++ b/test_runner/fixtures/pageserver/remote_storage.py
@@ -0,0 +1,116 @@
+import concurrent.futures
+import os
+import queue
+import shutil
+import threading
+from pathlib import Path
+from typing import Any, List, Tuple
+
+from fixtures.neon_fixtures import NeonEnv, Pagectl
+from fixtures.pageserver.types import (
+    InvalidFileName,
+    parse_layer_file_name,
+)
+from fixtures.remote_storage import LocalFsStorage
+from fixtures.types import TenantId, TimelineId
+
+
+def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId):
+    remote_storage = env.pageserver_remote_storage
+    assert isinstance(remote_storage, LocalFsStorage)
+
+    src_timelines_dir: Path = remote_storage.tenant_path(template_tenant) / "timelines"
+    assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory"
+
+    assert isinstance(remote_storage, LocalFsStorage)
+    dst_timelines_dir: Path = remote_storage.tenant_path(new_tenant) / "timelines"
+    dst_timelines_dir.parent.mkdir(parents=False, exist_ok=False)
+    dst_timelines_dir.mkdir(parents=False, exist_ok=False)
+
+    for tl in src_timelines_dir.iterdir():
+        src_tl_dir = src_timelines_dir / tl.name
+        assert src_tl_dir.is_dir(), f"{src_tl_dir} is not a directory"
+        dst_tl_dir = dst_timelines_dir / tl.name
+        dst_tl_dir.mkdir(parents=False, exist_ok=False)
+        for file in tl.iterdir():
+            shutil.copy2(file, dst_tl_dir)
+            if "__" in file.name:
+                Pagectl(env).raw_cli(
+                    [
+                        "layer",
+                        "rewrite-summary",
+                        str(dst_tl_dir / file.name),
+                        "--new-tenant-id",
+                        str(new_tenant),
+                    ]
+                )
+            else:
+                # index_part etc need no patching
+                pass
+    return None
+
+
+def duplicate_tenant(env: NeonEnv, template_tenant: TenantId, ncopies: int) -> List[TenantId]:
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+
+    def work(tenant_id):
+        duplicate_one_tenant(env, template_tenant, tenant_id)
+
+    new_tenants: List[TenantId] = [TenantId.generate() for _ in range(0, ncopies)]
+    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
+        list(executor.map(work, new_tenants))  # drain the iterator so worker errors are raised
+    return new_tenants
+
+
+def local_layer_name_from_remote_name(remote_name: str) -> str:
+    try:
+        return parse_layer_file_name(remote_name).to_str()
+    except InvalidFileName as e:
+        comps = remote_name.rsplit("-", 1)
+        if len(comps) == 1:
+            raise InvalidFileName("no generation suffix found") from e
+        else:
+            assert len(comps) == 2
+            layer_file_name, _generation = comps
+            try:
+                return parse_layer_file_name(layer_file_name).to_str()
+            except InvalidFileName:
+                raise
+
+
+def copy_all_remote_layer_files_to_local_tenant_dir(
+    env: NeonEnv, tenant_timelines: List[Tuple[TenantId, TimelineId]]
+):
+    remote_storage = env.pageserver_remote_storage
+    assert isinstance(remote_storage, LocalFsStorage)
+    work: queue.Queue[Any] = queue.Queue()
+    for tenant, timeline in tenant_timelines:
+        remote_timeline_path = remote_storage.timeline_path(tenant, timeline)
+        local_timeline_path = env.pageserver.timeline_dir(tenant, timeline)
+        local_timeline_path.mkdir(parents=True, exist_ok=True)
+        downloads = {}
+        for remote_layer in remote_timeline_path.glob("*__*"):
+            local_name = local_layer_name_from_remote_name(remote_layer.name)
+            assert local_name not in downloads, "remote storage must have had split brain"
+            downloads[local_name] = remote_layer
+        for local_name, remote_path in downloads.items():
+            work.put((remote_path, local_timeline_path / local_name))
+
+    def copy_layer_worker(queue):
+        while True:
+            item = queue.get()
+            if item is None:
+                return
+            remote_path, local_path = item
+            # not copy2, so it looks like a recent download, in case that's relevant to e.g. eviction
+            shutil.copy(remote_path, local_path, follow_symlinks=False)
+
+    workers = []
+    n_threads = os.cpu_count() or 1
+    for _ in range(0, n_threads):
+        w = threading.Thread(target=copy_layer_worker, args=[work])
+        workers.append(w)
+        w.start()
+        work.put(None)
+    for w in workers:
+        w.join()
diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/types.py
index b3c1174b35..72fa30a2f2 100644
--- a/test_runner/fixtures/pageserver/types.py
+++ b/test_runner/fixtures/pageserver/types.py
@@ -31,10 +31,10 @@ class DeltaLayerFileName:
     key_start: Key
     key_end: Key
 
-    def is_l0(self):
+    def is_l0(self) -> bool:
         return self.key_start == KEY_MIN and self.key_end == KEY_MAX
 
-    def to_str(self):
+    def to_str(self) -> str:
         ret = f"{self.key_start.as_int():036X}-{self.key_end.as_int():036X}__{self.lsn_start.as_int():016X}-{self.lsn_end.as_int():016X}"
         assert self == parse_layer_file_name(ret)
         return ret
@@ -107,7 +107,7 @@ def parse_layer_file_name(file_name: str) -> LayerFileName:
     except InvalidFileName:
         pass
 
-    raise ValueError()
+    raise InvalidFileName("neither image nor delta layer")
 
 
 def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn):
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index a6c4b8e930..6b2651e447 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -63,6 +63,14 @@ def wait_for_upload(
     )
 
 
+def _tenant_in_expected_state(tenant_info: Dict[str, Any], expected_state: str):
+    if tenant_info["state"]["slug"] == expected_state:
+        return True
+    if tenant_info["state"]["slug"] == "Broken":
+        raise RuntimeError(f"tenant became Broken, not {expected_state}")
+    return False
+
+
 def wait_until_tenant_state(
     pageserver_http: PageserverHttpClient,
     tenant_id: TenantId,
@@ -80,10 +88,8 @@ def wait_until_tenant_state(
             log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
         else:
             log.debug(f"Tenant {tenant_id} data: {tenant}")
-            if tenant["state"]["slug"] == expected_state:
+            if _tenant_in_expected_state(tenant, expected_state):
                 return tenant
-            if tenant["state"]["slug"] == "Broken":
-                raise RuntimeError(f"tenant became Broken, not {expected_state}")
 
         time.sleep(period)
 
@@ -92,6 +98,34 @@ def wait_until_tenant_state(
     )
 
 
+def wait_until_all_tenants_state(
+    pageserver_http: PageserverHttpClient,
+    expected_state: str,
+    iterations: int,
+    period: float = 1.0,
+    http_error_ok: bool = True,
+):
+    """
+    Poll tenant_list() until every listed tenant is in `expected_state`; raise on timeout.
+    """
+    for _ in range(iterations):
+        try:
+            tenants = pageserver_http.tenant_list()
+        except Exception as e:
+            if http_error_ok:
+                log.debug(f"Failed to list tenants: {e}")
+            else:
+                raise
+        else:
+            if all(map(lambda tenant: _tenant_in_expected_state(tenant, expected_state), tenants)):
+                return
+        time.sleep(period)
+
+    raise Exception(
+        f"Not all tenants became {expected_state} within {iterations * period} seconds"
+    )
+
+
 def wait_until_timeline_state(
     pageserver_http: PageserverHttpClient,
     tenant_id: Union[TenantId, TenantShardId],
@@ -337,8 +371,24 @@ def tenant_delete_wait_completed(
     pageserver_http: PageserverHttpClient,
     tenant_id: TenantId,
     iterations: int,
+    ignore_errors: bool = False,
 ):
-    pageserver_http.tenant_delete(tenant_id=tenant_id)
+    if not ignore_errors:
+        pageserver_http.tenant_delete(tenant_id=tenant_id)
+    else:
+        interval = 0.5
+
+        def delete_request_sent():
+            try:
+                pageserver_http.tenant_delete(tenant_id=tenant_id)
+            except PageserverApiException as e:
+                log.debug(e)
+                if e.status_code == 404:
+                    return
+            except Exception as e:
+                log.debug(e)
+
+        wait_until(iterations, interval=interval, func=delete_request_sent)
     wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
 
 
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 53350138dd..d8ac92abb6 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -8,7 +8,7 @@ from _pytest.python import Metafunc
 from fixtures.pg_version import PgVersion
 
 """
-Dynamically parametrize tests by Postgres version and build type (debug/release/remote)
+Dynamically parametrize tests by Postgres version, build type (debug/release/remote), and possibly by other parameters
 """
 
 
@@ -31,11 +31,12 @@ def build_type(request: FixtureRequest) -> Optional[str]:
     return None
 
 
-def pytest_generate_tests(metafunc: Metafunc):
-    # Do not parametrize performance tests yet, we need to prepare grafana charts first
-    if "test_runner/performance" in metafunc.definition._nodeid:
-        return
+@pytest.fixture(scope="function", autouse=True)
+def pageserver_virtual_file_io_engine(request: FixtureRequest) -> Optional[str]:
+    return None
 
+
+def pytest_generate_tests(metafunc: Metafunc):
     if (v := os.environ.get("DEFAULT_PG_VERSION")) is None:
         pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET]
     else:
@@ -46,5 +47,12 @@ def pytest_generate_tests(metafunc: Metafunc):
     else:
         build_types = [bt.lower()]
 
-    metafunc.parametrize("build_type", build_types)
-    metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))
+    # Do not parametrize performance tests yet by Postgres version or build type, we need to prepare grafana charts first
+    if "test_runner/performance" not in metafunc.definition._nodeid:
+        metafunc.parametrize("build_type", build_types)
+        metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))
+
+    # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring`
+    # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics
+    if (io_engine := os.environ.get("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
+        metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index cda788b2a4..91f33e1196 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -397,3 +397,36 @@ def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
     }
     """
     pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr])
+
+
+def humantime_to_ms(humantime: str) -> float:
+    """
+    Converts Rust humantime's output string to milliseconds.
+
+    humantime_to_ms("1h 1ms 406us") -> 3600001.406
+    """
+
+    unit_multiplier_map = {
+        "ns": 1e-6,
+        "us": 1e-3,
+        "ms": 1,
+        "s": 1e3,
+        "m": 1e3 * 60,
+        "h": 1e3 * 60 * 60,
+    }
+    matcher = re.compile(rf"^(\d+)({'|'.join(unit_multiplier_map.keys())})$")
+    total_ms = 0.0
+
+    if humantime == "0":
+        return total_ms
+
+    for item in humantime.split():
+        if (match := matcher.search(item)) is not None:
+            n, unit = match.groups()
+            total_ms += int(n) * unit_multiplier_map[unit]
+        else:
+            raise ValueError(
+                f"can't parse '{item}' (from string '{humantime}'), known units are {', '.join(unit_multiplier_map.keys())}."
+            )
+
+    return round(total_ms, 3)
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index 30def1194d..f29a6cbf3c 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -21,12 +21,21 @@ class Workload:
     - reads, checking we get the right data (`validate`)
     """
 
-    def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
+    def __init__(
+        self,
+        env: NeonEnv,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        branch_name: Optional[str] = None,
+    ):
         self.env = env
         self.tenant_id = tenant_id
         self.timeline_id = timeline_id
         self.table = "foo"
 
+        # By default, use the default branch name for initial tenant in NeonEnv
+        self.branch_name = branch_name or "main"
+
         self.expect_rows = 0
         self.churn_cursor = 0
 
@@ -35,7 +44,7 @@ class Workload:
     def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
         if self._endpoint is None:
             self._endpoint = self.env.endpoints.create(
-                "main",
+                self.branch_name,
                 tenant_id=self.tenant_id,
                 pageserver_id=pageserver_id,
                 endpoint_id="ep-workload",
diff --git a/test_runner/performance/pageserver/README.md b/test_runner/performance/pageserver/README.md
new file mode 100644
index 0000000000..fdd09cd946
--- /dev/null
+++ b/test_runner/performance/pageserver/README.md
@@ -0,0 +1,16 @@
+How to reproduce benchmark results / run these benchmarks interactively.
+
+1. Get an EC2 instance with Instance Store. Use the same instance type as used for the benchmark run.
+2. Mount the Instance Store => `neon.git/scripts/ps_ec2_setup_instance_store`
+3. Use a pytest command line (see other READMEs further up in the pytest hierarchy).
+
+For tests that take a long time to set up / consume a lot of storage space,
+we use the test suite's repo_dir snapshotting functionality (`from_repo_dir`).
+It supports mounting snapshots using overlayfs, which improves iteration time.
+
+Here's a full command line.
+
+```
+RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release \
+    ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+```
diff --git a/test_runner/performance/pageserver/__init__.py b/test_runner/performance/pageserver/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test_runner/performance/pageserver/interactive/__init__.py b/test_runner/performance/pageserver/interactive/__init__.py
new file mode 100644
index 0000000000..29644c240e
--- /dev/null
+++ b/test_runner/performance/pageserver/interactive/__init__.py
@@ -0,0 +1,8 @@
+"""
+Tests that aren't really tests or benchmarks.
+
+They're intended for the case where we want to standardize & automate setup,
+but then debug a performance problem interactively.
+It's kind of an abuse of the test framework, but, it's our only tool right
+now to automate a complex test bench setup.
+"""
diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
new file mode 100644
index 0000000000..3fb28ace46
--- /dev/null
+++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
@@ -0,0 +1,79 @@
+import os
+import pdb
+
+import fixtures.pageserver.many_tenants as many_tenants
+import pytest
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    last_flush_lsn_upload,
+)
+
+from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
+
+"""
+Usage:
+DEFAULT_PG_VERSION=15 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \
+    ./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py
+"""
+
+
+@pytest.mark.skipif(
+    os.environ.get("INTERACTIVE", "false") != "true",
+    reason="test is for interactive use only",
+)
+def test_many_small_tenants(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    _env = setup_env(neon_env_builder, 2)  # vary this to the desired number of tenants
+    _pg_bin = pg_bin
+
+    # drop into pdb so that we can debug pageserver interactively, use pdb here
+    # For example, to interactively examine pageserver startup behavior, call
+    #   _env.pageserver.stop(immediate=True)
+    #   _env.pageserver.start()
+    # from the pdb shell.
+    pdb.set_trace()
+
+
+def setup_env(
+    neon_env_builder: NeonEnvBuilder,
+    n_tenants: int,
+) -> NeonEnv:
+    def setup_template(env: NeonEnv):
+        # create our template tenant
+        config = {
+            "gc_period": "0s",
+            "checkpoint_timeout": "10 years",
+            "compaction_period": "20 s",
+            "compaction_threshold": 10,
+            "compaction_target_size": 134217728,
+            "checkpoint_distance": 268435456,
+            "image_creation_threshold": 3,
+        }
+        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+        env.pageserver.tenant_detach(template_tenant)
+        env.pageserver.allowed_errors.append(
+            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+            ".*Dropped remote consistent LSN updates.*",
+        )
+        env.pageserver.tenant_attach(template_tenant, config)
+        ep = env.endpoints.create_start("main", tenant_id=template_tenant)
+        ep.safe_psql("create table foo(b text)")
+        for _ in range(0, 8):
+            ep.safe_psql("insert into foo(b) values ('some text')")
+            last_flush_lsn_upload(env, ep, template_tenant, template_timeline)
+        ep.stop_and_destroy()
+        return (template_tenant, template_timeline, config)
+
+    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+        return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
+
+    env = neon_env_builder.build_and_use_snapshot(f"many-small-tenants-{n_tenants}", doit)
+
+    env.start()
+    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
+
+    return env
diff --git a/test_runner/performance/pageserver/pagebench/__init__.py b/test_runner/performance/pageserver/pagebench/__init__.py
new file mode 100644
index 0000000000..9f5e45c0a0
--- /dev/null
+++ b/test_runner/performance/pageserver/pagebench/__init__.py
@@ -0,0 +1,10 @@
+"""
+Pagebench-based performance regression tests.
+
+The defining characteristic of tests in this sub-directory is that they
+are component-level tests, i.e., they exercise pageserver directly using `pagebench`
+instead of benchmarking the full stack.
+
+See https://github.com/neondatabase/neon/issues/5771
+for the context in which this was developed.
+"""
diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
new file mode 100644
index 0000000000..1ed7e577b9
--- /dev/null
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -0,0 +1,210 @@
+import json
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+import fixtures.pageserver.many_tenants as many_tenants
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    wait_for_last_flush_lsn,
+)
+from fixtures.utils import get_scale_for_db, humantime_to_ms
+
+from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
+
+
+# For reference, the space usage of the snapshots:
+# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots
+# 137G    /instance_store/test_output/shared-snapshots
+# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/*
+# 1.8G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13
+# 1.1G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6
+# 8.5G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13
+# 5.1G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6
+# 76G     /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13
+# 46G     /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6
+@pytest.mark.parametrize("duration", [30])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]])
+@pytest.mark.parametrize("n_tenants", [1, 10])
+@pytest.mark.timeout(
+    10000
+)  # TODO: this value is just "a really high number"; have this per instance type
+def test_pageserver_max_throughput_getpage_at_latest_lsn(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    n_tenants: int,
+    pgbench_scale: int,
+    duration: int,
+):
+    def record(metric, **kwargs):
+        zenbenchmark.record(
+            metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs
+        )
+
+    params: Dict[str, Tuple[Any, Dict[str, Any]]] = {}
+
+    # params from fixtures
+    params.update(
+        {
+            "n_tenants": (n_tenants, {"unit": ""}),
+            "pgbench_scale": (pgbench_scale, {"unit": ""}),
+            "duration": (duration, {"unit": "s"}),
+        }
+    )
+
+    # configure cache sizes like in prod
+    page_cache_size = 16384
+    max_file_descriptors = 500000
+    neon_env_builder.pageserver_config_override = (
+        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}"
+    )
+    params.update(
+        {
+            "pageserver_config_override.page_cache_size": (
+                page_cache_size * 8192,
+                {"unit": "byte"},
+            ),
+            "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+        }
+    )
+
+    for param, (value, kwargs) in params.items():
+        record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
+    env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale)
+    run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)
+
+
+def run_benchmark_max_throughput_latest_lsn(
+    env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
+):
+    """
+    Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`.
+    """
+
+    ps_http = env.pageserver.http_client()
+    cmd = [
+        str(env.neon_binpath / "pagebench"),
+        "get-page-latest-lsn",
+        "--mgmt-api-endpoint",
+        ps_http.base_url,
+        "--page-service-connstring",
+        env.pageserver.connstr(password=None),
+        "--runtime",
+        f"{duration_secs}s",
+        # don't specify the targets explicitly, let pagebench auto-discover them
+    ]
+    log.info(f"command: {' '.join(cmd)}")
+    basepath = pg_bin.run_capture(cmd, with_command_header=False)
+    results_path = Path(basepath + ".stdout")
+    log.info(f"Benchmark results at: {results_path}")
+
+    with open(results_path, "r") as f:
+        results = json.load(f)
+    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+
+    total = results["total"]
+
+    metric = "request_count"
+    record(
+        metric,
+        metric_value=total[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "latency_mean"
+    record(
+        metric,
+        metric_value=humantime_to_ms(total[metric]),
+        unit="ms",
+        report=MetricReport.LOWER_IS_BETTER,
+    )
+
+    metric = "latency_percentiles"
+    for k, v in total[metric].items():
+        record(
+            f"{metric}.{k}",
+            metric_value=humantime_to_ms(v),
+            unit="ms",
+            report=MetricReport.LOWER_IS_BETTER,
+        )
+
+
+def setup_pageserver_with_pgbench_tenants(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+    n_tenants: int,
+    scale: int,
+) -> NeonEnv:
+    """
+    Utility function to set up a pageserver with a given number of identical tenants.
+    Each tenant is a pgbench tenant, initialized to a certain scale, and treated afterwards
+    with a repeat application of (pgbench simple-update workload, checkpoint, compact).
+    """
+
+    def setup_template(env: NeonEnv):
+        # use a config that makes production of on-disk state timing-insensitive
+        # as we ingest data into the tenant.
+        config = {
+            "gc_period": "0s",  # disable periodic gc
+            "checkpoint_timeout": "10 years",
+            "compaction_period": "0s",  # disable periodic compaction
+            "compaction_threshold": 10,
+            "compaction_target_size": 134217728,
+            "checkpoint_distance": 268435456,
+            "image_creation_threshold": 3,
+        }
+        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+        env.pageserver.tenant_detach(template_tenant)
+        env.pageserver.allowed_errors.append(
+            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+            ".*Dropped remote consistent LSN updates.*",
+        )
+        env.pageserver.tenant_attach(template_tenant, config)
+        ps_http = env.pageserver.http_client()
+        with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
+            pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
+            wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+            ps_http.timeline_checkpoint(template_tenant, template_timeline)
+            ps_http.timeline_compact(template_tenant, template_timeline)
+            for _ in range(
+                0, 17
+            ):  # some prime number to avoid potential resonances with the "_threshold" variables from the config
+                # the L0s produced by this appear to have size ~5MiB
+                num_txns = 10_000
+                pg_bin.run_capture(
+                    ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()]
+                )
+                wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+                ps_http.timeline_checkpoint(template_tenant, template_timeline)
+                ps_http.timeline_compact(template_tenant, template_timeline)
+        # for reference, the output at scale=6 looked like so (306M total)
+        # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
+        # total 306M
+        # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
+        # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
+        #  33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
+        #  36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
+        #  16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
+        # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
+        # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
+        # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
+        # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
+        # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
+
+        return (template_tenant, template_timeline, config)
+
+    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+        return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
+
+    env = neon_env_builder.build_and_use_snapshot(
+        f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit
+    )
+    env.start()
+    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
+    return env
diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py
new file mode 100644
index 0000000000..45eb652362
--- /dev/null
+++ b/test_runner/performance/pageserver/util.py
@@ -0,0 +1,29 @@
+"""
+Utilities used by all code in this sub-directory
+"""
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.pageserver.utils import wait_until_all_tenants_state
+
+
+def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
+    """
+    Waits via wait_until_all_tenants_state(..., "Active", ...), then asserts no historic layer is remote.
+    """
+    ps_http = env.pageserver.http_client()
+
+    log.info("wait for all tenants to become active")
+    wait_until_all_tenants_state(
+        ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False
+    )
+
+    # ensure all layers are resident for predictable performance
+    tenants = [info["id"] for info in ps_http.tenant_list()]
+    for tenant in tenants:
+        for timeline in ps_http.tenant_status(tenant)["timelines"]:
+            info = ps_http.layer_map_info(tenant, timeline)
+            for layer in info.historic_layers:
+                assert not layer.remote
+
+    log.info("ready")
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index f9d6d0a934..d5d70951be 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -7,11 +7,13 @@ from typing import List, Optional
 
 import pytest
 import toml
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
     PgBin,
 )
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_for_last_record_lsn,
@@ -136,6 +138,7 @@ def test_create_snapshot(
     for sk in env.safekeepers:
         sk.stop()
     env.pageserver.stop()
+    env.attachment_service.stop()
 
     # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
     compatibility_snapshot_dir = (
@@ -224,11 +227,17 @@ def test_forward_compatibility(
 
     try:
         neon_env_builder.num_safekeepers = 3
+        neon_local_binpath = neon_env_builder.neon_binpath
         env = neon_env_builder.from_repo_dir(
             compatibility_snapshot_dir / "repo",
             neon_binpath=compatibility_neon_bin,
             pg_distrib_dir=compatibility_postgres_distrib_dir,
         )
+
+        # Use current neon_local even though we're using old binaries for
+        # everything else: our test code is written for latest CLI args.
+        env.neon_local_binpath = neon_local_binpath
+
         neon_env_builder.start()
 
         check_neon_works(
@@ -269,14 +278,20 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
     timeline_id = env.initial_timeline
     pg_version = env.pg_version
 
-    # Delete all files from local_fs_remote_storage except initdb.tar.zst,
+    try:
+        pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id)
+    except PageserverApiException as e:
+        # Allow the error as we might be running the old pageserver binary
+        log.info(f"Got allowed error: '{e}'")
+
+    # Delete all files from local_fs_remote_storage except initdb-preserved.tar.zst (or initdb.tar.zst),
     # the file is required for `timeline_create` with `existing_initdb_timeline_id`.
     #
     # TODO: switch to Path.walk() in Python 3.12
     # for dirpath, _dirnames, filenames in (repo_dir / "local_fs_remote_storage").walk():
     for dirpath, _dirnames, filenames in os.walk(repo_dir / "local_fs_remote_storage"):
         for filename in filenames:
-            if filename != "initdb.tar.zst":
+            if filename != "initdb-preserved.tar.zst" and filename != "initdb.tar.zst":
                 (Path(dirpath) / filename).unlink()
 
     timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)
diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index 01aeb88bca..7174487e68 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -248,8 +248,15 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
     # We don't have compute_ctl, so here, so create neon_superuser here manually
     cur.execute("CREATE ROLE neon_superuser NOLOGIN CREATEDB CREATEROLE")
 
-    with pytest.raises(psycopg2.InternalError):
-        cur.execute("ALTER ROLE neon_superuser LOGIN")
+    # Contrary to popular belief, being superman does not make you superuser
+    cur.execute("CREATE ROLE superman LOGIN NOSUPERUSER PASSWORD 'jungle_man'")
+
+    with ddl.pg.cursor(user="superman", password="jungle_man") as superman_cur:
+        # Only real SUPERUSERs may ALTER neon_superuser; superman is NOSUPERUSER, so this must fail
+        with pytest.raises(psycopg2.InternalError):
+            superman_cur.execute("ALTER ROLE neon_superuser LOGIN")
+
+    cur.execute("ALTER ROLE neon_superuser LOGIN")
 
     with pytest.raises(psycopg2.InternalError):
         cur.execute("CREATE DATABASE trololobus WITH OWNER neon_superuser")
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 70c3b77516..6a4f0edbea 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -2,7 +2,7 @@ import enum
 import time
 from collections import Counter
 from dataclasses import dataclass
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Iterable, Tuple
 
 import pytest
 import toml
@@ -121,17 +121,7 @@ class EvictionEnv:
         }
 
     def count_layers_per_tenant(self, pageserver: NeonPageserver) -> Dict[TenantId, int]:
-        ret: Counter[TenantId] = Counter()
-
-        for tenant_id, timeline_id in self.timelines:
-            timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
-            assert timeline_dir.exists()
-            for file in timeline_dir.iterdir():
-                if "__" not in file.name:
-                    continue
-                ret[tenant_id] += 1
-
-        return dict(ret)
+        return count_layers_per_tenant(pageserver, self.timelines)
 
     def warm_up_tenant(self, tenant_id: TenantId):
         """
@@ -199,6 +189,22 @@ class EvictionEnv:
         wait_until(10, 1, statvfs_called)
 
 
+def count_layers_per_tenant(
+    pageserver: NeonPageserver, timelines: Iterable[Tuple[TenantId, TimelineId]]
+) -> Dict[TenantId, int]:
+    ret: Counter[TenantId] = Counter()
+
+    for tenant_id, timeline_id in timelines:
+        timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
+        assert timeline_dir.exists()
+        for file in timeline_dir.iterdir():
+            if "__" not in file.name:
+                continue
+            ret[tenant_id] += 1
+
+    return dict(ret)
+
+
 def human_bytes(amt: float) -> str:
     suffixes = ["", "Ki", "Mi", "Gi"]
 
@@ -243,21 +249,7 @@ def _eviction_env(
 
     timelines = []
     for scale in pgbench_scales:
-        tenant_id, timeline_id = env.neon_cli.create_tenant(
-            conf={
-                "gc_period": "0s",
-                "compaction_period": "0s",
-                "checkpoint_distance": f"{layer_size}",
-                "image_creation_threshold": "100",
-                "compaction_target_size": f"{layer_size}",
-            }
-        )
-
-        with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
-            pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
-            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-
-        timelines.append((tenant_id, timeline_id))
+        timelines.append(pgbench_init_tenant(layer_size, scale, env, pg_bin))
 
     # stop the safekeepers to avoid on-demand downloads caused by
     # initial logical size calculation triggered by walreceiver connection status
@@ -266,25 +258,13 @@ def _eviction_env(
 
     # after stopping the safekeepers, we know that no new WAL will be coming in
     for tenant_id, timeline_id in timelines:
-        pageserver_http = env.get_tenant_pageserver(tenant_id).http_client()
-
-        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-        wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
-        tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
-        assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
-        assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]
-        pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"])
-
-        layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
-        log.info(f"{layers}")
-        assert (
-            len(layers.historic_layers) >= 10
-        ), "evictions happen at layer granularity, but we often assert at byte-granularity"
+        pgbench_init_lsns[tenant_id] = finish_tenant_creation(env, tenant_id, timeline_id, 10)
 
     eviction_env = EvictionEnv(
         timelines=timelines,
         neon_env=env,
-        pageserver_http=pageserver_http,
+        # this last tenant http client works for num_pageservers=1
+        pageserver_http=env.get_tenant_pageserver(timelines[-1][0]).http_client(),
         layer_size=layer_size,
         pg_bin=pg_bin,
         pgbench_init_lsns=pgbench_init_lsns,
@@ -293,6 +273,49 @@ def _eviction_env(
     return eviction_env
 
 
+def pgbench_init_tenant(
+    layer_size: int, scale: int, env: NeonEnv, pg_bin: PgBin
+) -> Tuple[TenantId, TimelineId]:
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            "checkpoint_distance": f"{layer_size}",
+            "image_creation_threshold": "100",
+            "compaction_target_size": f"{layer_size}",
+        }
+    )
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+
+    return (tenant_id, timeline_id)
+
+
+def finish_tenant_creation(
+    env: NeonEnv,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    min_expected_layers: int,
+) -> Lsn:
+    pageserver_http = env.get_tenant_pageserver(tenant_id).http_client()
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
+    tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
+    assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
+    assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]
+    pgbench_init_lsn = Lsn(tl_info["last_record_lsn"])
+
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    # log.info(f"{layers}")
+    assert (
+        len(layers.historic_layers) >= min_expected_layers
+    ), "evictions happen at layer granularity, but we often assert at byte-granularity"
+
+    return pgbench_init_lsn
+
+
 @pytest.fixture
 def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
     return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=1)
@@ -598,9 +621,82 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
             assert abs_diff < 0.05
 
 
+@pytest.mark.parametrize(
+    "order",
+    [
+        EvictionOrder.ABSOLUTE_ORDER,
+        EvictionOrder.RELATIVE_ORDER_EQUAL,
+        EvictionOrder.RELATIVE_ORDER_SPARE,
+    ],
+)
+def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, order: EvictionOrder):
+    """
+    Create three small tenants first and a single larger tenant last, then evict
+    10% of the on-disk bytes. Assert that with relative order modes, the disk
+    usage based eviction is fairer towards the smaller tenants.
+    """
+    env = neon_env_builder.init_configs()
+    env.start()
+    env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
+
+    # initial_tenant and initial_timeline do not exist
+
+    # create N tenants in the same fashion as EvictionEnv
+    layer_size = 5 * 1024**2
+    timelines = []
+    for scale in [1, 1, 1, 4]:
+        timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale))
+
+    env.neon_cli.safekeeper_stop()
+
+    for (tenant_id, timeline_id), scale in timelines:
+        min_expected_layers = 4 if scale == 1 else 10
+        finish_tenant_creation(env, tenant_id, timeline_id, min_expected_layers)
+
+    tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines))
+    (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, False)
+
+    # cut 10 percent
+    response = env.pageserver.http_client().disk_usage_eviction_run(
+        {"evict_bytes": total_on_disk // 10, "eviction_order": order.config()}
+    )
+    log.info(f"{response}")
+
+    after_tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines))
+
+    ratios = []
+    for i, ((tenant_id, _timeline_id), _scale) in enumerate(timelines):
+        # we expect the oldest to suffer most
+        originally, after = tenant_layers[tenant_id], after_tenant_layers[tenant_id]
+        log.info(f"{i + 1}th tenant went from {originally} -> {after}")
+        ratio = after / originally
+        ratios.append(ratio)
+
+    assert (
+        len(ratios) == 4
+    ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order"
+    log.info(f"{ratios}")
+
+    if order == EvictionOrder.ABSOLUTE_ORDER:
+        # first tenant loses most
+        assert ratios[0] <= ratios[1], "first should lose the most"
+        assert ratios[1] < ratios[2], "second should lose some"
+        assert ratios[1] < 1.0
+        assert ratios[2] <= ratios[3], "third might not lose"
+        assert ratios[3] == 1.0, "tenant created last does not lose"
+    elif order == EvictionOrder.RELATIVE_ORDER_EQUAL:
+        assert all(x < 1.0 for x in ratios), "all tenants lose layers"
+    elif order == EvictionOrder.RELATIVE_ORDER_SPARE:
+        # with different layer sizes and pg versions, there are different combinations
+        assert len([x for x in ratios if x < 1.0]) >= 2, "require 2..4 tenants to lose layers"
+        assert ratios[3] < 1.0, "largest tenant always loses layers"
+    else:
+        raise RuntimeError(f"unimplemented {order}")
+
+
 def poor_mans_du(
     env: NeonEnv,
-    timelines: list[Tuple[TenantId, TimelineId]],
+    timelines: Iterable[Tuple[TenantId, TimelineId]],
     pageserver: NeonPageserver,
     verbose: bool = False,
 ) -> Tuple[int, int, int]:
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index faedf5d944..3519cbbaab 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -163,6 +163,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
     assert endpoint.safe_psql("select count(*) from t") == [(300000,)]
 
+    vanilla_pg.stop()
+
 
 def test_import_from_pageserver_small(
     pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path
diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py
new file mode 100644
index 0000000000..121fa91f66
--- /dev/null
+++ b/test_runner/regress/test_migrations.py
@@ -0,0 +1,37 @@
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_migrations(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_migrations", "empty")
+
+    endpoint = env.endpoints.create("test_migrations")
+    log_path = endpoint.endpoint_path() / "compute.log"
+
+    endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    endpoint.start()
+
+    time.sleep(1)  # Sleep to let migrations run
+
+    with endpoint.cursor() as cur:
+        cur.execute("SELECT id FROM neon_migration.migration_id")
+        migration_id = cur.fetchall()
+        assert migration_id[0][0] == 2
+
+    with open(log_path, "r") as log_file:
+        logs = log_file.read()
+        assert "INFO handle_migrations: Ran 2 migrations" in logs
+
+    endpoint.stop()
+    endpoint.start()
+    time.sleep(1)  # Sleep to let migrations run
+    with endpoint.cursor() as cur:
+        cur.execute("SELECT id FROM neon_migration.migration_id")
+        migration_id = cur.fetchall()
+        assert migration_id[0][0] == 2
+
+    with open(log_path, "r") as log_file:
+        logs = log_file.read()
+        assert "INFO handle_migrations: Ran 0 migrations" in logs
diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py
index 46b72fbca5..8edba49b8a 100644
--- a/test_runner/regress/test_neon_local_cli.py
+++ b/test_runner/regress/test_neon_local_cli.py
@@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail(
     env.neon_cli.endpoint_stop("ep1")
     # ep1 is stopped so create ep2 will succeed
     env.neon_cli.endpoint_start("ep2")
+    # cleanup
+    env.neon_cli.endpoint_stop("ep2")
diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
new file mode 100644
index 0000000000..6be7c114cb
--- /dev/null
+++ b/test_runner/regress/test_neon_superuser.py
@@ -0,0 +1,34 @@
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.pg_version import PgVersion
+
+
+def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_neon_superuser", "empty")
+    endpoint = env.endpoints.create("test_neon_superuser")
+    endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    endpoint.start()
+
+    time.sleep(1)  # Sleep to let migrations run
+
+    with endpoint.cursor() as cur:
+        cur.execute(
+            "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
+        )
+        cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
+        cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")
+
+    with endpoint.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
+        cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')")
+        assert cur.fetchall()[0][0]
+        cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')")
+        assert cur.fetchall()[0][0]
+
+        if pg_version == PgVersion.V16:
+            cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'set')")
+            assert cur.fetchall()[0][0]
+
+        cur.execute("CREATE PUBLICATION pub FOR ALL TABLES")
+        cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'")
diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py
index da2580dbf9..e880445c4d 100644
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -203,6 +203,16 @@ def test_import_at_2bil(
         $$;
         """
     )
+
+    # Also create a multi-XID with members past the 2 billion mark
+    conn2 = endpoint.connect()
+    cur2 = conn2.cursor()
+    cur.execute("INSERT INTO t VALUES ('x')")
+    cur.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;")
+    cur2.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;")
+    cur.execute("COMMIT")
+    cur2.execute("COMMIT")
+
     # A checkpoint writes a WAL record with xl_xid=0. Many other WAL
     # records would have the same effect.
     cur.execute("checkpoint")
@@ -217,4 +227,4 @@ def test_import_at_2bil(
     conn = endpoint.connect()
     cur = conn.cursor()
     cur.execute("SELECT count(*) from t")
-    assert cur.fetchone() == (10000 + 1,)
+    assert cur.fetchone() == (10000 + 1 + 1,)
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 63f6130af5..725ed63d1c 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -499,7 +499,8 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     # and serve clients.
     env.pageserver.stop()  # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP
     env.pageserver.start(
-        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",)
+        overrides=("--pageserver-config-override=control_plane_emergency_mode=true",),
+        register=False,
     )
 
     # The pageserver should provide service to clients
diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py
new file mode 100644
index 0000000000..aecfcdd262
--- /dev/null
+++ b/test_runner/regress/test_pageserver_reconnect.py
@@ -0,0 +1,42 @@
+import threading
+import time
+from contextlib import closing
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, PgBin
+
+
+# Test updating neon.pageserver_connstring setting on the fly.
+#
+# This merely changes some whitespace in the connection string, so
+# this doesn't prove that the new string actually takes effect. But at
+# least the code gets exercised.
+def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_pageserver_restarts")
+    endpoint = env.endpoints.create_start("test_pageserver_restarts")
+    n_reconnects = 1000
+    timeout = 0.01
+    scale = 10
+
+    def run_pgbench(connstr: str):
+        log.info(f"Start a pgbench workload on pg {connstr}")
+        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr])
+
+    thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
+    thread.start()
+
+    with closing(endpoint.connect()) as con:
+        with con.cursor() as c:
+            c.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
+            connstring = c.fetchall()[0][0]
+            for i in range(n_reconnects):
+                time.sleep(timeout)
+                c.execute(
+                    "alter system set neon.pageserver_connstring=%s",
+                    (connstring + (" " * (i % 2)),),
+                )
+                c.execute("select pg_reload_conf()")
+
+    thread.join()
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index c4499196b5..753898f747 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -1,4 +1,6 @@
+import random
 from contextlib import closing
+from typing import Optional
 
 import pytest
 from fixtures.log_helper import log
@@ -141,18 +143,24 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
 # Test that repeatedly kills and restarts the page server, while the
 # safekeeper and compute node keep running.
 @pytest.mark.timeout(540)
-def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str):
+@pytest.mark.parametrize("shard_count", [None, 4])
+def test_pageserver_chaos(
+    neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int]
+):
     if build_type == "debug":
         pytest.skip("times out in debug builds")
 
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     neon_env_builder.enable_scrub_on_exit()
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
 
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
     # these can happen, if we shutdown at a good time. to be fixed as part of #5172.
     message = ".*duplicated L1 layer layer=.*"
-    env.pageserver.allowed_errors.append(message)
+    for ps in env.pageservers:
+        ps.allowed_errors.append(message)
 
     # Use a tiny checkpoint distance, to create a lot of layers quickly.
     # That allows us to stress the compaction and layer flushing logic more.
@@ -192,13 +200,19 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str):
             log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
             assert int(row[0]) < int(row[1])
 
+    # We run "random" kills using a fixed seed, to improve reproducibility if a test
+    # failure is related to a particular order of operations.
+    seed = 0xDEADBEEF
+    rng = random.Random(seed)
+
     # Update the whole table, then immediately kill and restart the pageserver
     for i in range(1, 15):
         endpoint.safe_psql("UPDATE foo set updates = updates + 1")
 
         # This kills the pageserver immediately, to simulate a crash
-        env.pageserver.stop(immediate=True)
-        env.pageserver.start()
+        to_kill = rng.choice(env.pageservers)
+        to_kill.stop(immediate=True)
+        to_kill.start()
 
         # Check that all the updates are visible
         num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0]
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index 521b96779a..293152dd62 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -5,7 +5,10 @@ from typing import Any, Dict, Optional
 import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
-from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed
+from fixtures.pageserver.utils import (
+    assert_prefix_empty,
+    tenant_delete_wait_completed,
+)
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
 from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
@@ -135,6 +138,16 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
             pageserver.stop()
             pageserver.start()
             if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id:
+                # /re-attach call will bump generation: track that in our state in case we do an
+                # "attach in same generation" operation later
+                assert last_state_ps[1] is not None  # latest_attached == pageserver.id implies this
+                # The re-attach API increments generation by exactly one.
+                new_generation = last_state_ps[1] + 1
+                last_state[pageserver.id] = (last_state_ps[0], new_generation)
+                tenants = pageserver.http_client().tenant_list()
+                assert len(tenants) == 1
+                assert tenants[0]["generation"] == new_generation
+
                 log.info("Entering postgres...")
                 workload.churn_rows(rng.randint(128, 256), pageserver.id)
                 workload.validate(pageserver.id)
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index f26d04e2f3..e4219ec7a6 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -2,25 +2,40 @@
 # This file runs pg_regress-based tests.
 #
 from pathlib import Path
+from typing import Optional
 
-from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
+import pytest
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    check_restored_datadir_content,
+)
+from fixtures.remote_storage import s3_storage
 
 
 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
+@pytest.mark.parametrize("shard_count", [None, 4])
 def test_pg_regress(
-    neon_simple_env: NeonEnv,
+    neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     pg_bin,
     capsys,
     base_dir: Path,
     pg_distrib_dir: Path,
+    shard_count: Optional[int],
 ):
-    env = neon_simple_env
+    """
+    :param shard_count: if None, create an unsharded tenant.  Otherwise create a tenant with this
+                        many shards.
+    """
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
-    env.neon_cli.create_branch("test_pg_regress", "empty")
     # Connect to postgres and create a database called "regression".
-    endpoint = env.endpoints.create_start("test_pg_regress")
+    endpoint = env.endpoints.create_start("main")
     endpoint.safe_psql("CREATE DATABASE regression")
 
     # Create some local directories for pg_regress to run in.
@@ -61,22 +76,25 @@ def test_pg_regress(
 
 # Run the PostgreSQL "isolation" tests, in src/test/isolation.
 #
+@pytest.mark.parametrize("shard_count", [None, 4])
 def test_isolation(
-    neon_simple_env: NeonEnv,
+    neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     pg_bin,
     capsys,
     base_dir: Path,
     pg_distrib_dir: Path,
+    shard_count: Optional[int],
 ):
-    env = neon_simple_env
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
-    env.neon_cli.create_branch("test_isolation", "empty")
     # Connect to postgres and create a database called "regression".
     # isolation tests use prepared transactions, so enable them
-    endpoint = env.endpoints.create_start(
-        "test_isolation", config_lines=["max_prepared_transactions=100"]
-    )
+    endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"])
     endpoint.safe_psql("CREATE DATABASE isolation_regression")
 
     # Create some local directories for pg_isolation_regress to run in.
@@ -114,19 +132,24 @@ def test_isolation(
 
 # Run extra Neon-specific pg_regress-based tests. The tests and their
 # schedule file are in the sql_regress/ directory.
+@pytest.mark.parametrize("shard_count", [None, 4])
 def test_sql_regress(
-    neon_simple_env: NeonEnv,
+    neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     pg_bin,
     capsys,
     base_dir: Path,
     pg_distrib_dir: Path,
+    shard_count: Optional[int],
 ):
-    env = neon_simple_env
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
 
-    env.neon_cli.create_branch("test_sql_regress", "empty")
     # Connect to postgres and create a database called "regression".
-    endpoint = env.endpoints.create_start("test_sql_regress")
+    endpoint = env.endpoints.create_start("main")
     endpoint.safe_psql("CREATE DATABASE regression")
 
     # Create some local directories for pg_regress to run in.
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
new file mode 100644
index 0000000000..c16bfc2ec6
--- /dev/null
+++ b/test_runner/regress/test_sharding.py
@@ -0,0 +1,85 @@
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+)
+from fixtures.remote_storage import s3_storage
+from fixtures.types import TimelineId
+from fixtures.workload import Workload
+
+
+def test_sharding_smoke(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test the basic lifecycle of a sharded tenant:
+     - ingested data gets split up
+     - page service reads
+     - timeline creation and deletion
+     - splits
+    """
+
+    shard_count = 4
+    neon_env_builder.num_pageservers = shard_count
+
+    # 1MiB stripes: enable getting some meaningful data distribution without
+    # writing large quantities of data in this test.  The stripe size is given
+    # in number of 8KiB pages.
+    stripe_size = 128
+
+    # Use S3-compatible remote storage so that we can scrub: this test validates
+    # that the scrubber doesn't barf when it sees a sharded tenant.
+    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+    neon_env_builder.enable_scrub_on_exit()
+
+    neon_env_builder.preserve_database_files = True
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size
+    )
+    tenant_id = env.initial_tenant
+
+    pageservers = dict((int(p.id), p) for p in env.pageservers)
+    shards = env.attachment_service.locate(tenant_id)
+
+    def get_sizes():
+        sizes = {}
+        for shard in shards:
+            node_id = int(shard["node_id"])
+            pageserver = pageservers[node_id]
+            sizes[node_id] = pageserver.http_client().tenant_status(shard["shard_id"])[
+                "current_physical_size"
+            ]
+        log.info(f"sizes = {sizes}")
+        return sizes
+
+    # Test that timeline creation works on a sharded tenant
+    timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id)
+
+    # Test that we can write data to a sharded tenant
+    workload = Workload(env, tenant_id, timeline_b, branch_name="branch_b")
+    workload.init()
+
+    sizes_before = get_sizes()
+    workload.write_rows(256)
+
+    # Test that we can read data back from a sharded tenant
+    workload.validate()
+
+    # Validate that the data is spread across pageservers
+    sizes_after = get_sizes()
+    # Our sizes increased when we wrote data
+    assert sum(sizes_after.values()) > sum(sizes_before.values())
+    # That increase is present on all shards
+    assert all(sizes_after[ps.id] > sizes_before[ps.id] for ps in env.pageservers)
+
+    # Validate that timeline list API works properly on all shards
+    for shard in shards:
+        node_id = int(shard["node_id"])
+        pageserver = pageservers[node_id]
+        timelines = set(
+            TimelineId(tl["timeline_id"])
+            for tl in pageserver.http_client().timeline_list(shard["shard_id"])
+        )
+        assert timelines == {env.initial_timeline, timeline_b}
+
+    # TODO: test timeline deletion and tenant deletion (depends on change in attachment_service)
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 7a5b1c0fc2..b4e5a550f3 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -411,9 +411,7 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
     pageserver_http.configure_failpoints((failpoint, "pause"))
 
     def hit_pausable_failpoint_and_later_fail():
-        with pytest.raises(
-            PageserverApiException, match="new timeline \\S+ has invalid disk_consistent_lsn"
-        ):
+        with pytest.raises(PageserverApiException, match="NotFound: tenant"):
             pageserver_http.timeline_create(
                 env.pg_version, env.initial_tenant, env.initial_timeline
             )
@@ -443,8 +441,8 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
     try:
         wait_until(10, 1, has_hit_failpoint)
 
-        # it should start ok, sync up with the stuck creation, then fail because disk_consistent_lsn was not updated
-        # then deletion should fail and set the tenant broken
+        # it should start ok, sync up with the stuck creation, then hang waiting for the timeline
+        # to shut down.
         deletion = Thread(target=start_deletion)
         deletion.start()
 
@@ -573,11 +571,16 @@ def test_tenant_delete_races_timeline_creation(
     ps_http = env.pageserver.http_client()
     tenant_id = env.initial_tenant
 
-    # Sometimes it ends with "InternalServerError(Cancelled", sometimes with "InternalServerError(Operation was cancelled"
+    # When timeline creation is cancelled by tenant deletion, it is during Tenant::shutdown(), and
+    # acting on a shutdown tenant generates a 503 response (if the caller retried, they would
+    # later get a 404 after the tenant is fully deleted).
     CANCELLED_ERROR = (
-        ".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled"
+        ".*POST.*Cancelled request finished successfully status=503 Service Unavailable"
     )
 
+    # This can occur sometimes.
+    CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*"
+
     env.pageserver.allowed_errors.extend(
         [
             # lucky race with stopping from flushing a layer we fail to schedule any uploads
@@ -586,6 +589,9 @@ def test_tenant_delete_races_timeline_creation(
             ".*POST.*/timeline.* request was dropped before completing",
             # Timeline creation runs into this error
             CANCELLED_ERROR,
+            # Timeline deletion can run into this error during deletion
+            CONFLICT_MESSAGE,
+            ".*tenant_delete_handler.*still waiting, taking longer than expected.*",
         ]
     )
 
@@ -621,7 +627,10 @@ def test_tenant_delete_races_timeline_creation(
     ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
 
     def tenant_delete():
-        ps_http.tenant_delete(tenant_id)
+        def tenant_delete_inner():
+            ps_http.tenant_delete(tenant_id)
+
+        wait_until(100, 0.5, tenant_delete_inner)
 
     Thread(target=tenant_delete).start()
 
@@ -638,10 +647,8 @@ def test_tenant_delete_races_timeline_creation(
     ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
 
     iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-    try:
-        tenant_delete_wait_completed(ps_http, tenant_id, iterations)
-    except PageserverApiException:
-        pass
+
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)
 
     # Physical deletion should have happened
     assert_prefix_empty(
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index d548e63cc1..8d5ef4e3c4 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -482,7 +482,7 @@ def test_detach_while_attaching(
     pageserver_http.tenant_detach(tenant_id)
 
     # And re-attach
-    pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
+    pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(5000)")])
 
     env.pageserver.tenant_attach(tenant_id)
 
@@ -681,7 +681,7 @@ def test_detach_while_activating(
     pageserver_http.tenant_detach(tenant_id)
 
     # And re-attach, but stop attach task_mgr task from completing
-    pageserver_http.configure_failpoints([("attach-before-activate", "return(600000)")])
+    pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(600000)")])
     env.pageserver.tenant_attach(tenant_id)
 
     # The tenant is in the Activating state.  This should not block us from
@@ -695,7 +695,7 @@ def test_detach_while_activating(
     ), "Only ignored tenant should be missing"
 
     # Subsequently attaching it again should still work
-    pageserver_http.configure_failpoints([("attach-before-activate", "off")])
+    pageserver_http.configure_failpoints([("attach-before-activate-sleep", "off")])
     env.pageserver.tenant_attach(tenant_id)
     wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
 
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 2ee2d8125a..5164bda470 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -1,3 +1,4 @@
+import concurrent.futures
 import os
 import time
 from contextlib import closing
@@ -7,6 +8,7 @@ from pathlib import Path
 from typing import List
 
 import pytest
+import requests
 from fixtures.log_helper import log
 from fixtures.metrics import (
     PAGESERVER_GLOBAL_METRICS,
@@ -17,7 +19,9 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
 )
-from fixtures.pageserver.utils import timeline_delete_wait_completed
+from fixtures.pageserver.http import PageserverApiException
+from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active
+from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind
 from fixtures.types import Lsn, TenantId
 from fixtures.utils import wait_until
@@ -341,3 +345,78 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder):
     assert (
         tenant_active_count == 1
     ), f"Tenant {tenant_with_empty_timelines} should have metric as active"
+
+
+def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
+    """
+    Probabilistic stress test for the pageserver's handling of tenant requests
+    across a restart. This is intended to catch things like:
+    - Bad response codes during shutdown (e.g. returning 500 instead of 503)
+    - Issues where a tenant is still starting up while we receive a request for it
+    - Issues with interrupting/resuming tenant/timeline creation in shutdown
+    """
+    env = neon_env_builder.init_configs()
+    env.start()
+    tenant_id: TenantId = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # Multiple creation requests which race will generate this error
+    env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*")
+
+    # Tenant creation requests which arrive out of order will generate complaints about
+    # generation numbers out of order.
+    env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+")
+
+    # Our multiple creation requests will advance generation quickly, and when we skip
+    # a generation number we can generate these warnings
+    env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+")
+
+    # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of
+    # an incomplete attach, or some other problem.  In the field this should be rare,
+    # so we allow it to log at WARN, even if it is occasionally a false positive.
+    env.pageserver.allowed_errors.append(".*failed to freeze and flush.*")
+
+    # When we shut down a tenant during a timeline creation, initdb is not cancelled, we wait
+    # for it to complete (since https://github.com/neondatabase/neon/pull/6451).  This means
+    # that shutdown can be delayed by >=1s on debug builds where initdb takes a long time to run.
+    env.pageserver.allowed_errors.append(".*still waiting, taking longer than expected... gate.*")
+
+    def create_bg(delay_ms):
+        time.sleep(delay_ms / 1000.0)
+        try:
+            env.pageserver.tenant_create(tenant_id=tenant_id)
+            env.pageserver.http_client().timeline_create(
+                PgVersion.NOT_SET, tenant_id, new_timeline_id=timeline_id
+            )
+        except PageserverApiException as e:
+            if e.status_code == 409:
+                log.info(f"delay_ms={delay_ms} 409")
+                pass
+            elif e.status_code == 400:
+                if "is less than existing" in e.message:
+                    # We send creation requests very close together in time: it is expected that these
+                    # race, and sometimes higher-generation requests arrive first.  The pageserver rightly
+                    # rejects any attempt to make a generation number go backwards.
+                    pass
+                else:
+                    raise
+            else:
+                raise
+        except requests.exceptions.ConnectionError:
+            # Our requests might arrive during shutdown and be cut off at the transport level
+            pass
+
+    for _ in range(0, 10):
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futs = []
+            for delay_ms in (0, 1, 10, 50, 100, 200, 500, 800):
+                f = executor.submit(create_bg, delay_ms)
+                futs.append(f)
+            env.pageserver.stop()
+            env.pageserver.start()
+
+            for f in futs:
+                f.result(timeout=10)
+
+    # The tenant should end up active
+    wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 2e58a413e4..4c5cb32caa 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -868,7 +868,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     )
     assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
 
-    # Check that tenant deletion proactively wakes tenants: this is done separately to the main
+    # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main
     # body of the test because it will disrupt tenant counts
     env.pageserver.stop()
     env.pageserver.start(
@@ -876,9 +876,22 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     )
 
     wait_until(10, 1, at_least_one_active)
-    delete_tenant_id = list(
+
+    detach_tenant_id = list(
         [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
     )[0][0]
+    delete_tenant_id = list(
+        [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
+    )[1][0]
+
+    # Detaching a stuck tenant should proceed promptly
+    # (reproducer for https://github.com/neondatabase/neon/pull/6430)
+    env.pageserver.http_client().tenant_detach(detach_tenant_id, timeout_secs=10)
+    tenant_ids.remove(detach_tenant_id)
+    # FIXME: currently the mechanism for cancelling attach is to set state to broken, which is reported spuriously at error level
+    env.pageserver.allowed_errors.append(
+        ".*attach failed, setting tenant state to Broken: Shut down while Attaching"
+    )
 
     # Deleting a stuck tenant should prompt it to go active
     with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -912,9 +925,10 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
         wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
         tenant_ids.remove(delete_tenant_id)
 
-    # Check that all the stuck tenants proceed to active (apart from the one that deletes)
+    # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one
+    # we detached)
     wait_until(10, 1, all_active)
-    assert len(get_tenant_states()) == n_tenants - 1
+    assert len(get_tenant_states()) == n_tenants - 2
 
 
 def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index 7d03f644d1..97db857c74 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -137,6 +137,9 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder):
 
     ps_client = env.pageserver.http_client()
 
+    # Mark the initdb archive for preservation
+    ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id)
+
     # shut down the endpoint and delete the timeline from the pageserver
     endpoint.stop()
 
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 8207291128..11e970fe2b 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 82072911287cabb32018cf92c8425fa1c744def4
+Subproject commit 11e970fe2be56804f0a786ec5fc8141ffefa4ca7
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index c1c2272f43..731b4d1609 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit c1c2272f436ed9231f6172f49de219fe71a9280d
+Subproject commit 731b4d1609d6db1c953755810a41e0e67ea3db7b
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 7be4a52d72..cf302768b2 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 7be4a52d728459b79b59343c57d338c3073059c8
+Subproject commit cf302768b2890569956641e0e5ba112ae1445351
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 3d626cb2bc..c7b33f8c8a 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "7be4a52d728459b79b59343c57d338c3073059c8",
-    "postgres-v15": "c1c2272f436ed9231f6172f49de219fe71a9280d",
-    "postgres-v14": "82072911287cabb32018cf92c8425fa1c744def4"
+    "postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351",
+    "postgres-v15": "731b4d1609d6db1c953755810a41e0e67ea3db7b",
+    "postgres-v14": "11e970fe2be56804f0a786ec5fc8141ffefa4ca7"
 }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index dbd46054a4..9d0f9bfcee 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -29,10 +29,9 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
-dashmap = { version = "5", default-features = false, features = ["raw-api"] }
+diesel = { version = "2", features = ["postgres", "serde_json"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
-futures = { version = "0.3" }
 futures-channel = { version = "0.3", features = ["sink"] }
 futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
@@ -74,6 +73,7 @@ tokio-rustls = { version = "0.24" }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }
 toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
 toml_edit = { version = "0.19", features = ["serde"] }
+tonic = { version = "0.9", features = ["tls-roots"] }
 tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
@@ -109,8 +109,10 @@ regex-automata = { version = "0.4", default-features = false, features = ["dfa-o
 regex-syntax = { version = "0.8" }
 serde = { version = "1", features = ["alloc", "derive"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
-syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] }
+syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }
+toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
+toml_edit = { version = "0.19", features = ["serde"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }