diff --git a/.config/nextest.toml b/.config/nextest.toml
index a9398e4ab0..affdc16f31 100644
--- a/.config/nextest.toml
+++ b/.config/nextest.toml
@@ -1,2 +1,2 @@
 [profile.default]
-slow-timeout = { period = "20s", terminate-after = 3 }
+slow-timeout = { period = "60s", terminate-after = 3 }
diff --git a/.dockerignore b/.dockerignore
index f7a6232ba1..1258532db8 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -17,6 +17,7 @@
 !libs/
 !neon_local/
 !pageserver/
+!patches/
 !pgxn/
 !proxy/
 !s3_scrubber/
diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index cb36e2eee6..37983798b7 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,12 +1,11 @@
 self-hosted-runner:
   labels:
     - arm64
-    - dev
     - gen3
     - large
-    # Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
-    - macos-14
+    - large-arm64
     - small
+    - small-arm64
     - us-east-2
 config-variables:
   - REMOTE_STORAGE_AZURE_CONTAINER
diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml
index dea3fc2357..9f752d5a89 100644
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -3,13 +3,13 @@ description: 'Create Branch using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   project_id:
-    desctiption: 'ID of the Project to create Branch in'
+    description: 'ID of the Project to create Branch in'
     required: true
   api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
     default: console-stage.neon.build
 outputs:
   dsn:
diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml
index 8acba7ad00..58141a4a3f 100644
--- a/.github/actions/neon-branch-delete/action.yml
+++ b/.github/actions/neon-branch-delete/action.yml
@@ -3,16 +3,16 @@ description: 'Delete Branch using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   project_id:
-    desctiption: 'ID of the Project which should be deleted'
+    description: 'ID of the Project which should be deleted'
     required: true
   branch_id:
-    desctiption: 'ID of the branch to delete'
+    description: 'ID of the branch to delete'
     required: true
   api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
     default: console-stage.neon.build
 
 runs:
diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml
index 7f0e599b97..16759ad038 100644
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   region_id:
-    desctiption: 'Region ID, if not set the project will be created in the default region'
+    description: 'Region ID, if not set the project will be created in the default region'
     default: aws-us-east-2
   postgres_version:
-    desctiption: 'Postgres version; default is 15'
-    default: 15
+    description: 'Postgres version; default is 15'
+    default: '15'
   api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
     default: console-stage.neon.build
   provisioner:
-    desctiption: 'k8s-pod or k8s-neonvm'
+    description: 'k8s-pod or k8s-neonvm'
     default: 'k8s-pod'
   compute_units:
-    desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
+    description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
     default: '[1, 1]'
 
 outputs:
diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml
index b8ec6cac70..35e165fd61 100644
--- a/.github/actions/neon-project-delete/action.yml
+++ b/.github/actions/neon-project-delete/action.yml
@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'
 
 inputs:
   api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
     required: true
   project_id:
-    desctiption: 'ID of the Project to delete'
+    description: 'ID of the Project to delete'
     required: true
   api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
     default: console-stage.neon.build
 
 runs:
diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml
index c527cef1ac..bdf00bcaae 100644
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -39,7 +39,7 @@ jobs:
       matrix:
         arch: [ x64, arm64 ]
 
-    runs-on: ${{ fromJson(format('["self-hosted", "dev", "{0}"]', matrix.arch)) }}
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     env:
       IMAGE_TAG: ${{ inputs.image-tag }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index eada65505f..f8c011a0a5 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -341,6 +341,9 @@ jobs:
         env:
           NEXTEST_RETRIES: 3
         run: |
+          #nextest does not yet support running doctests
+          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
+
           for io_engine in std-fs tokio-epoll-uring ; do
             NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
           done
@@ -543,9 +546,27 @@ jobs:
       # XXX: no coverage data handling here, since benchmarks are run on release builds,
       # while coverage is currently collected for the debug ones
 
+  report-benchmarks-failures:
+    needs: [ benchmarks, create-test-report ]
+    if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: C060CNA47S9 # on-call-staging-storage-stream
+        slack-message: |
+          Benchmarks failed on main: ${{ github.event.head_commit.url }}
+
+          Allure report: ${{ needs.create-test-report.outputs.report-url }}
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
   create-test-report:
     needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
     if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
+    outputs:
+      report-url: ${{ steps.create-allure-report.outputs.report-url }}
 
     runs-on: [ self-hosted, gen3, small ]
     container:
@@ -702,9 +723,13 @@ jobs:
     uses: ./.github/workflows/trigger-e2e-tests.yml
     secrets: inherit
 
-  neon-image:
+  neon-image-arch:
     needs: [ check-permissions, build-build-tools-image, tag ]
-    runs-on: [ self-hosted, gen3, large ]
+    strategy:
+      matrix:
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
       - name: Checkout
@@ -726,12 +751,6 @@ jobs:
           username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
           password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
 
-      - uses: docker/login-action@v3
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
       - uses: docker/build-push-action@v5
         with:
           context: .
@@ -743,25 +762,52 @@ jobs:
           push: true
           pull: true
           file: Dockerfile
-          cache-from: type=registry,ref=neondatabase/neon:cache
-          cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
+          cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
+          cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
           tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
-            neondatabase/neon:${{needs.tag.outputs.build-tag}}
+            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
 
       - name: Remove custom docker config directory
         if: always()
         run: |
           rm -rf .docker-custom
 
-  compute-node-image:
-    needs: [ check-permissions, build-build-tools-image, tag ]
-    runs-on: [ self-hosted, gen3, large ]
+  neon-image:
+    needs: [ neon-image-arch, tag ]
+    runs-on: ubuntu-latest
 
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch image
+        run: |
+          docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
+                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
+                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
+
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - name: Push multi-arch image to ECR
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/neon:${{ needs.tag.outputs.build-tag }}
+
+  compute-node-image-arch:
+    needs: [ check-permissions, build-build-tools-image, tag ]
     strategy:
       fail-fast: false
       matrix:
         version: [ v14, v15, v16 ]
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
 
     steps:
       - name: Checkout
@@ -808,15 +854,14 @@ jobs:
           push: true
           pull: true
           file: Dockerfile.compute-node
-          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
-          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
+          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
+          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
           tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-            neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
 
       - name: Build compute-tools image
         # compute-tools are Postgres independent, so build it only once
-        if: ${{ matrix.version == 'v16' }}
+        if: matrix.version == 'v16'
         uses: docker/build-push-action@v5
         with:
           target: compute-tools-image
@@ -830,14 +875,57 @@ jobs:
           pull: true
           file: Dockerfile.compute-node
           tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
-            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
 
       - name: Remove custom docker config directory
         if: always()
         run: |
           rm -rf .docker-custom
 
+  compute-node-image:
+    needs: [ compute-node-image-arch, tag ]
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        version: [ v14, v15, v16 ]
+
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch compute-node image
+        run: |
+          docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
+                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
+                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
+
+      - name: Create multi-arch compute-tools image
+        if: matrix.version == 'v16'
+        run: |
+          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
+
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+
+      - name: Push multi-arch compute-tools image to ECR
+        if: matrix.version == 'v16'
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
+
   vm-compute-node-image:
     needs: [ check-permissions, tag, compute-node-image ]
     runs-on: [ self-hosted, gen3, large ]
@@ -845,11 +933,8 @@ jobs:
       fail-fast: false
       matrix:
         version: [ v14, v15, v16 ]
-    defaults:
-      run:
-        shell: sh -eu {0}
     env:
-      VM_BUILDER_VERSION: v0.28.1
+      VM_BUILDER_VERSION: v0.29.3
 
     steps:
       - name: Checkout
@@ -862,26 +947,48 @@ jobs:
           curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
           chmod +x vm-builder
 
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
       # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
       # it won't have the proper authentication (written at v0.6.0)
       - name: Pulling compute-node image
         run: |
-          docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
 
       - name: Build vm image
         run: |
           ./vm-builder \
             -spec=vm-image-spec.yaml \
-            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
+            -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
 
       - name: Pushing vm-compute-node image
         run: |
-          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
 
   test-images:
     needs: [ check-permissions, tag, neon-image, compute-node-image ]
-    runs-on: [ self-hosted, gen3, small ]
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
 
     steps:
       - name: Checkout
@@ -899,7 +1006,7 @@ jobs:
       - name: Verify image versions
         shell: bash # ensure no set -e for better error messages
         run: |
-          pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
+          pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
 
           echo "Pageserver version string: $pageserver_version"
 
@@ -925,78 +1032,48 @@ jobs:
 
   promote-images:
     needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-    # Don't add if-condition here.
-    # The job should always be run because we have dependant other jobs that shouldn't be skipped
+    runs-on: ubuntu-latest
+
+    env:
+      VERSIONS: v14 v15 v16
 
     steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
 
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
 
-      - name: Copy vm-compute-node images to Docker Hub
+      - name: Copy vm-compute-node images to ECR
         run: |
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
+          for version in ${VERSIONS}; do
+            docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
+                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
+          done
 
       - name: Add latest tag to images
-        if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
+        if: github.ref_name == 'main'
         run: |
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
+            docker buildx imagetools create -t $repo/neon:latest \
+                                               $repo/neon:${{ needs.tag.outputs.build-tag }}
 
-      - name: Push images to production ECR
-        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
-        run: |
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
+            docker buildx imagetools create -t $repo/compute-tools:latest \
+                                               $repo/compute-tools:${{ needs.tag.outputs.build-tag }}
 
-      - name: Configure Docker Hub login
-        run: |
-          # ECR Credential Helper & Docker Hub don't work together in config, hence reset
-          echo "" > /github/home/.docker/config.json
-          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
+            for version in ${VERSIONS}; do
+              docker buildx imagetools create -t $repo/compute-node-${version}:latest \
+                                                 $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
 
-      - name: Push vm-compute-node to Docker Hub
-        run: |
-          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
-          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
-          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
-
-      - name: Push latest tags to Docker Hub
-        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
-        run: |
-          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
+              docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
+                                                 $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
+            done
+          done
 
   trigger-custom-extensions-build-and-wait:
     needs: [ check-permissions, tag ]
diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml
index 5a2f9d6645..7d2187e59c 100644
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -136,7 +136,7 @@ jobs:
   check-linux-arm-build:
     needs: [ check-permissions, build-build-tools-image ]
     timeout-minutes: 90
-    runs-on: [ self-hosted, dev, arm64 ]
+    runs-on: [ self-hosted, small-arm64 ]
 
     env:
       # Use release build only, to have less debug info around
@@ -232,20 +232,20 @@ jobs:
 
       - name: Run cargo build
         run: |
-          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+          mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
 
       - name: Run cargo test
         env:
           NEXTEST_RETRIES: 3
         run: |
-          cargo nextest run $CARGO_FEATURES
+          cargo nextest run $CARGO_FEATURES -j$(nproc)
 
           # Run separate tests for real S3
           export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
           export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
           export REMOTE_STORAGE_S3_REGION=eu-central-1
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo nextest run --package remote_storage --test test_real_s3
+          cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
 
           # Run separate tests for real Azure Blob Storage
           # XXX: replace region with `eu-central-1`-like region
@@ -255,12 +255,12 @@ jobs:
           export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
           export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          cargo nextest run --package remote_storage --test test_real_azure
+          cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
 
   check-codestyle-rust-arm:
     needs: [ check-permissions, build-build-tools-image ]
     timeout-minutes: 90
-    runs-on: [ self-hosted, dev, arm64 ]
+    runs-on: [ self-hosted, small-arm64 ]
 
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -269,6 +269,11 @@ jobs:
         password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
       options: --init
 
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+
     steps:
       - name: Fix git ownership
         run: |
@@ -305,31 +310,35 @@ jobs:
             exit 1
           fi
           echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+
       - name: Run cargo clippy (debug)
+        if: matrix.build_type == 'debug'
         run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
       - name: Run cargo clippy (release)
+        if: matrix.build_type == 'release'
         run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
 
       - name: Check documentation generation
-        run: cargo doc --workspace --no-deps --document-private-items
+        if: matrix.build_type == 'release'
+        run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
         env:
             RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
 
       # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
       - name: Check formatting
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
         run: cargo fmt --all -- --check
 
       # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
       - name: Check rust dependencies
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
         run: |
           cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
           cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
 
       # https://github.com/EmbarkStudios/cargo-deny
       - name: Check rust licenses/bans/advisories/sources
-        if: ${{ !cancelled() }}
+        if: ${{ !cancelled() && matrix.build_type == 'release' }}
         run: cargo deny check
 
   gather-rust-build-stats:
@@ -338,7 +347,7 @@ jobs:
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
       contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
       github.ref_name == 'main'
-    runs-on: [ self-hosted, gen3, large ]
+    runs-on: [ self-hosted, large ]
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}
       credentials:
@@ -369,7 +378,7 @@ jobs:
         run: make walproposer-lib -j$(nproc)
 
       - name: Produce the build stats
-        run: cargo build --all --release --timings
+        run: cargo build --all --release --timings -j$(nproc)
 
       - name: Upload the build stats
         id: upload-stats
diff --git a/Cargo.lock b/Cargo.lock
index 9bff5e1eff..d8f9021eb8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
 
 [[package]]
 name = "ahash"
-version = "0.8.9"
+version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
+checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
 dependencies = [
  "cfg-if",
  "const-random",
@@ -284,9 +284,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
 [[package]]
 name = "aws-config"
-version = "1.1.4"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b30c39ebe61f75d1b3785362b1586b41991873c9ab3e317a9181c246fb71d82"
+checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -309,14 +309,15 @@ dependencies = [
  "time",
  "tokio",
  "tracing",
+ "url",
  "zeroize",
 ]
 
 [[package]]
 name = "aws-credential-types"
-version = "1.1.8"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8"
+checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-runtime-api",
@@ -326,9 +327,9 @@ dependencies = [
 
 [[package]]
 name = "aws-runtime"
-version = "1.1.8"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b13dc54b4b49f8288532334bba8f87386a40571c47c37b1304979b556dc613c8"
+checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1"
 dependencies = [
  "aws-credential-types",
  "aws-sigv4",
@@ -373,10 +374,11 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-s3"
-version = "1.14.0"
+version = "1.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "951f7730f51a2155c711c85c79f337fbc02a577fa99d2a0a8059acfce5392113"
+checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d"
 dependencies = [
+ "ahash",
  "aws-credential-types",
  "aws-runtime",
  "aws-sigv4",
@@ -391,20 +393,25 @@ dependencies = [
  "aws-smithy-xml",
  "aws-types",
  "bytes",
+ "fastrand 2.0.0",
+ "hex",
+ "hmac",
  "http 0.2.9",
  "http-body 0.4.5",
+ "lru",
  "once_cell",
  "percent-encoding",
  "regex-lite",
+ "sha2",
  "tracing",
  "url",
 ]
 
 [[package]]
 name = "aws-sdk-sso"
-version = "1.12.0"
+version = "1.22.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f486420a66caad72635bc2ce0ff6581646e0d32df02aa39dc983bfe794955a5b"
+checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -424,9 +431,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-ssooidc"
-version = "1.12.0"
+version = "1.22.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "39ddccf01d82fce9b4a15c8ae8608211ee7db8ed13a70b514bbfe41df3d24841"
+checksum = "73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -446,9 +453,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-sts"
-version = "1.12.0"
+version = "1.22.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a591f8c7e6a621a501b2b5d2e88e1697fcb6274264523a6ad4d5959889a41ce"
+checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -469,9 +476,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sigv4"
-version = "1.2.0"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263"
+checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-eventstream",
@@ -498,9 +505,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-async"
-version = "1.1.8"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d26ea8fa03025b2face2b3038a63525a10891e3d8829901d502e5384a0d8cd46"
+checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c"
 dependencies = [
  "futures-util",
  "pin-project-lite",
@@ -509,9 +516,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-checksums"
-version = "0.60.4"
+version = "0.60.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be2acd1b9c6ae5859999250ed5a62423aedc5cf69045b844432de15fa2f31f2b"
+checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
@@ -541,9 +548,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-http"
-version = "0.60.7"
+version = "0.60.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9"
+checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08"
 dependencies = [
  "aws-smithy-eventstream",
  "aws-smithy-runtime-api",
@@ -581,9 +588,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime"
-version = "1.1.8"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec81002d883e5a7fd2bb063d6fb51c4999eb55d404f4fff3dd878bf4733b9f01"
+checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -594,6 +601,7 @@ dependencies = [
  "h2 0.3.26",
  "http 0.2.9",
  "http-body 0.4.5",
+ "http-body 1.0.0",
  "hyper 0.14.26",
  "hyper-rustls 0.24.0",
  "once_cell",
@@ -606,9 +614,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime-api"
-version = "1.2.0"
+version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9acb931e0adaf5132de878f1398d83f8677f90ba70f01f65ff87f6d7244be1c5"
+checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-types",
@@ -623,16 +631,19 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-types"
-version = "1.1.8"
+version = "1.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729"
+checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1"
 dependencies = [
  "base64-simd",
  "bytes",
  "bytes-utils",
  "futures-core",
  "http 0.2.9",
+ "http 1.1.0",
  "http-body 0.4.5",
+ "http-body 1.0.0",
+ "http-body-util",
  "itoa",
  "num-integer",
  "pin-project-lite",
@@ -646,18 +657,18 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-xml"
-version = "0.60.7"
+version = "0.60.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9"
+checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55"
 dependencies = [
  "xmlparser",
 ]
 
 [[package]]
 name = "aws-types"
-version = "1.1.8"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0dbf2f3da841a8930f159163175cf6a3d16ddde517c1b0fba7aa776822800f40"
+checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-async",
@@ -697,7 +708,7 @@ dependencies = [
  "sha1",
  "sync_wrapper",
  "tokio",
- "tokio-tungstenite 0.20.0",
+ "tokio-tungstenite",
  "tower",
  "tower-layer",
  "tower-service",
@@ -968,6 +979,12 @@ version = "3.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
 
+[[package]]
+name = "bytemuck"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
+
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@@ -1055,9 +1072,9 @@ dependencies = [
 
 [[package]]
 name = "chrono"
-version = "0.4.31"
+version = "0.4.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
 dependencies = [
  "android-tzdata",
  "iana-time-zone",
@@ -1065,7 +1082,7 @@ dependencies = [
  "num-traits",
  "serde",
  "wasm-bindgen",
- "windows-targets 0.48.0",
+ "windows-targets 0.52.4",
 ]
 
 [[package]]
@@ -1092,7 +1109,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
 dependencies = [
  "ciborium-io",
- "half",
+ "half 1.8.2",
 ]
 
 [[package]]
@@ -1222,8 +1239,10 @@ dependencies = [
  "serde_json",
  "signal-hook",
  "tar",
+ "thiserror",
  "tokio",
  "tokio-postgres",
+ "tokio-stream",
  "tokio-util",
  "toml_edit",
  "tracing",
@@ -1452,26 +1471,21 @@ dependencies = [
 
 [[package]]
 name = "crossbeam-deque"
-version = "0.8.3"
+version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
+checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
 dependencies = [
- "cfg-if",
  "crossbeam-epoch",
  "crossbeam-utils",
 ]
 
 [[package]]
 name = "crossbeam-epoch"
-version = "0.9.14"
+version = "0.9.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
 dependencies = [
- "autocfg",
- "cfg-if",
  "crossbeam-utils",
- "memoffset 0.8.0",
- "scopeguard",
 ]
 
 [[package]]
@@ -1585,7 +1599,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
 dependencies = [
  "cfg-if",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
  "lock_api",
  "once_cell",
  "parking_lot_core 0.9.8",
@@ -1986,6 +2000,27 @@ dependencies = [
  "percent-encoding",
 ]
 
+[[package]]
+name = "framed-websockets"
+version = "0.1.0"
+source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
+dependencies = [
+ "base64 0.21.1",
+ "bytemuck",
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "http-body-util",
+ "hyper 1.2.0",
+ "hyper-util",
+ "pin-project",
+ "rand 0.8.5",
+ "sha1",
+ "thiserror",
+ "tokio",
+ "tokio-util",
+]
+
 [[package]]
 name = "fs2"
 version = "0.4.3"
@@ -2238,6 +2273,17 @@ version = "1.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
 
+[[package]]
+name = "half"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "num-traits",
+]
+
 [[package]]
 name = "hash32"
 version = "0.3.1"
@@ -2264,9 +2310,9 @@ dependencies = [
 
 [[package]]
 name = "hashbrown"
-version = "0.14.0"
+version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
 dependencies = [
  "ahash",
  "allocator-api2",
@@ -2274,11 +2320,11 @@ dependencies = [
 
 [[package]]
 name = "hashlink"
-version = "0.8.4"
+version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
+checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
 dependencies = [
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
 ]
 
 [[package]]
@@ -2587,21 +2633,6 @@ dependencies = [
  "tokio-native-tls",
 ]
 
-[[package]]
-name = "hyper-tungstenite"
-version = "0.13.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
-dependencies = [
- "http-body-util",
- "hyper 1.2.0",
- "hyper-util",
- "pin-project-lite",
- "tokio",
- "tokio-tungstenite 0.21.0",
- "tungstenite 0.21.0",
-]
-
 [[package]]
 name = "hyper-util"
 version = "0.1.3"
@@ -2679,7 +2710,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
 dependencies = [
  "equivalent",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
 ]
 
 [[package]]
@@ -2935,6 +2966,15 @@ version = "0.4.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
 
+[[package]]
+name = "lru"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
+dependencies = [
+ "hashbrown 0.14.5",
+]
+
 [[package]]
 name = "match_cfg"
 version = "0.1.0"
@@ -2985,7 +3025,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
 dependencies = [
  "bytes",
  "crossbeam-utils",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
  "itoa",
  "lasso",
  "measured-derive",
@@ -3547,7 +3587,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
 dependencies = [
  "dlv-list",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
 ]
 
 [[package]]
@@ -3868,13 +3908,14 @@ dependencies = [
 
 [[package]]
 name = "parquet"
-version = "49.0.0"
-source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
+version = "51.0.0"
+source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
 dependencies = [
  "ahash",
  "bytes",
  "chrono",
- "hashbrown 0.14.0",
+ "half 2.4.1",
+ "hashbrown 0.14.5",
  "num",
  "num-bigint",
  "paste",
@@ -3882,12 +3923,13 @@ dependencies = [
  "thrift",
  "twox-hash",
  "zstd",
+ "zstd-sys",
 ]
 
 [[package]]
 name = "parquet_derive"
-version = "49.0.0"
-source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
+version = "51.0.0"
+source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
 dependencies = [
  "parquet",
  "proc-macro2",
@@ -3914,9 +3956,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
 
 [[package]]
 name = "pbkdf2"
-version = "0.12.1"
+version = "0.12.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
+checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
 dependencies = [
  "digest",
  "hmac",
@@ -4339,6 +4381,7 @@ dependencies = [
 name = "proxy"
 version = "0.1.0"
 dependencies = [
+ "ahash",
  "anyhow",
  "async-compression",
  "async-trait",
@@ -4355,12 +4398,14 @@ dependencies = [
  "chrono",
  "clap",
  "consumption_metrics",
+ "crossbeam-deque",
  "dashmap",
  "env_logger",
  "fallible-iterator",
+ "framed-websockets",
  "futures",
  "git-version",
- "hashbrown 0.13.2",
+ "hashbrown 0.14.5",
  "hashlink",
  "hex",
  "hmac",
@@ -4370,7 +4415,6 @@ dependencies = [
  "humantime",
  "hyper 0.14.26",
  "hyper 1.2.0",
- "hyper-tungstenite",
  "hyper-util",
  "indexmap 2.0.1",
  "ipnet",
@@ -4415,7 +4459,6 @@ dependencies = [
  "smol_str",
  "socket2 0.5.5",
  "subtle",
- "sync_wrapper",
  "task-local-extensions",
  "thiserror",
  "tikv-jemalloc-ctl",
@@ -4424,6 +4467,7 @@ dependencies = [
  "tokio-postgres",
  "tokio-postgres-rustls",
  "tokio-rustls 0.25.0",
+ "tokio-tungstenite",
  "tokio-util",
  "tower-service",
  "tracing",
@@ -5932,7 +5976,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
 [[package]]
 name = "svg_fmt"
 version = "0.4.2"
-source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8"
+source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"
 
 [[package]]
 name = "syn"
@@ -6360,19 +6404,7 @@ dependencies = [
  "futures-util",
  "log",
  "tokio",
- "tungstenite 0.20.1",
-]
-
-[[package]]
-name = "tokio-tungstenite"
-version = "0.21.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
-dependencies = [
- "futures-util",
- "log",
- "tokio",
- "tungstenite 0.21.0",
+ "tungstenite",
 ]
 
 [[package]]
@@ -6386,7 +6418,7 @@ dependencies = [
  "futures-io",
  "futures-sink",
  "futures-util",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
  "pin-project-lite",
  "tokio",
  "tracing",
@@ -6668,25 +6700,6 @@ dependencies = [
  "utf-8",
 ]
 
-[[package]]
-name = "tungstenite"
-version = "0.21.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
-dependencies = [
- "byteorder",
- "bytes",
- "data-encoding",
- "http 1.1.0",
- "httparse",
- "log",
- "rand 0.8.5",
- "sha1",
- "thiserror",
- "url",
- "utf-8",
-]
-
 [[package]]
 name = "twox-hash"
 version = "1.6.3"
@@ -7457,6 +7470,7 @@ dependencies = [
 name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
+ "ahash",
  "anyhow",
  "aws-config",
  "aws-runtime",
@@ -7482,7 +7496,7 @@ dependencies = [
  "futures-sink",
  "futures-util",
  "getrandom 0.2.11",
- "hashbrown 0.14.0",
+ "hashbrown 0.14.5",
  "hex",
  "hmac",
  "hyper 0.14.26",
diff --git a/Cargo.toml b/Cargo.toml
index 1ddadd2f3c..0887c039f8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,6 +41,7 @@ license = "Apache-2.0"
 
 ## All dependency versions, used in the project
 [workspace.dependencies]
+ahash = "0.8"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
@@ -52,14 +53,14 @@ azure_storage_blobs = "0.19"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "1.14"
+aws-config = { version = "1.3", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "1.26"
 aws-sdk-iam = "1.15.0"
-aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
-aws-smithy-types = "1.1.4"
-aws-credential-types = "1.1.4"
-aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
-aws-types = "1.1.7"
+aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
+aws-smithy-types = "1.1.9"
+aws-credential-types = "1.2.0"
+aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
+aws-types = "1.2.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -74,6 +75,7 @@ clap = { version = "4.0", features = ["derive"] }
 comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
+crossbeam-deque = "0.8.5"
 crossbeam-utils = "0.8.5"
 dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
@@ -81,13 +83,14 @@ enum-map = "2.4.2"
 enumset = "1.0.12"
 fail = "0.5.0"
 fallible-iterator = "0.2"
+framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
 fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
 futures-util = "0.3"
 git-version = "0.3"
-hashbrown = "0.13"
-hashlink = "0.8.4"
+hashbrown = "0.14"
+hashlink = "0.9.1"
 hdrhistogram = "7.5.2"
 hex = "0.4"
 hex-literal = "0.4"
@@ -98,7 +101,7 @@ http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-hyper-tungstenite = "0.13.0"
+tokio-tungstenite = "0.20.0"
 indexmap = "2"
 inotify = "0.10.2"
 ipnet = "2.9.0"
@@ -121,8 +124,8 @@ opentelemetry = "0.20.0"
 opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
-parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
-parquet_derive = "49.0.0"
+parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
+parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.14"
@@ -158,8 +161,8 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
-# https://github.com/nical/rust_debug/pull/4
-svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
+# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
+svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
@@ -243,8 +246,8 @@ tonic-build = "0.9"
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 
 # bug fixes for UUID
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
-parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
+parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
 
 ################# Binary contents sections
 
diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools
index 19739cc1f8..460b8c996d 100644
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -87,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
     && rm awscliv2.zip
 
 # Mold: A Modern Linker
-ENV MOLD_VERSION v2.4.0
+ENV MOLD_VERSION v2.31.0
 RUN set -e \
     && git clone https://github.com/rui314/mold.git \
     && mkdir mold/build \
diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index bd4534ce1d..5bf3246f34 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
-    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
+COPY patches/pgvector.patch /pgvector.patch
+
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
+    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
+    patch -p1 < /pgvector.patch && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
diff --git a/README.md b/README.md
index 00a90f4483..ea0a289502 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
-[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech)
+[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech)
+
+
 
 # Neon
 
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 759a117ee9..8f96530a9d 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -27,10 +27,12 @@ reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-util.workspace = true
+tokio-stream.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
+thiserror.workspace = true
 url.workspace = true
 
 compute_api.workspace = true
diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
new file mode 100644
index 0000000000..4fefa831e0
--- /dev/null
+++ b/compute_tools/src/catalog.rs
@@ -0,0 +1,116 @@
+use compute_api::{
+    responses::CatalogObjects,
+    spec::{Database, Role},
+};
+use futures::Stream;
+use postgres::{Client, NoTls};
+use std::{path::Path, process::Stdio, result::Result, sync::Arc};
+use tokio::{
+    io::{AsyncBufReadExt, BufReader},
+    process::Command,
+    task,
+};
+use tokio_stream::{self as stream, StreamExt};
+use tokio_util::codec::{BytesCodec, FramedRead};
+use tracing::warn;
+
+use crate::{
+    compute::ComputeNode,
+    pg_helpers::{get_existing_dbs, get_existing_roles},
+};
+
+pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
+    let connstr = compute.connstr.clone();
+    task::spawn_blocking(move || {
+        let mut client = Client::connect(connstr.as_str(), NoTls)?;
+        let roles: Vec<Role>;
+        {
+            let mut xact = client.transaction()?;
+            roles = get_existing_roles(&mut xact)?;
+        }
+        let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
+
+        Ok(CatalogObjects { roles, databases })
+    })
+    .await?
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum SchemaDumpError {
+    #[error("Database does not exist.")]
+    DatabaseDoesNotExist,
+    #[error("Failed to execute pg_dump.")]
+    IO(#[from] std::io::Error),
+}
+
+// It uses the pg_dump utility to dump the schema of the specified database.
+// The output is streamed back to the caller and supposed to be streamed via HTTP.
+//
+// Before return the result with the output, it checks that pg_dump produced any output.
+// If not, it tries to parse the stderr output to determine if the database does not exist
+// and special error is returned.
+//
+// To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature.
+pub async fn get_database_schema(
+    compute: &Arc<ComputeNode>,
+    dbname: &str,
+) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
+    let pgbin = &compute.pgbin;
+    let basepath = Path::new(pgbin).parent().unwrap();
+    let pgdump = basepath.join("pg_dump");
+    let mut connstr = compute.connstr.clone();
+    connstr.set_path(dbname);
+    let mut cmd = Command::new(pgdump)
+        .arg("--schema-only")
+        .arg(connstr.as_str())
+        .stdout(Stdio::piped())
+        .stderr(Stdio::piped())
+        .kill_on_drop(true)
+        .spawn()?;
+
+    let stdout = cmd.stdout.take().ok_or_else(|| {
+        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
+    })?;
+
+    let stderr = cmd.stderr.take().ok_or_else(|| {
+        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
+    })?;
+
+    let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
+    let stderr_reader = BufReader::new(stderr);
+
+    let first_chunk = match stdout_reader.next().await {
+        Some(Ok(bytes)) if !bytes.is_empty() => bytes,
+        Some(Err(e)) => {
+            return Err(SchemaDumpError::IO(e));
+        }
+        _ => {
+            let mut lines = stderr_reader.lines();
+            if let Some(line) = lines.next_line().await? {
+                if line.contains(&format!("FATAL:  database \"{}\" does not exist", dbname)) {
+                    return Err(SchemaDumpError::DatabaseDoesNotExist);
+                }
+                warn!("pg_dump stderr: {}", line)
+            }
+            tokio::spawn(async move {
+                while let Ok(Some(line)) = lines.next_line().await {
+                    warn!("pg_dump stderr: {}", line)
+                }
+            });
+
+            return Err(SchemaDumpError::IO(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "failed to start pg_dump",
+            )));
+        }
+    };
+    let initial_stream = stream::once(Ok(first_chunk.freeze()));
+    // Consume stderr and log warnings
+    tokio::spawn(async move {
+        let mut lines = stderr_reader.lines();
+        while let Ok(Some(line)) = lines.next_line().await {
+            warn!("pg_dump stderr: {}", line)
+        }
+    });
+    Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
+}
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index 128783b477..0286429cf2 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -5,17 +5,21 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;
 
+use crate::catalog::SchemaDumpError;
+use crate::catalog::{get_database_schema, get_dbs_and_roles};
 use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
 
 use anyhow::Result;
+use hyper::header::CONTENT_TYPE;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use tokio::task;
 use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;
+use utils::http::request::must_get_query_param;
 
 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
     ComputeStatusResponse {
@@ -133,6 +137,34 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
             }
         }
 
+        (&Method::GET, "/dbs_and_roles") => {
+            info!("serving /dbs_and_roles GET request",);
+            match get_dbs_and_roles(compute).await {
+                Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
+                Err(_) => {
+                    render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
+
+        (&Method::GET, "/database_schema") => {
+            let database = match must_get_query_param(&req, "database") {
+                Err(e) => return e.into_response(),
+                Ok(database) => database,
+            };
+            info!("serving /database_schema GET request with database: {database}",);
+            match get_database_schema(compute, &database).await {
+                Ok(res) => render_plain(Body::wrap_stream(res)),
+                Err(SchemaDumpError::DatabaseDoesNotExist) => {
+                    render_json_error("database does not exist", StatusCode::NOT_FOUND)
+                }
+                Err(e) => {
+                    error!("can't get schema dump: {}", e);
+                    render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
+
         // download extension files from remote extension storage on demand
         (&Method::POST, route) if route.starts_with("/extension_server/") => {
             info!("serving {:?} POST request", route);
@@ -303,10 +335,25 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
     };
     Response::builder()
         .status(status)
+        .header(CONTENT_TYPE, "application/json")
         .body(Body::from(serde_json::to_string(&error).unwrap()))
         .unwrap()
 }
 
+fn render_json(body: Body) -> Response<Body> {
+    Response::builder()
+        .header(CONTENT_TYPE, "application/json")
+        .body(body)
+        .unwrap()
+}
+
+fn render_plain(body: Body) -> Response<Body> {
+    Response::builder()
+        .header(CONTENT_TYPE, "text/plain")
+        .body(body)
+        .unwrap()
+}
+
 async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
     {
         let mut state = compute.state.lock().unwrap();
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index d2ec54299f..b0ddaeae2b 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -68,6 +68,51 @@ paths:
               schema:
                 $ref: "#/components/schemas/Info"
 
+  /dbs_and_roles:
+    get:
+      tags:
+        - Info
+      summary: Get databases and roles in the catalog.
+      description: ""
+      operationId: getDbsAndRoles
+      responses:
+        200:
+          description: Compute schema objects
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/DbsAndRoles"
+
+  /database_schema:
+    get:
+      tags:
+        - Info
+      summary: Get schema dump
+      parameters:
+        - name: database
+          in: query
+          description: Database name to dump.
+          required: true
+          schema:
+            type: string
+          example: "postgres"
+      description: Get schema dump in SQL format.
+      operationId: getDatabaseSchema
+      responses:
+        200:
+          description: Schema dump
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Schema dump in SQL format.
+        404:
+          description: Non existing database.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
   /check_writability:
     post:
       tags:
@@ -229,6 +274,73 @@ components:
         num_cpus:
           type: integer
 
+    DbsAndRoles:
+      type: object
+      description: Databases and Roles
+      required:
+        - roles
+        - databases
+      properties:
+        roles:
+          type: array
+          items:
+            $ref: "#/components/schemas/Role"
+        databases:
+          type: array
+          items:
+            $ref: "#/components/schemas/Database"
+
+    Database:
+      type: object
+      description: Database
+      required:
+        - name
+        - owner
+        - restrict_conn
+        - invalid
+      properties:
+        name:
+          type: string
+        owner:
+          type: string
+        options:
+          type: array
+          items:
+            $ref: "#/components/schemas/GenericOption"
+        restrict_conn:
+          type: boolean
+        invalid:
+          type: boolean
+
+    Role:
+      type: object
+      description: Role
+      required:
+        - name
+      properties:
+        name:
+          type: string
+        encrypted_password:
+          type: string
+        options:
+          type: array
+          items:
+            $ref: "#/components/schemas/GenericOption"
+
+    GenericOption:
+      type: object
+      description: Schema Generic option
+      required:
+        - name
+        - vartype
+      properties:
+        name:
+          type: string
+        value:
+          type: string
+        vartype:
+          type: string
+
     ComputeState:
       type: object
       required:
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index eac808385c..18c228ba54 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -8,6 +8,7 @@ pub mod configurator;
 pub mod http;
 #[macro_use]
 pub mod logger;
+pub mod catalog;
 pub mod compute;
 pub mod extension_server;
 pub mod monitor;
diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs
index c22b6bc14e..024c5b338e 100644
--- a/compute_tools/src/swap.rs
+++ b/compute_tools/src/swap.rs
@@ -1,3 +1,5 @@
+use std::path::Path;
+
 use anyhow::{anyhow, Context};
 use tracing::warn;
 
@@ -17,17 +19,24 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
         .arg(size_bytes.to_string())
         .spawn();
 
-    if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
-        warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
-        return Ok(());
-    }
-
     child_result
         .context("spawn() failed")
         .and_then(|mut child| child.wait().context("wait() failed"))
         .and_then(|status| match status.success() {
             true => Ok(()),
-            false => Err(anyhow!("process exited with {status}")),
+            false => {
+                // The command failed. Maybe it was because the resize-swap file doesn't exist?
+                // The --once flag causes it to delete itself on success so we don't disable swap
+                // while postgres is running; maybe this is fine.
+                match Path::new(RESIZE_SWAP_BIN).try_exists() {
+                    Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
+                    // The path doesn't exist; we're actually ok 
+                    Ok(false) => {
+                        warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
+                        Ok(())
+                    },
+                }
+            }
         })
         // wrap any prior error with the overall context that we couldn't run the command
         .with_context(|| {
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 179a756135..18e395e2b5 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -9,8 +9,11 @@ use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
 use compute_api::spec::ComputeMode;
 use control_plane::endpoint::ComputeControlPlane;
-use control_plane::local_env::{InitForceMode, LocalEnv};
-use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
+use control_plane::local_env::{
+    InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
+    SafekeeperConf,
+};
+use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
@@ -52,44 +55,6 @@ const DEFAULT_PG_VERSION: &str = "15";
 
 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
 
-fn default_conf(num_pageservers: u16) -> String {
-    let mut template = format!(
-        r#"
-# Default built-in configuration, defined in main.rs
-control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
-
-[broker]
-listen_addr = '{DEFAULT_BROKER_ADDR}'
-
-[[safekeepers]]
-id = {DEFAULT_SAFEKEEPER_ID}
-pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
-http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
-
-"#,
-    );
-
-    for i in 0..num_pageservers {
-        let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
-        let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
-        let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
-
-        template += &format!(
-            r#"
-[[pageservers]]
-id = {pageserver_id}
-listen_pg_addr = '127.0.0.1:{pg_port}'
-listen_http_addr = '127.0.0.1:{http_port}'
-pg_auth_type = '{trust_auth}'
-http_auth_type = '{trust_auth}'
-"#,
-            trust_auth = AuthType::Trust,
-        )
-    }
-
-    template
-}
-
 ///
 /// Timelines tree element used as a value in the HashMap.
 ///
@@ -152,7 +117,7 @@ fn main() -> Result<()> {
     };
 
     match subcommand_result {
-        Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?,
+        Ok(Some(updated_env)) => updated_env.persist_config()?,
         Ok(None) => (),
         Err(e) => {
             eprintln!("command failed: {e:?}");
@@ -341,55 +306,65 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }
 
 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
-    let num_pageservers = init_match
-        .get_one::<u16>("num-pageservers")
-        .expect("num-pageservers arg has a default");
-    // Create config file
-    let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
+    let num_pageservers = init_match.get_one::<u16>("num-pageservers");
+
+    let force = init_match.get_one("force").expect("we set a default value");
+
+    // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`.
+    let init_conf: NeonLocalInitConf = if let Some(config_path) =
+        init_match.get_one::<PathBuf>("config")
+    {
+        // User (likely the Python test suite) provided a description of the environment.
+        if num_pageservers.is_some() {
+            bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead");
+        }
         // load and parse the file
-        std::fs::read_to_string(config_path).with_context(|| {
+        let contents = std::fs::read_to_string(config_path).with_context(|| {
             format!(
                 "Could not read configuration file '{}'",
                 config_path.display()
             )
-        })?
+        })?;
+        toml_edit::de::from_str(&contents)?
     } else {
-        // Built-in default config
-        default_conf(*num_pageservers)
+        // User (likely interactive) did not provide a description of the environment, give them the default
+        NeonLocalInitConf {
+            control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())),
+            broker: NeonBroker {
+                listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(),
+            },
+            safekeepers: vec![SafekeeperConf {
+                id: DEFAULT_SAFEKEEPER_ID,
+                pg_port: DEFAULT_SAFEKEEPER_PG_PORT,
+                http_port: DEFAULT_SAFEKEEPER_HTTP_PORT,
+                ..Default::default()
+            }],
+            pageservers: (0..num_pageservers.copied().unwrap_or(1))
+                .map(|i| {
+                    let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
+                    let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
+                    let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
+                    NeonLocalInitPageserverConf {
+                        id: pageserver_id,
+                        listen_pg_addr: format!("127.0.0.1:{pg_port}"),
+                        listen_http_addr: format!("127.0.0.1:{http_port}"),
+                        pg_auth_type: AuthType::Trust,
+                        http_auth_type: AuthType::Trust,
+                        other: Default::default(),
+                    }
+                })
+                .collect(),
+            pg_distrib_dir: None,
+            neon_distrib_dir: None,
+            default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
+            storage_controller: None,
+            control_plane_compute_hook_api: None,
+        }
     };
 
-    let pageserver_config: toml_edit::Document =
-        if let Some(path) = init_match.get_one::<PathBuf>("pageserver-config") {
-            std::fs::read_to_string(path)?.parse()?
-        } else {
-            toml_edit::Document::new()
-        };
-
-    let pg_version = init_match
-        .get_one::<u32>("pg-version")
-        .copied()
-        .context("Failed to parse postgres version from the argument string")?;
-
-    let mut env =
-        LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
-    let force = init_match.get_one("force").expect("we set a default value");
-    env.init(pg_version, force)
-        .context("Failed to initialize neon repository")?;
-
-    // Create remote storage location for default LocalFs remote storage
-    std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
-
-    // Initialize pageserver, create initial tenant and timeline.
-    for ps_conf in &env.pageservers {
-        PageServerNode::from_env(&env, ps_conf)
-            .initialize(pageserver_config.clone())
-            .unwrap_or_else(|e| {
-                eprintln!("pageserver init failed: {e:?}");
-                exit(1);
-            });
-    }
-
-    Ok(env)
+    LocalEnv::init(init_conf, force)
+        .context("materialize initial neon_local environment on disk")?;
+    Ok(LocalEnv::load_config().expect("freshly written config should be loadable"))
 }
 
 /// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
@@ -1418,9 +1393,7 @@ fn cli() -> Command {
     let num_pageservers_arg = Arg::new("num-pageservers")
         .value_parser(value_parser!(u16))
         .long("num-pageservers")
-        .help("How many pageservers to create (default 1)")
-        .required(false)
-        .default_value("1");
+        .help("How many pageservers to create (default 1)");
 
     let update_catalog = Arg::new("update-catalog")
         .value_parser(value_parser!(bool))
@@ -1454,14 +1427,6 @@ fn cli() -> Command {
                         .value_parser(value_parser!(PathBuf))
                         .value_name("config")
                 )
-                .arg(
-                    Arg::new("pageserver-config")
-                        .long("pageserver-config")
-                        .required(false)
-                        .value_parser(value_parser!(PathBuf))
-                        .value_name("pageserver-config")
-                        .help("Merge the provided pageserver config into the one generated by neon_local."),
-                )
                 .arg(pg_version_arg.clone())
                 .arg(force_arg)
         )
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 7abbbce95a..0edcf1be4e 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -3,7 +3,7 @@
 //! Now it also provides init method which acts like a stub for proper installation
 //! script which will use local paths.
 
-use anyhow::{bail, ensure, Context};
+use anyhow::{bail, Context};
 
 use clap::ValueEnum;
 use postgres_backend::AuthType;
@@ -23,6 +23,8 @@ use utils::{
     id::{NodeId, TenantId, TenantTimelineId, TimelineId},
 };
 
+use crate::pageserver::PageServerNode;
+use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
 use crate::safekeeper::SafekeeperNode;
 
 pub const DEFAULT_PG_VERSION: u32 = 15;
@@ -34,7 +36,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[derive(PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
     // Base directory for all the nodes (the pageserver, safekeepers and
     // compute endpoints).
@@ -42,59 +44,99 @@ pub struct LocalEnv {
     // This is not stored in the config file. Rather, this is the path where the
     // config file itself is. It is read from the NEON_REPO_DIR env variable or
     // '.neon' if not given.
-    #[serde(skip)]
     pub base_data_dir: PathBuf,
 
     // Path to postgres distribution. It's expected that "bin", "include",
     // "lib", "share" from postgres distribution are there. If at some point
     // in time we will be able to run against vanilla postgres we may split that
     // to four separate paths and match OS-specific installation layout.
-    #[serde(default)]
     pub pg_distrib_dir: PathBuf,
 
     // Path to pageserver binary.
-    #[serde(default)]
     pub neon_distrib_dir: PathBuf,
 
     // Default tenant ID to use with the 'neon_local' command line utility, when
     // --tenant_id is not explicitly specified.
-    #[serde(default)]
     pub default_tenant_id: Option<TenantId>,
 
     // used to issue tokens during e.g pg start
-    #[serde(default)]
     pub private_key_path: PathBuf,
 
     pub broker: NeonBroker,
 
     // Configuration for the storage controller (1 per neon_local environment)
-    #[serde(default)]
     pub storage_controller: NeonStorageControllerConf,
 
     /// This Vec must always contain at least one pageserver
+    /// Populdated by [`Self::load_config`] from the individual `pageserver.toml`s.
+    /// NB: not used anymore except for informing users that they need to change their `.neon/config`.
     pub pageservers: Vec<PageServerConf>,
 
-    #[serde(default)]
     pub safekeepers: Vec<SafekeeperConf>,
 
     // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
     // be propagated into each pageserver's configuration.
-    #[serde(default)]
     pub control_plane_api: Option<Url>,
 
     // Control plane upcall API for storage controller.  If set, this will be propagated into the
     // storage controller's configuration.
-    #[serde(default)]
     pub control_plane_compute_hook_api: Option<Url>,
 
     /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
-    #[serde(default)]
     // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
     // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
     // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
+    pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
+}
+
+/// On-disk state stored in `.neon/config`.
+#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
+#[serde(default, deny_unknown_fields)]
+pub struct OnDiskConfig {
+    pub pg_distrib_dir: PathBuf,
+    pub neon_distrib_dir: PathBuf,
+    pub default_tenant_id: Option<TenantId>,
+    pub private_key_path: PathBuf,
+    pub broker: NeonBroker,
+    pub storage_controller: NeonStorageControllerConf,
+    #[serde(
+        skip_serializing,
+        deserialize_with = "fail_if_pageservers_field_specified"
+    )]
+    pub pageservers: Vec<PageServerConf>,
+    pub safekeepers: Vec<SafekeeperConf>,
+    pub control_plane_api: Option<Url>,
+    pub control_plane_compute_hook_api: Option<Url>,
     branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }
 
+fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
+where
+    D: serde::Deserializer<'de>,
+{
+    Err(serde::de::Error::custom(
+        "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \
+         Please remove the `pageservers` from your .neon/config.",
+    ))
+}
+
+/// The description of the neon_local env to be initialized by `neon_local init --config`.
+#[derive(Clone, Debug, Deserialize)]
+#[serde(deny_unknown_fields)]
+pub struct NeonLocalInitConf {
+    // TODO: do we need this? Seems unused
+    pub pg_distrib_dir: Option<PathBuf>,
+    // TODO: do we need this? Seems unused
+    pub neon_distrib_dir: Option<PathBuf>,
+    pub default_tenant_id: TenantId,
+    pub broker: NeonBroker,
+    pub storage_controller: Option<NeonStorageControllerConf>,
+    pub pageservers: Vec<NeonLocalInitPageserverConf>,
+    pub safekeepers: Vec<SafekeeperConf>,
+    pub control_plane_api: Option<Option<Url>>,
+    pub control_plane_compute_hook_api: Option<Option<Url>>,
+}
+
 /// Broker config for cluster internal communication.
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
@@ -110,6 +152,9 @@ pub struct NeonStorageControllerConf {
     /// Heartbeat timeout before marking a node offline
     #[serde(with = "humantime_serde")]
     pub max_unavailable: Duration,
+
+    /// Threshold for auto-splitting a tenant into shards
+    pub split_threshold: Option<u64>,
 }
 
 impl NeonStorageControllerConf {
@@ -122,6 +167,7 @@ impl Default for NeonStorageControllerConf {
     fn default() -> Self {
         Self {
             max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
+            split_threshold: None,
         }
     }
 }
@@ -141,24 +187,18 @@ impl NeonBroker {
     }
 }
 
+// neon_local needs to know this subset of pageserver configuration.
+// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`.
+// It can get stale if `pageserver.toml` is changed.
+// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml`
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default, deny_unknown_fields)]
 pub struct PageServerConf {
-    // node id
     pub id: NodeId,
-
-    // Pageserver connection settings
     pub listen_pg_addr: String,
     pub listen_http_addr: String,
-
-    // auth type used for the PG and HTTP ports
     pub pg_auth_type: AuthType,
     pub http_auth_type: AuthType,
-
-    pub(crate) virtual_file_io_engine: Option<String>,
-    pub(crate) get_vectored_impl: Option<String>,
-    pub(crate) get_impl: Option<String>,
-    pub(crate) validate_vectored_get: Option<bool>,
 }
 
 impl Default for PageServerConf {
@@ -169,10 +209,40 @@ impl Default for PageServerConf {
             listen_http_addr: String::new(),
             pg_auth_type: AuthType::Trust,
             http_auth_type: AuthType::Trust,
-            virtual_file_io_engine: None,
-            get_vectored_impl: None,
-            get_impl: None,
-            validate_vectored_get: None,
+        }
+    }
+}
+
+/// The toml that can be passed to `neon_local init --config`.
+/// This is a subset of the `pageserver.toml` configuration.
+// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
+#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
+pub struct NeonLocalInitPageserverConf {
+    pub id: NodeId,
+    pub listen_pg_addr: String,
+    pub listen_http_addr: String,
+    pub pg_auth_type: AuthType,
+    pub http_auth_type: AuthType,
+    #[serde(flatten)]
+    pub other: HashMap<String, toml::Value>,
+}
+
+impl From<&NeonLocalInitPageserverConf> for PageServerConf {
+    fn from(conf: &NeonLocalInitPageserverConf) -> Self {
+        let NeonLocalInitPageserverConf {
+            id,
+            listen_pg_addr,
+            listen_http_addr,
+            pg_auth_type,
+            http_auth_type,
+            other: _,
+        } = conf;
+        Self {
+            id: *id,
+            listen_pg_addr: listen_pg_addr.clone(),
+            listen_http_addr: listen_http_addr.clone(),
+            pg_auth_type: *pg_auth_type,
+            http_auth_type: *http_auth_type,
         }
     }
 }
@@ -360,44 +430,7 @@ impl LocalEnv {
             .collect()
     }
 
-    /// Create a LocalEnv from a config file.
-    ///
-    /// Unlike 'load_config', this function fills in any defaults that are missing
-    /// from the config file.
-    pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
-        let mut env: LocalEnv = toml::from_str(toml)?;
-
-        // Find postgres binaries.
-        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
-        // Note that later in the code we assume, that distrib dirs follow the same pattern
-        // for all postgres versions.
-        if env.pg_distrib_dir == Path::new("") {
-            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
-                env.pg_distrib_dir = postgres_bin.into();
-            } else {
-                let cwd = env::current_dir()?;
-                env.pg_distrib_dir = cwd.join("pg_install")
-            }
-        }
-
-        // Find neon binaries.
-        if env.neon_distrib_dir == Path::new("") {
-            env::current_exe()?
-                .parent()
-                .unwrap()
-                .clone_into(&mut env.neon_distrib_dir);
-        }
-
-        if env.pageservers.is_empty() {
-            anyhow::bail!("Configuration must contain at least one pageserver");
-        }
-
-        env.base_data_dir = base_path();
-
-        Ok(env)
-    }
-
-    /// Locate and load config
+    ///  Construct `Self` from on-disk state.
     pub fn load_config() -> anyhow::Result<Self> {
         let repopath = base_path();
 
@@ -411,38 +444,129 @@ impl LocalEnv {
         // TODO: check that it looks like a neon repository
 
         // load and parse file
-        let config = fs::read_to_string(repopath.join("config"))?;
-        let mut env: LocalEnv = toml::from_str(config.as_str())?;
+        let config_file_contents = fs::read_to_string(repopath.join("config"))?;
+        let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?;
+        let mut env = {
+            let OnDiskConfig {
+                pg_distrib_dir,
+                neon_distrib_dir,
+                default_tenant_id,
+                private_key_path,
+                broker,
+                storage_controller,
+                pageservers,
+                safekeepers,
+                control_plane_api,
+                control_plane_compute_hook_api,
+                branch_name_mappings,
+            } = on_disk_config;
+            LocalEnv {
+                base_data_dir: repopath.clone(),
+                pg_distrib_dir,
+                neon_distrib_dir,
+                default_tenant_id,
+                private_key_path,
+                broker,
+                storage_controller,
+                pageservers,
+                safekeepers,
+                control_plane_api,
+                control_plane_compute_hook_api,
+                branch_name_mappings,
+            }
+        };
 
-        env.base_data_dir = repopath;
+        // The source of truth for pageserver configuration is the pageserver.toml.
+        assert!(
+            env.pageservers.is_empty(),
+            "we ensure this during deserialization"
+        );
+        env.pageservers = {
+            let iter = std::fs::read_dir(&repopath).context("open dir")?;
+            let mut pageservers = Vec::new();
+            for res in iter {
+                let dentry = res?;
+                const PREFIX: &str = "pageserver_";
+                let dentry_name = dentry
+                    .file_name()
+                    .into_string()
+                    .ok()
+                    .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path()))
+                    .unwrap();
+                if !dentry_name.starts_with(PREFIX) {
+                    continue;
+                }
+                if !dentry.file_type().context("determine file type")?.is_dir() {
+                    anyhow::bail!("expected a directory, got {:?}", dentry.path());
+                }
+                let id = dentry_name[PREFIX.len()..]
+                    .parse::<NodeId>()
+                    .with_context(|| format!("parse id from {:?}", dentry.path()))?;
+                // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
+                #[derive(serde::Serialize, serde::Deserialize)]
+                // (allow unknown fields, unlike PageServerConf)
+                struct PageserverConfigTomlSubset {
+                    id: NodeId,
+                    listen_pg_addr: String,
+                    listen_http_addr: String,
+                    pg_auth_type: AuthType,
+                    http_auth_type: AuthType,
+                }
+                let config_toml_path = dentry.path().join("pageserver.toml");
+                let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
+                    &std::fs::read_to_string(&config_toml_path)
+                        .with_context(|| format!("read {:?}", config_toml_path))?,
+                )
+                .context("parse pageserver.toml")?;
+                let PageserverConfigTomlSubset {
+                    id: config_toml_id,
+                    listen_pg_addr,
+                    listen_http_addr,
+                    pg_auth_type,
+                    http_auth_type,
+                } = config_toml;
+                let conf = PageServerConf {
+                    id: {
+                        anyhow::ensure!(
+                            config_toml_id == id,
+                            "id mismatch: config_toml.id={config_toml_id} id={id}",
+                        );
+                        id
+                    },
+                    listen_pg_addr,
+                    listen_http_addr,
+                    pg_auth_type,
+                    http_auth_type,
+                };
+                pageservers.push(conf);
+            }
+            pageservers
+        };
 
         Ok(env)
     }
 
-    pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
-        // Currently, the user first passes a config file with 'neon_local init --config=<path>'
-        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
-        // to .neon/config. TODO: We lose any formatting and comments along the way, which is
-        // a bit sad.
-        let mut conf_content = r#"# This file describes a local deployment of the page server
-# and safekeeeper node. It is read by the 'neon_local' command-line
-# utility.
-"#
-        .to_string();
-
-        // Convert the LocalEnv to a toml file.
-        //
-        // This could be as simple as this:
-        //
-        // conf_content += &toml::to_string_pretty(env)?;
-        //
-        // But it results in a "values must be emitted before tables". I'm not sure
-        // why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
-        // Maybe rust reorders the fields to squeeze avoid padding or something?
-        // In any case, converting to toml::Value first, and serializing that, works.
-        // See https://github.com/alexcrichton/toml-rs/issues/142
-        conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
+    pub fn persist_config(&self) -> anyhow::Result<()> {
+        Self::persist_config_impl(
+            &self.base_data_dir,
+            &OnDiskConfig {
+                pg_distrib_dir: self.pg_distrib_dir.clone(),
+                neon_distrib_dir: self.neon_distrib_dir.clone(),
+                default_tenant_id: self.default_tenant_id,
+                private_key_path: self.private_key_path.clone(),
+                broker: self.broker.clone(),
+                storage_controller: self.storage_controller.clone(),
+                pageservers: vec![], // it's skip_serializing anyway
+                safekeepers: self.safekeepers.clone(),
+                control_plane_api: self.control_plane_api.clone(),
+                control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
+                branch_name_mappings: self.branch_name_mappings.clone(),
+            },
+        )
+    }
 
+    pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> {
+        let conf_content = &toml::to_string_pretty(config)?;
         let target_config_path = base_path.join("config");
         fs::write(&target_config_path, conf_content).with_context(|| {
             format!(
@@ -467,17 +591,13 @@ impl LocalEnv {
         }
     }
 
-    //
-    // Initialize a new Neon repository
-    //
-    pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
-        // check if config already exists
-        let base_path = &self.base_data_dir;
-        ensure!(
-            base_path != Path::new(""),
-            "repository base path is missing"
-        );
+    /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
+    pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
+        let base_path = base_path();
+        assert_ne!(base_path, Path::new(""));
+        let base_path = &base_path;
 
+        // create base_path dir
         if base_path.exists() {
             match force {
                 InitForceMode::MustNotExist => {
@@ -509,74 +629,96 @@ impl LocalEnv {
                 }
             }
         }
-
-        if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
-            bail!(
-                "Can't find postgres binary at {}",
-                self.pg_bin_dir(pg_version)?.display()
-            );
-        }
-        for binary in ["pageserver", "safekeeper"] {
-            if !self.neon_distrib_dir.join(binary).exists() {
-                bail!(
-                    "Can't find binary '{binary}' in neon distrib dir '{}'",
-                    self.neon_distrib_dir.display()
-                );
-            }
-        }
-
         if !base_path.exists() {
             fs::create_dir(base_path)?;
         }
 
+        let NeonLocalInitConf {
+            pg_distrib_dir,
+            neon_distrib_dir,
+            default_tenant_id,
+            broker,
+            storage_controller,
+            pageservers,
+            safekeepers,
+            control_plane_api,
+            control_plane_compute_hook_api,
+        } = conf;
+
+        // Find postgres binaries.
+        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
+        // Note that later in the code we assume, that distrib dirs follow the same pattern
+        // for all postgres versions.
+        let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| {
+            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
+                postgres_bin.into()
+            } else {
+                let cwd = env::current_dir().unwrap();
+                cwd.join("pg_install")
+            }
+        });
+
+        // Find neon binaries.
+        let neon_distrib_dir = neon_distrib_dir
+            .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned());
+
         // Generate keypair for JWT.
         //
         // The keypair is only needed if authentication is enabled in any of the
         // components. For convenience, we generate the keypair even if authentication
         // is not enabled, so that you can easily enable it after the initialization
-        // step. However, if the key generation fails, we treat it as non-fatal if
-        // authentication was not enabled.
-        if self.private_key_path == PathBuf::new() {
-            match generate_auth_keys(
-                base_path.join("auth_private_key.pem").as_path(),
-                base_path.join("auth_public_key.pem").as_path(),
-            ) {
-                Ok(()) => {
-                    self.private_key_path = PathBuf::from("auth_private_key.pem");
-                }
-                Err(e) => {
-                    if !self.auth_keys_needed() {
-                        eprintln!("Could not generate keypair for JWT authentication: {e}");
-                        eprintln!("Continuing anyway because authentication was not enabled");
-                        self.private_key_path = PathBuf::from("auth_private_key.pem");
-                    } else {
-                        return Err(e);
-                    }
-                }
-            }
+        // step.
+        generate_auth_keys(
+            base_path.join("auth_private_key.pem").as_path(),
+            base_path.join("auth_public_key.pem").as_path(),
+        )
+        .context("generate auth keys")?;
+        let private_key_path = PathBuf::from("auth_private_key.pem");
+
+        // create the runtime type because the remaining initialization code below needs
+        // a LocalEnv instance op operation
+        // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state
+        let env = LocalEnv {
+            base_data_dir: base_path.clone(),
+            pg_distrib_dir,
+            neon_distrib_dir,
+            default_tenant_id: Some(default_tenant_id),
+            private_key_path,
+            broker,
+            storage_controller: storage_controller.unwrap_or_default(),
+            pageservers: pageservers.iter().map(Into::into).collect(),
+            safekeepers,
+            control_plane_api: control_plane_api.unwrap_or_default(),
+            control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
+            branch_name_mappings: Default::default(),
+        };
+
+        // create endpoints dir
+        fs::create_dir_all(env.endpoints_path())?;
+
+        // create safekeeper dirs
+        for safekeeper in &env.safekeepers {
+            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
         }
 
-        fs::create_dir_all(self.endpoints_path())?;
-
-        for safekeeper in &self.safekeepers {
-            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
+        // initialize pageserver state
+        for (i, ps) in pageservers.into_iter().enumerate() {
+            let runtime_ps = &env.pageservers[i];
+            assert_eq!(&PageServerConf::from(&ps), runtime_ps);
+            fs::create_dir(env.pageserver_data_dir(ps.id))?;
+            PageServerNode::from_env(&env, runtime_ps)
+                .initialize(ps)
+                .context("pageserver init failed")?;
         }
 
-        for ps in &self.pageservers {
-            fs::create_dir(self.pageserver_data_dir(ps.id))?;
-        }
+        // setup remote remote location for default LocalFs remote storage
+        std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
 
-        self.persist_config(base_path)
-    }
-
-    fn auth_keys_needed(&self) -> bool {
-        self.pageservers.iter().any(|ps| {
-            ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
-        }) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
+        env.persist_config()
     }
 }
 
-fn base_path() -> PathBuf {
+pub fn base_path() -> PathBuf {
     match std::env::var_os("NEON_REPO_DIR") {
         Some(val) => PathBuf::from(val),
         None => PathBuf::from(".neon"),
@@ -619,31 +761,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
     }
     Ok(())
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn simple_conf_parsing() {
-        let simple_conf_toml = include_str!("../simple.conf");
-        let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
-        assert!(
-            simple_conf_parse_result.is_ok(),
-            "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
-        );
-
-        let string_to_replace = "listen_addr = '127.0.0.1:50051'";
-        let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
-        let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
-        assert!(
-            spoiled_url_toml.contains(spoiled_url_str),
-            "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
-        );
-        let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
-        assert!(
-            spoiled_url_parse_result.is_err(),
-            "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
-        );
-    }
-}
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 6046c93bad..5a84763697 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -30,7 +30,7 @@ use utils::{
     lsn::Lsn,
 };
 
-use crate::local_env::PageServerConf;
+use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf};
 use crate::{background_process, local_env::LocalEnv};
 
 /// Directory within .neon which will be used by default for LocalFs remote storage.
@@ -76,9 +76,11 @@ impl PageServerNode {
 
     fn pageserver_init_make_toml(
         &self,
-        cli_overrides: toml_edit::Document,
+        conf: NeonLocalInitPageserverConf,
     ) -> anyhow::Result<toml_edit::Document> {
-        // TODO: this is a legacy code, it should be refactored to use toml_edit directly.
+        assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
+
+        // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
 
         // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
         let pg_distrib_dir_param = format!(
@@ -86,61 +88,9 @@ impl PageServerNode {
             self.env.pg_distrib_dir_raw().display()
         );
 
-        let PageServerConf {
-            id,
-            listen_pg_addr,
-            listen_http_addr,
-            pg_auth_type,
-            http_auth_type,
-            virtual_file_io_engine,
-            get_vectored_impl,
-            get_impl,
-            validate_vectored_get,
-        } = &self.conf;
-
-        let id = format!("id={}", id);
-
-        let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
-        let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
-
-        let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
-        let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
-        let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
-            format!("virtual_file_io_engine='{virtual_file_io_engine}'")
-        } else {
-            String::new()
-        };
-        let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
-            format!("get_vectored_impl='{get_vectored_impl}'")
-        } else {
-            String::new()
-        };
-        let get_impl = if let Some(get_impl) = get_impl {
-            format!("get_impl='{get_impl}'")
-        } else {
-            String::new()
-        };
-        let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get {
-            format!("validate_vectored_get={validate_vectored_get}")
-        } else {
-            String::new()
-        };
-
         let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
 
-        let mut overrides = vec![
-            id,
-            pg_distrib_dir_param,
-            http_auth_type_param,
-            pg_auth_type_param,
-            listen_http_addr_param,
-            listen_pg_addr_param,
-            broker_endpoint_param,
-            virtual_file_io_engine,
-            get_vectored_impl,
-            get_impl,
-            validate_vectored_get,
-        ];
+        let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param];
 
         if let Some(control_plane_api) = &self.env.control_plane_api {
             overrides.push(format!(
@@ -150,7 +100,7 @@ impl PageServerNode {
 
             // Storage controller uses the same auth as pageserver: if JWT is enabled
             // for us, we will also need it to talk to them.
-            if matches!(http_auth_type, AuthType::NeonJWT) {
+            if matches!(conf.http_auth_type, AuthType::NeonJWT) {
                 let jwt_token = self
                     .env
                     .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -159,20 +109,23 @@ impl PageServerNode {
             }
         }
 
-        if !cli_overrides.contains_key("remote_storage") {
+        if !conf.other.contains_key("remote_storage") {
             overrides.push(format!(
                 "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
             ));
         }
 
-        if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
+        if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust {
             // Keys are generated in the toplevel repo dir, pageservers' workdirs
             // are one level below that, so refer to keys with ../
             overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
         }
 
         // Apply the user-provided overrides
-        overrides.push(cli_overrides.to_string());
+        overrides.push(
+            toml_edit::ser::to_string_pretty(&conf)
+                .expect("we deserialized this from toml earlier"),
+        );
 
         // Turn `overrides` into a toml document.
         // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
@@ -188,8 +141,8 @@ impl PageServerNode {
     }
 
     /// Initializes a pageserver node by creating its config with the overrides provided.
-    pub fn initialize(&self, config_overrides: toml_edit::Document) -> anyhow::Result<()> {
-        self.pageserver_init(config_overrides)
+    pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
+        self.pageserver_init(conf)
             .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
     }
 
@@ -209,7 +162,7 @@ impl PageServerNode {
         self.start_node().await
     }
 
-    fn pageserver_init(&self, cli_overrides: toml_edit::Document) -> anyhow::Result<()> {
+    fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
         let datadir = self.repo_path();
         let node_id = self.conf.id;
         println!(
@@ -221,7 +174,7 @@ impl PageServerNode {
         io::stdout().flush()?;
 
         let config = self
-            .pageserver_init_make_toml(cli_overrides)
+            .pageserver_init_make_toml(conf)
             .context("make pageserver toml")?;
         let config_file_path = datadir.join("pageserver.toml");
         let mut config_file = std::fs::OpenOptions::new()
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index f1c43f4036..b6b7ea7762 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -243,9 +243,13 @@ impl StorageController {
                 anyhow::bail!("initdb failed with status {status}");
             }
 
+            // Write a minimal config file:
+            // - Specify the port, since this is chosen dynamically
+            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
+            //   the storage controller we don't want a slow local disk to interfere with that.
             tokio::fs::write(
                 &pg_data_path.join("postgresql.conf"),
-                format!("port = {}", self.postgres_port),
+                format!("port = {}\nfsync=off\n", self.postgres_port),
             )
             .await?;
         };
@@ -305,6 +309,10 @@ impl StorageController {
             ));
         }
 
+        if let Some(split_threshold) = self.config.split_threshold.as_ref() {
+            args.push(format!("--split-threshold={split_threshold}"))
+        }
+
         background_process::start_process(
             COMMAND,
             &self.env.base_data_dir,
diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile
index f1b1986072..974dcd7f03 100644
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -1,4 +1,4 @@
-ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG REPOSITORY=neondatabase
 ARG COMPUTE_IMAGE=compute-node-v14
 ARG TAG=latest
 
diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh
index e18b0f9176..062fc6fc92 100755
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -8,8 +8,6 @@
 # Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
 # to verify custom image builds (e.g pre-published ones).
 
-# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
-
 set -eux -o pipefail
 
 SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index fd0c90d447..d05d625b0a 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -3,7 +3,7 @@
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize, Serializer};
 
-use crate::spec::ComputeSpec;
+use crate::spec::{ComputeSpec, Database, Role};
 
 #[derive(Serialize, Debug, Deserialize)]
 pub struct GenericAPIError {
@@ -113,6 +113,12 @@ pub struct ComputeMetrics {
     pub total_ext_download_size: u64,
 }
 
+#[derive(Clone, Debug, Default, Serialize)]
+pub struct CatalogObjects {
+    pub roles: Vec<Role>,
+    pub databases: Vec<Database>,
+}
+
 /// Response of the `/computes/{compute_id}/spec` control-plane API.
 /// This is not actually a compute API response, so consider moving
 /// to a different place.
diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index c0c4710a00..12c6dc3a6d 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -307,7 +307,7 @@ impl KeySpace {
     }
 
     /// Merge another keyspace into the current one.
-    /// Note: the keyspaces must not ovelap (enforced via assertions)
+    /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
     pub fn merge(&mut self, other: &KeySpace) {
         let all_ranges = self
             .ranges
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 1df5820fb9..9311dab33c 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,7 +9,7 @@ use std::{
     collections::HashMap,
     io::{BufRead, Read},
     num::{NonZeroU64, NonZeroUsize},
-    str::FromStr,
+    sync::atomic::AtomicUsize,
     time::{Duration, SystemTime},
 };
 
@@ -161,6 +161,22 @@ impl std::fmt::Debug for TenantState {
     }
 }
 
+/// A temporary lease to a specific lsn inside a timeline.
+/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
+#[serde_as]
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct LsnLease {
+    #[serde_as(as = "SystemTimeAsRfc3339Millis")]
+    pub valid_until: SystemTime,
+}
+
+serde_with::serde_conv!(
+    SystemTimeAsRfc3339Millis,
+    SystemTime,
+    |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
+    |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
+);
+
 /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum ActivatingFrom {
@@ -289,7 +305,7 @@ pub struct TenantConfig {
     pub compaction_period: Option<String>,
     pub compaction_threshold: Option<usize>,
     // defer parsing compaction_algorithm, like eviction_policy
-    pub compaction_algorithm: Option<CompactionAlgorithm>,
+    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
     pub gc_horizon: Option<u64>,
     pub gc_period: Option<String>,
     pub image_creation_threshold: Option<usize>,
@@ -308,28 +324,100 @@ pub struct TenantConfig {
     pub switch_aux_file_policy: Option<AuxFilePolicy>,
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
+/// tenant config. When the first aux file written, the policy will be persisted in the
+/// `index_part.json` file and has a limited migration path.
+///
+/// Currently, we only allow the following migration path:
+///
+/// Unset -> V1
+///       -> V2
+///       -> CrossValidation -> V2
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
 pub enum AuxFilePolicy {
+    /// V1 aux file policy: store everything in AUX_FILE_KEY
+    #[strum(ascii_case_insensitive)]
     V1,
+    /// V2 aux file policy: store in the AUX_FILE keyspace
+    #[strum(ascii_case_insensitive)]
     V2,
+    /// Cross validation runs both formats on the write path and does validation
+    /// on the read path.
+    #[strum(ascii_case_insensitive)]
     CrossValidation,
 }
 
-impl FromStr for AuxFilePolicy {
-    type Err = anyhow::Error;
+impl AuxFilePolicy {
+    pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
+        matches!(
+            (from, to),
+            (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
+        )
+    }
 
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let s = s.to_lowercase();
-        if s == "v1" {
-            Ok(Self::V1)
-        } else if s == "v2" {
-            Ok(Self::V2)
-        } else if s == "crossvalidation" || s == "cross_validation" {
-            Ok(Self::CrossValidation)
-        } else {
-            anyhow::bail!("cannot parse {} to aux file policy", s)
+    /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
+    pub fn default_tenant_config() -> Self {
+        Self::V1
+    }
+}
+
+/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
+pub struct AtomicAuxFilePolicy(AtomicUsize);
+
+impl AtomicAuxFilePolicy {
+    pub fn new(policy: Option<AuxFilePolicy>) -> Self {
+        Self(AtomicUsize::new(
+            policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
+        ))
+    }
+
+    pub fn load(&self) -> Option<AuxFilePolicy> {
+        match self.0.load(std::sync::atomic::Ordering::Acquire) {
+            0 => None,
+            other => Some(AuxFilePolicy::from_usize(other)),
         }
     }
+
+    pub fn store(&self, policy: Option<AuxFilePolicy>) {
+        self.0.store(
+            policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
+            std::sync::atomic::Ordering::Release,
+        );
+    }
+}
+
+impl AuxFilePolicy {
+    pub fn to_usize(self) -> usize {
+        match self {
+            Self::V1 => 1,
+            Self::CrossValidation => 2,
+            Self::V2 => 3,
+        }
+    }
+
+    pub fn try_from_usize(this: usize) -> Option<Self> {
+        match this {
+            1 => Some(Self::V1),
+            2 => Some(Self::CrossValidation),
+            3 => Some(Self::V2),
+            _ => None,
+        }
+    }
+
+    pub fn from_usize(this: usize) -> Self {
+        Self::try_from_usize(this).unwrap()
+    }
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -350,13 +438,28 @@ impl EvictionPolicy {
     }
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "kind")]
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
 pub enum CompactionAlgorithm {
     Legacy,
     Tiered,
 }
 
+#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
+pub struct CompactionAlgorithmSettings {
+    pub kind: CompactionAlgorithm,
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
     #[serde(with = "humantime_serde")]
@@ -604,6 +707,9 @@ pub struct TimelineInfo {
     pub state: TimelineState,
 
     pub walreceiver_status: String,
+
+    /// The last aux file policy being used on this timeline
+    pub last_aux_file_policy: Option<AuxFilePolicy>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -710,6 +816,8 @@ pub enum HistoricLayerInfo {
         lsn_end: Lsn,
         remote: bool,
         access_stats: LayerAccessStats,
+
+        l0: bool,
     },
     Image {
         layer_file_name: String,
@@ -745,6 +853,16 @@ impl HistoricLayerInfo {
         };
         *field = value;
     }
+    pub fn layer_file_size(&self) -> u64 {
+        match self {
+            HistoricLayerInfo::Delta {
+                layer_file_size, ..
+            } => *layer_file_size,
+            HistoricLayerInfo::Image {
+                layer_file_size, ..
+            } => *layer_file_size,
+        }
+    }
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -752,6 +870,16 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
     pub max_concurrent_downloads: NonZeroUsize,
 }
 
+#[derive(Debug, Serialize, Deserialize)]
+pub struct IngestAuxFilesRequest {
+    pub aux_files: HashMap<String, String>,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct ListAuxFilesRequest {
+    pub lsn: Lsn,
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct DownloadRemoteLayersTaskInfo {
     pub task_id: String,
@@ -776,9 +904,6 @@ pub struct TimelineGcRequest {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalRedoManagerProcessStatus {
     pub pid: u32,
-    /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
-    /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
-    pub kind: Cow<'static, str>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -817,6 +942,55 @@ pub struct TenantScanRemoteStorageResponse {
     pub shards: Vec<TenantScanRemoteStorageShard>,
 }
 
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum TenantSorting {
+    ResidentSize,
+    MaxLogicalSize,
+}
+
+impl Default for TenantSorting {
+    fn default() -> Self {
+        Self::ResidentSize
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct TopTenantShardsRequest {
+    // How would you like to sort the tenants?
+    pub order_by: TenantSorting,
+
+    // How many results?
+    pub limit: usize,
+
+    // Omit tenants with more than this many shards (e.g. if this is the max number of shards
+    // that the caller would ever split to)
+    pub where_shards_lt: Option<ShardCount>,
+
+    // Omit tenants where the ordering metric is less than this (this is an optimization to
+    // let us quickly exclude numerous tiny shards)
+    pub where_gt: Option<u64>,
+}
+
+#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
+pub struct TopTenantShardItem {
+    pub id: TenantShardId,
+
+    /// Total size of layers on local disk for all timelines in this tenant
+    pub resident_size: u64,
+
+    /// Total size of layers in remote storage for all timelines in this tenant
+    pub physical_size: u64,
+
+    /// The largest logical size of a timeline within this tenant
+    pub max_logical_size: u64,
+}
+
+#[derive(Serialize, Deserialize, Debug, Default)]
+pub struct TopTenantShardsResponse {
+    pub shards: Vec<TopTenantShardItem>,
+}
+
 pub mod virtual_file {
     #[derive(
         Copy,
@@ -1242,6 +1416,7 @@ impl PagestreamBeMessage {
 #[cfg(test)]
 mod tests {
     use serde_json::json;
+    use std::str::FromStr;
 
     use super::*;
 
@@ -1449,4 +1624,69 @@ mod tests {
             assert_eq!(actual, expected, "example on {line}");
         }
     }
+
+    #[test]
+    fn test_aux_file_migration_path() {
+        assert!(AuxFilePolicy::is_valid_migration_path(
+            None,
+            AuxFilePolicy::V1
+        ));
+        assert!(AuxFilePolicy::is_valid_migration_path(
+            None,
+            AuxFilePolicy::V2
+        ));
+        assert!(AuxFilePolicy::is_valid_migration_path(
+            None,
+            AuxFilePolicy::CrossValidation
+        ));
+        // Self-migration is not a valid migration path, and the caller should handle it by itself.
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V1),
+            AuxFilePolicy::V1
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V2),
+            AuxFilePolicy::V2
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::CrossValidation),
+            AuxFilePolicy::CrossValidation
+        ));
+        // Migrations not allowed
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::CrossValidation),
+            AuxFilePolicy::V1
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V1),
+            AuxFilePolicy::V2
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V2),
+            AuxFilePolicy::V1
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V2),
+            AuxFilePolicy::CrossValidation
+        ));
+        assert!(!AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::V1),
+            AuxFilePolicy::CrossValidation
+        ));
+        // Migrations allowed
+        assert!(AuxFilePolicy::is_valid_migration_path(
+            Some(AuxFilePolicy::CrossValidation),
+            AuxFilePolicy::V2
+        ));
+    }
+
+    #[test]
+    fn test_aux_parse() {
+        assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
+        assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
+        assert_eq!(
+            AuxFilePolicy::from_str("cross-validation").unwrap(),
+            AuxFilePolicy::CrossValidation
+        );
+    }
 }
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index ff6d3d91b6..1c05a01926 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -125,7 +125,7 @@ impl ShardCount {
 
     /// `v` may be zero, or the number of shards in the tenant.  `v` is what
     /// [`Self::literal`] would return.
-    pub fn new(val: u8) -> Self {
+    pub const fn new(val: u8) -> Self {
         Self(val)
     }
 }
@@ -559,6 +559,14 @@ impl ShardIdentity {
         }
     }
 
+    /// Obtains the shard number and count combined into a `ShardIndex`.
+    pub fn shard_index(&self) -> ShardIndex {
+        ShardIndex {
+            shard_count: self.count,
+            shard_number: self.number,
+        }
+    }
+
     pub fn shard_slug(&self) -> String {
         if self.count > ShardCount(0) {
             format!("-{:02x}{:02x}", self.number.0, self.count.0)
diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 260018ad89..6c41b7f347 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -820,10 +820,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         Ok(ProcessMsgResult::Continue)
     }
 
-    /// Log as info/error result of handling COPY stream and send back
-    /// ErrorResponse if that makes sense. Shutdown the stream if we got
-    /// Terminate. TODO: transition into waiting for Sync msg if we initiate the
-    /// close.
+    /// - Log as info/error result of handling COPY stream and send back
+    ///   ErrorResponse if that makes sense.
+    /// - Shutdown the stream if we got Terminate.
+    /// - Then close the connection because we don't handle exiting from COPY
+    ///   stream normally.
     pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
         use CopyStreamHandlerEnd::*;
 
@@ -849,10 +850,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
             }
         }
 
-        if let Terminate = &end {
-            self.state = ProtoState::Closed;
-        }
-
         let err_to_send_and_errcode = match &end {
             ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
             Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
@@ -882,6 +879,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                 error!("failed to send ErrorResponse: {}", ee);
             }
         }
+
+        // Proper COPY stream finishing to continue using the connection is not
+        // implemented at the server side (we don't need it so far). To prevent
+        // further usages of the connection, close it.
+        self.framed.shutdown().await.ok();
+        self.state = ProtoState::Closed;
     }
 }
 
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index 262068cbda..6052f04d11 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
             // Is there enough space on the page for another logical message and an
             // XLOG_SWITCH? If not, start over.
             let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
-            if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
+            if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
                 continue;
             }
 
@@ -373,31 +373,29 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
                 "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
                 &[&(repeats as i32)],
             )?;
-            break;
-        }
-        info!(
-            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
-            client.pg_current_wal_insert_lsn()?,
-            XLOG_SIZE_OF_XLOG_RECORD
-        );
+            info!(
+                "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
+                client.pg_current_wal_insert_lsn()?,
+                XLOG_SIZE_OF_XLOG_RECORD
+            );
 
-        // Emit the XLOG_SWITCH
-        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
-        let next_segment = PgLsn::from(0x0200_0000);
-        ensure!(
-            xlog_switch_record_end < next_segment,
-            "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
-            xlog_switch_record_end,
-            next_segment
-        );
-        ensure!(
-            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
-            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
-            xlog_switch_record_end,
-            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
-        );
-        Ok(vec![before_xlog_switch, xlog_switch_record_end])
+            // Emit the XLOG_SWITCH
+            let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
+            let xlog_switch_record_end: PgLsn =
+                client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+
+            if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
+                != XLOG_SIZE_OF_XLOG_SHORT_PHD
+            {
+                warn!(
+                    "XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
+                    xlog_switch_record_end,
+                    u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
+                );
+                continue;
+            }
+            return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
+        }
     }
 }
 
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 24c1248304..220d4ef115 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -29,6 +29,7 @@ use http_types::{StatusCode, Url};
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 
+use crate::RemoteStorageActivity;
 use crate::{
     error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
     DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
@@ -525,6 +526,10 @@ impl RemoteStorage for AzureBlobStorage {
         // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
         Err(TimeTravelError::Unimplemented)
     }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.concurrency_limiter.activity()
+    }
 }
 
 pin_project_lite::pin_project! {
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 708662f20f..f024021507 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -263,6 +263,17 @@ pub trait RemoteStorage: Send + Sync + 'static {
         done_if_after: SystemTime,
         cancel: &CancellationToken,
     ) -> Result<(), TimeTravelError>;
+
+    /// Query how busy we currently are: may be used by callers which wish to politely
+    /// back off if there are already a lot of operations underway.
+    fn activity(&self) -> RemoteStorageActivity;
+}
+
+pub struct RemoteStorageActivity {
+    pub read_available: usize,
+    pub read_total: usize,
+    pub write_available: usize,
+    pub write_total: usize,
 }
 
 /// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -444,6 +455,15 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
             }
         }
     }
+
+    pub fn activity(&self) -> RemoteStorageActivity {
+        match self {
+            Self::LocalFs(s) => s.activity(),
+            Self::AwsS3(s) => s.activity(),
+            Self::AzureBlob(s) => s.activity(),
+            Self::Unreliable(s) => s.activity(),
+        }
+    }
 }
 
 impl GenericRemoteStorage {
@@ -774,6 +794,9 @@ struct ConcurrencyLimiter {
     // The helps to ensure we don't exceed the thresholds.
     write: Arc<Semaphore>,
     read: Arc<Semaphore>,
+
+    write_total: usize,
+    read_total: usize,
 }
 
 impl ConcurrencyLimiter {
@@ -802,10 +825,21 @@ impl ConcurrencyLimiter {
         Arc::clone(self.for_kind(kind)).acquire_owned().await
     }
 
+    fn activity(&self) -> RemoteStorageActivity {
+        RemoteStorageActivity {
+            read_available: self.read.available_permits(),
+            read_total: self.read_total,
+            write_available: self.write.available_permits(),
+            write_total: self.write_total,
+        }
+    }
+
     fn new(limit: usize) -> ConcurrencyLimiter {
         Self {
             read: Arc::new(Semaphore::new(limit)),
             write: Arc::new(Semaphore::new(limit)),
+            read_total: limit,
+            write_total: limit,
         }
     }
 }
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index 1f7bcfc982..f12f6590a3 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use utils::crashsafe::path_with_suffix_extension;
 
 use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
+    TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
 use super::{RemoteStorage, StorageMetadata};
@@ -605,6 +605,16 @@ impl RemoteStorage for LocalFs {
     ) -> Result<(), TimeTravelError> {
         Err(TimeTravelError::Unimplemented)
     }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
+        RemoteStorageActivity {
+            read_available: 16,
+            read_total: 16,
+            write_available: 16,
+            write_total: 16,
+        }
+    }
 }
 
 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index c0b89cee2a..0f6772b274 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -27,7 +27,7 @@ use aws_config::{
 };
 use aws_credential_types::provider::SharedCredentialsProvider;
 use aws_sdk_s3::{
-    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
+    config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
     error::SdkError,
     operation::get_object::GetObjectError,
     types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
@@ -47,8 +47,8 @@ use utils::backoff;
 use super::StorageMetadata;
 use crate::{
     error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
-    Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
-    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
+    TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
 pub(super) mod metrics;
@@ -75,13 +75,13 @@ struct GetObjectRequest {
 }
 impl S3Bucket {
     /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
-    pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
+    pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
         tracing::debug!(
             "Creating s3 remote storage for S3 bucket {}",
-            aws_config.bucket_name
+            remote_storage_config.bucket_name
         );
 
-        let region = Some(Region::new(aws_config.bucket_region.clone()));
+        let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
 
         let provider_conf = ProviderConfig::without_region().with_region(region.clone());
 
@@ -113,6 +113,38 @@ impl S3Bucket {
         // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
         let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
 
+        let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults(
+            #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
+            BehaviorVersion::v2023_11_09(),
+        )
+        .region(region)
+        .identity_cache(IdentityCache::lazy().build())
+        .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
+        .sleep_impl(SharedAsyncSleep::from(sleep_impl));
+
+        let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
+            s.spawn(|| {
+                // TODO: make this function async.
+                tokio::runtime::Builder::new_current_thread()
+                    .enable_all()
+                    .build()
+                    .unwrap()
+                    .block_on(sdk_config_loader.load())
+            })
+            .join()
+            .unwrap()
+        });
+
+        let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config);
+
+        // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions.
+        // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future)
+        if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() {
+            s3_config_builder = s3_config_builder
+                .endpoint_url(custom_endpoint)
+                .force_path_style(true);
+        }
+
         // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
         // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
         // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
@@ -120,42 +152,36 @@ impl S3Bucket {
         retry_config
             .set_max_attempts(Some(1))
             .set_mode(Some(RetryMode::Adaptive));
+        s3_config_builder = s3_config_builder.retry_config(retry_config.build());
 
-        let mut config_builder = Builder::default()
-            .behavior_version(BehaviorVersion::v2023_11_09())
-            .region(region)
-            .identity_cache(IdentityCache::lazy().build())
-            .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
-            .retry_config(retry_config.build())
-            .sleep_impl(SharedAsyncSleep::from(sleep_impl));
+        let s3_config = s3_config_builder.build();
+        let client = aws_sdk_s3::Client::from_conf(s3_config);
 
-        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
-            config_builder = config_builder
-                .endpoint_url(custom_endpoint)
-                .force_path_style(true);
-        }
+        let prefix_in_bucket = remote_storage_config
+            .prefix_in_bucket
+            .as_deref()
+            .map(|prefix| {
+                let mut prefix = prefix;
+                while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                    prefix = &prefix[1..]
+                }
 
-        let client = Client::from_conf(config_builder.build());
+                let mut prefix = prefix.to_string();
+                while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                    prefix.pop();
+                }
+                prefix
+            });
 
-        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
-            let mut prefix = prefix;
-            while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                prefix = &prefix[1..]
-            }
-
-            let mut prefix = prefix.to_string();
-            while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                prefix.pop();
-            }
-            prefix
-        });
         Ok(Self {
             client,
-            bucket_name: aws_config.bucket_name.clone(),
-            max_keys_per_list_response: aws_config.max_keys_per_list_response,
+            bucket_name: remote_storage_config.bucket_name.clone(),
+            max_keys_per_list_response: remote_storage_config.max_keys_per_list_response,
             prefix_in_bucket,
-            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
-            upload_storage_class: aws_config.upload_storage_class.clone(),
+            concurrency_limiter: ConcurrencyLimiter::new(
+                remote_storage_config.concurrency_limit.get(),
+            ),
+            upload_storage_class: remote_storage_config.upload_storage_class.clone(),
             timeout,
         })
     }
@@ -949,6 +975,10 @@ impl RemoteStorage for S3Bucket {
         }
         Ok(())
     }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.concurrency_limiter.activity()
+    }
 }
 
 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index c467a2d196..66522e04ca 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;
 
 use crate::{
     Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
-    StorageMetadata, TimeTravelError,
+    RemoteStorageActivity, StorageMetadata, TimeTravelError,
 };
 
 pub struct UnreliableWrapper {
@@ -213,4 +213,8 @@ impl RemoteStorage for UnreliableWrapper {
             .time_travel_recover(prefix, timestamp, done_if_after, cancel)
             .await
     }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.inner.activity()
+    }
 }
diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index ce5a1e411e..2fbc333075 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -50,6 +50,9 @@ pub struct SkTimelineInfo {
     pub safekeeper_connstr: Option<String>,
     #[serde(default)]
     pub http_connstr: Option<String>,
+    // Minimum of all active RO replicas flush LSN
+    #[serde(default = "lsn_invalid")]
+    pub standby_horizon: Lsn,
 }
 
 #[derive(Debug, Clone, Deserialize, Serialize)]
diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs
index 0bf5664f47..27378c69fc 100644
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -3,7 +3,7 @@
 //!  # Example
 //!
 //!  ```
-//!  # tokio_test::block_on(async {
+//!  # tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async {
 //!  use utils::poison::Poison;
 //!  use std::time::Duration;
 //!
diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
index fb815607a7..f7b72b205f 100644
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -496,9 +496,9 @@ mod tests {
                 // TODO: When updating Postgres versions, this test will cause
                 // problems. Postgres version in message needs updating.
                 //
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                 vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                     147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                     188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs
index 5d05af0c00..1d02aa7709 100644
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,7 +1,7 @@
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::storage_layer::PersistentLayerDesc;
 use pageserver_api::shard::TenantShardId;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
@@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
     let mut updates = layer_map.batch_update();
     for fname in filenames {
         let fname = fname.unwrap();
-        let fname = LayerFileName::from_str(&fname).unwrap();
+        let fname = LayerName::from_str(&fname).unwrap();
         let layer = PersistentLayerDesc::from(fname);
 
         let lsn_range = layer.get_lsn_range();
diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 5b871c5d5e..5aab10e5d9 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -30,47 +30,27 @@
 //! 2024-04-15 on i3en.3xlarge
 //!
 //! ```text
-//! async-short/1           time:   [24.584 µs 24.737 µs 24.922 µs]
-//! async-short/2           time:   [33.479 µs 33.660 µs 33.888 µs]
-//! async-short/4           time:   [42.713 µs 43.046 µs 43.440 µs]
-//! async-short/8           time:   [71.814 µs 72.478 µs 73.240 µs]
-//! async-short/16          time:   [132.73 µs 134.45 µs 136.22 µs]
-//! async-short/32          time:   [258.31 µs 260.73 µs 263.27 µs]
-//! async-short/64          time:   [511.61 µs 514.44 µs 517.51 µs]
-//! async-short/128         time:   [992.64 µs 998.23 µs 1.0042 ms]
-//! async-medium/1          time:   [110.11 µs 110.50 µs 110.96 µs]
-//! async-medium/2          time:   [153.06 µs 153.85 µs 154.99 µs]
-//! async-medium/4          time:   [317.51 µs 319.92 µs 322.85 µs]
-//! async-medium/8          time:   [638.30 µs 644.68 µs 652.12 µs]
-//! async-medium/16         time:   [1.2651 ms 1.2773 ms 1.2914 ms]
-//! async-medium/32         time:   [2.5117 ms 2.5410 ms 2.5720 ms]
-//! async-medium/64         time:   [4.8088 ms 4.8555 ms 4.9047 ms]
-//! async-medium/128        time:   [8.8311 ms 8.9849 ms 9.1263 ms]
-//! sync-short/1            time:   [25.503 µs 25.626 µs 25.771 µs]
-//! sync-short/2            time:   [30.850 µs 31.013 µs 31.208 µs]
-//! sync-short/4            time:   [45.543 µs 45.856 µs 46.193 µs]
-//! sync-short/8            time:   [84.114 µs 84.639 µs 85.220 µs]
-//! sync-short/16           time:   [185.22 µs 186.15 µs 187.13 µs]
-//! sync-short/32           time:   [377.43 µs 378.87 µs 380.46 µs]
-//! sync-short/64           time:   [756.49 µs 759.04 µs 761.70 µs]
-//! sync-short/128          time:   [1.4825 ms 1.4874 ms 1.4923 ms]
-//! sync-medium/1           time:   [105.66 µs 106.01 µs 106.43 µs]
-//! sync-medium/2           time:   [153.10 µs 153.84 µs 154.72 µs]
-//! sync-medium/4           time:   [327.13 µs 329.44 µs 332.27 µs]
-//! sync-medium/8           time:   [654.26 µs 658.73 µs 663.63 µs]
-//! sync-medium/16          time:   [1.2682 ms 1.2748 ms 1.2816 ms]
-//! sync-medium/32          time:   [2.4456 ms 2.4595 ms 2.4731 ms]
-//! sync-medium/64          time:   [4.6523 ms 4.6890 ms 4.7256 ms]
-//! sync-medium/128         time:   [8.7215 ms 8.8323 ms 8.9344 ms]
+//! short/1           time:   [24.584 µs 24.737 µs 24.922 µs]
+//! short/2           time:   [33.479 µs 33.660 µs 33.888 µs]
+//! short/4           time:   [42.713 µs 43.046 µs 43.440 µs]
+//! short/8           time:   [71.814 µs 72.478 µs 73.240 µs]
+//! short/16          time:   [132.73 µs 134.45 µs 136.22 µs]
+//! short/32          time:   [258.31 µs 260.73 µs 263.27 µs]
+//! short/64          time:   [511.61 µs 514.44 µs 517.51 µs]
+//! short/128         time:   [992.64 µs 998.23 µs 1.0042 ms]
+//! medium/1          time:   [110.11 µs 110.50 µs 110.96 µs]
+//! medium/2          time:   [153.06 µs 153.85 µs 154.99 µs]
+//! medium/4          time:   [317.51 µs 319.92 µs 322.85 µs]
+//! medium/8          time:   [638.30 µs 644.68 µs 652.12 µs]
+//! medium/16         time:   [1.2651 ms 1.2773 ms 1.2914 ms]
+//! medium/32         time:   [2.5117 ms 2.5410 ms 2.5720 ms]
+//! medium/64         time:   [4.8088 ms 4.8555 ms 4.9047 ms]
+//! medium/128        time:   [8.8311 ms 8.9849 ms 9.1263 ms]
 //! ```
 
 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
-use pageserver::{
-    config::PageServerConf,
-    walrecord::NeonWalRecord,
-    walredo::{PostgresRedoManager, ProcessKind},
-};
+use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
     sync::Arc,
@@ -80,39 +60,32 @@ use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};
 
 fn bench(c: &mut Criterion) {
-    for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
-        {
-            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-            for nclients in nclients {
-                let mut group = c.benchmark_group(format!("{process_kind}-short"));
-                group.bench_with_input(
-                    BenchmarkId::from_parameter(nclients),
-                    &nclients,
-                    |b, nclients| {
-                        let redo_work = Arc::new(Request::short_input());
-                        b.iter_custom(|iters| {
-                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
-                        });
-                    },
-                );
-            }
+    {
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        for nclients in nclients {
+            let mut group = c.benchmark_group("short");
+            group.bench_with_input(
+                BenchmarkId::from_parameter(nclients),
+                &nclients,
+                |b, nclients| {
+                    let redo_work = Arc::new(Request::short_input());
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
+                },
+            );
         }
-
-        {
-            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-            for nclients in nclients {
-                let mut group = c.benchmark_group(format!("{process_kind}-medium"));
-                group.bench_with_input(
-                    BenchmarkId::from_parameter(nclients),
-                    &nclients,
-                    |b, nclients| {
-                        let redo_work = Arc::new(Request::medium_input());
-                        b.iter_custom(|iters| {
-                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
-                        });
-                    },
-                );
-            }
+    }
+    {
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        for nclients in nclients {
+            let mut group = c.benchmark_group("medium");
+            group.bench_with_input(
+                BenchmarkId::from_parameter(nclients),
+                &nclients,
+                |b, nclients| {
+                    let redo_work = Arc::new(Request::medium_input());
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
+                },
+            );
         }
     }
 }
@@ -120,16 +93,10 @@ criterion::criterion_group!(benches, bench);
 criterion::criterion_main!(benches);
 
 // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
-fn bench_impl(
-    process_kind: ProcessKind,
-    redo_work: Arc<Request>,
-    n_redos: u64,
-    nclients: u64,
-) -> Duration {
+fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
     let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
 
-    let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
-    conf.walredo_process_kind = process_kind;
+    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
     let conf = Box::leak(Box::new(conf));
     let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
@@ -158,27 +125,13 @@ fn bench_impl(
         });
     }
 
-    let elapsed = rt.block_on(async move {
+    rt.block_on(async move {
         let mut total_wallclock_time = Duration::ZERO;
         while let Some(res) = tasks.join_next().await {
             total_wallclock_time += res.unwrap();
         }
         total_wallclock_time
-    });
-
-    // consistency check to ensure process kind setting worked
-    if nredos_per_client > 0 {
-        assert_eq!(
-            manager
-                .status()
-                .process
-                .map(|p| p.kind)
-                .expect("the benchmark work causes a walredo process to be spawned"),
-            std::borrow::Cow::Borrowed(process_kind.into())
-        );
-    }
-
-    elapsed
+    })
 }
 
 async fn client(
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 6df8b2170d..69b86d9c46 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,8 +1,12 @@
+use std::collections::HashMap;
+
+use bytes::Bytes;
 use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
     http::error::HttpErrorBody,
     id::{TenantId, TimelineId},
+    lsn::Lsn,
 };
 
 pub mod util;
@@ -486,6 +490,18 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    pub async fn top_tenant_shards(
+        &self,
+        request: TopTenantShardsRequest,
+    ) -> Result<TopTenantShardsResponse> {
+        let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint);
+        self.request(Method::POST, uri, request)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn layer_map_info(
         &self,
         tenant_shard_id: TenantShardId,
@@ -549,4 +565,57 @@ impl Client {
             }),
         }
     }
+
+    pub async fn ingest_aux_files(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        aux_files: HashMap<String, String>,
+    ) -> Result<bool> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}/ingest_aux_files",
+            self.mgmt_api_endpoint, tenant_shard_id, timeline_id
+        );
+        let resp = self
+            .request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files })
+            .await?;
+        match resp.status() {
+            StatusCode::OK => Ok(true),
+            status => Err(match resp.json::<HttpErrorBody>().await {
+                Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
+                Err(_) => {
+                    Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
+                }
+            }),
+        }
+    }
+
+    pub async fn list_aux_files(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Result<HashMap<String, Bytes>> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}/list_aux_files",
+            self.mgmt_api_endpoint, tenant_shard_id, timeline_id
+        );
+        let resp = self
+            .request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn })
+            .await?;
+        match resp.status() {
+            StatusCode::OK => {
+                let resp: HashMap<String, Bytes> = resp.json().await.map_err(|e| {
+                    Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}"))
+                })?;
+                Ok(resp)
+            }
+            status => Err(match resp.json::<HttpErrorBody>().await {
+                Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
+                Err(_) => {
+                    Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
+                }
+            }),
+        }
+    }
 }
diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs
index 1fd69407d3..c308694ae1 100644
--- a/pageserver/compaction/src/bin/compaction-simulator.rs
+++ b/pageserver/compaction/src/bin/compaction-simulator.rs
@@ -1,4 +1,5 @@
 use clap::{Parser, Subcommand};
+use pageserver_compaction::helpers::PAGE_SZ;
 use pageserver_compaction::simulator::MockTimeline;
 use rand::Rng;
 use std::io::Write;
@@ -51,7 +52,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
     let mut executor = MockTimeline::new();
 
     // Convert the logical size in MB into a key range.
-    let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
+    let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ);
     //let key_range = u64::MIN..u64::MAX;
     println!(
         "starting simulation with key range {:016X}-{:016X}",
diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs
index 137b93055a..20f88868f9 100644
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -24,7 +24,9 @@ use tracing::{debug, info};
 use std::collections::{HashSet, VecDeque};
 use std::ops::Range;
 
-use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
+use crate::helpers::{
+    accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
+};
 use crate::interface::*;
 use utils::lsn::Lsn;
 
@@ -104,7 +106,13 @@ pub async fn compact_tiered<E: CompactionJobExecutor>(
             ctx,
         )
         .await?;
-        if target_file_size == u64::MAX {
+        if current_level_target_height == u64::MAX {
+            // our target height includes all possible lsns
+            info!(
+                level = current_level_no,
+                depth = depth,
+                "compaction loop reached max current_level_target_height"
+            );
             break;
         }
         current_level_no += 1;
@@ -371,7 +379,7 @@ where
                 .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
                 .await?,
             &self.shard_identity,
-        ) * 8192;
+        ) * PAGE_SZ;
 
         let wal_size = job
             .input_layers
@@ -433,7 +441,7 @@ where
         let mut window = KeyspaceWindow::new(
             E::Key::MIN..E::Key::MAX,
             keyspace,
-            self.target_file_size / 8192,
+            self.target_file_size / PAGE_SZ,
         );
         while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
             new_jobs.push(CompactionJob::<E> {
@@ -522,8 +530,6 @@ where
         // If we have accumulated only a narrow band of keyspace, create an
         // image layer. Otherwise write a delta layer.
 
-        // FIXME: deal with the case of lots of values for same key
-
         // FIXME: we are ignoring images here. Did we already divide the work
         // so that we won't encounter them here?
 
@@ -535,43 +541,101 @@ where
             }
         }
         // Open stream
-        let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
+        let key_value_stream =
+            std::pin::pin!(merge_delta_keys_buffered::<E>(deltas.as_slice(), ctx)
+                .await?
+                .map(Result::<_, anyhow::Error>::Ok));
         let mut new_jobs = Vec::new();
 
         // Slide a window through the keyspace
-        let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
+        let mut key_accum =
+            std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size));
         let mut all_in_window: bool = false;
         let mut window = Window::new();
+
+        // Helper function to create a job for a new delta layer with given key-lsn
+        // rectangle.
+        let create_delta_job = |key_range, lsn_range: &Range<Lsn>, new_jobs: &mut Vec<_>| {
+            // The inputs for the job are all the input layers of the original job that
+            // overlap with the rectangle.
+            let batch_layers: Vec<LayerId> = job
+                .input_layers
+                .iter()
+                .filter(|layer_id| {
+                    overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
+                })
+                .cloned()
+                .collect();
+            assert!(!batch_layers.is_empty());
+            new_jobs.push(CompactionJob {
+                key_range,
+                lsn_range: lsn_range.clone(),
+                strategy: CompactionStrategy::CreateDelta,
+                input_layers: batch_layers,
+                completed: false,
+            });
+        };
+
         loop {
-            if all_in_window && window.elems.is_empty() {
+            if all_in_window && window.is_empty() {
                 // All done!
                 break;
             }
+
+            // If we now have enough keyspace for next delta layer in the window, create a
+            // new delta layer
             if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
             {
-                let batch_layers: Vec<LayerId> = job
-                    .input_layers
-                    .iter()
-                    .filter(|layer_id| {
-                        overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
-                    })
-                    .cloned()
-                    .collect();
-                assert!(!batch_layers.is_empty());
-                new_jobs.push(CompactionJob {
-                    key_range,
-                    lsn_range: job.lsn_range.clone(),
-                    strategy: CompactionStrategy::CreateDelta,
-                    input_layers: batch_layers,
-                    completed: false,
-                });
-            } else {
-                assert!(!all_in_window);
-                if let Some(next_key) = key_accum.next().await.transpose()? {
-                    window.feed(next_key.key, next_key.size);
-                } else {
+                create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
+                continue;
+            }
+            assert!(!all_in_window);
+
+            // Process next key in the key space
+            match key_accum.next().await.transpose()? {
+                None => {
                     all_in_window = true;
                 }
+                Some(next_key) if next_key.partition_lsns.is_empty() => {
+                    // Normal case: extend the window by the key
+                    window.feed(next_key.key, next_key.size);
+                }
+                Some(next_key) => {
+                    // A key with too large size impact for a single delta layer. This
+                    // case occurs if you make a huge number of updates for a single key.
+                    //
+                    // Drain the window with has_more = false to make a clean cut before
+                    // the key, and then make dedicated delta layers for the single key.
+                    //
+                    // We cannot cluster the key with the others, because we don't want
+                    // layer files to overlap with each other in the lsn,key space (no
+                    // overlaps for the rectangles).
+                    let key = next_key.key;
+                    debug!("key {key} with size impact larger than the layer size");
+                    while !window.is_empty() {
+                        let has_more = false;
+                        let key_range = window.choose_next_delta(self.target_file_size, has_more)
+                            .expect("with has_more==false, choose_next_delta always returns something for a non-empty Window");
+                        create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
+                    }
+
+                    // Not really required: but here for future resilience:
+                    // We make a "gap" here, so any structure the window holds should
+                    // probably be reset.
+                    window = Window::new();
+
+                    let mut prior_lsn = job.lsn_range.start;
+                    let mut lsn_ranges = Vec::new();
+                    for (lsn, _size) in next_key.partition_lsns.iter() {
+                        lsn_ranges.push(prior_lsn..*lsn);
+                        prior_lsn = *lsn;
+                    }
+                    lsn_ranges.push(prior_lsn..job.lsn_range.end);
+                    for lsn_range in lsn_ranges {
+                        let key_range = key..key.next();
+                        create_delta_job(key_range, &lsn_range, &mut new_jobs);
+                    }
+                }
             }
         }
 
@@ -599,8 +663,8 @@ where
     }
 }
 
-// Sliding window through keyspace and values
-// This is used by over_with_images to decide on good split points
+/// Sliding window through keyspace and values for image layer
+/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points
 struct KeyspaceWindow<K> {
     head: KeyspaceWindowHead<K>,
 
@@ -740,9 +804,9 @@ struct WindowElement<K> {
     accum_size: u64,
 }
 
-// Sliding window through keyspace and values
-//
-// This is used to decide what layer to write next, from the beginning of the window.
+/// Sliding window through keyspace and values for delta layer tiling
+///
+/// This is used to decide which delta layer to write next.
 struct Window<K> {
     elems: VecDeque<WindowElement<K>>,
 
@@ -766,11 +830,13 @@ where
     fn feed(&mut self, key: K, size: u64) {
         let last_size;
         if let Some(last) = self.elems.back_mut() {
-            assert!(last.last_key <= key);
-            if key == last.last_key {
-                last.accum_size += size;
-                return;
-            }
+            // We require the keys to be strictly increasing for the window.
+            // Keys should already have been deduplicated by `accum_key_values`
+            assert!(
+                last.last_key < key,
+                "last_key(={}) >= key(={key})",
+                last.last_key
+            );
             last_size = last.accum_size;
         } else {
             last_size = 0;
@@ -792,6 +858,10 @@ where
         self.elems.front().unwrap().accum_size - self.splitoff_size
     }
 
+    fn is_empty(&self) -> bool {
+        self.elems.is_empty()
+    }
+
     fn commit_upto(&mut self, mut upto: usize) {
         while upto > 1 {
             let popped = self.elems.pop_front().unwrap();
@@ -854,7 +924,7 @@ where
         // If we're willing to stretch it up to 1.25 target size, could we
         // gobble up the rest of the work? This avoids creating very small
         // "tail" layers at the end of the keyspace
-        if !has_more && self.remain_size() < target_size * 5 / 3 {
+        if !has_more && self.remain_size() < target_size * 5 / 4 {
             self.commit_upto(self.elems.len());
         } else {
             let delta_split_at = self.find_size_split(target_size);
diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs
index 1b80373ba7..8ed1d16082 100644
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -9,10 +9,14 @@ use pageserver_api::shard::ShardIdentity;
 use pin_project_lite::pin_project;
 use std::collections::BinaryHeap;
 use std::collections::VecDeque;
+use std::fmt::Display;
 use std::future::Future;
 use std::ops::{DerefMut, Range};
 use std::pin::Pin;
 use std::task::{ready, Poll};
+use utils::lsn::Lsn;
+
+pub const PAGE_SZ: u64 = 8192;
 
 pub fn keyspace_total_size<K>(
     keyspace: &CompactionKeySpace<K>,
@@ -108,17 +112,40 @@ pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
     }
 }
 
+pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>(
+    layers: &'a [E::DeltaLayer],
+    ctx: &'a E::RequestContext,
+) -> anyhow::Result<impl Stream<Item = <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>
+{
+    let mut keys = Vec::new();
+    for l in layers {
+        // Boxing and casting to LoadFuture is required to obtain the right Sync bound.
+        // If we do l.load_keys(ctx).await? directly, there is a compilation error.
+        let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx));
+        keys.extend(load_future.await?.into_iter());
+    }
+    keys.sort_by_key(|k| (k.key(), k.lsn()));
+    let stream = futures::stream::iter(keys.into_iter());
+    Ok(stream)
+}
+
 enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
     Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
     Unloaded(&'a E::DeltaLayer),
 }
 impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
-    fn key(&self) -> E::Key {
+    fn min_key(&self) -> E::Key {
         match self {
             Self::Loaded(entries) => entries.front().unwrap().key(),
             Self::Unloaded(dl) => dl.key_range().start,
         }
     }
+    fn min_lsn(&self) -> Lsn {
+        match self {
+            Self::Loaded(entries) => entries.front().unwrap().lsn(),
+            Self::Unloaded(dl) => dl.lsn_range().start,
+        }
+    }
 }
 impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
@@ -128,12 +155,12 @@ impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
 impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
         // reverse order so that we get a min-heap
-        other.key().cmp(&self.key())
+        (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn()))
     }
 }
 impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
     fn eq(&self, other: &Self) -> bool {
-        self.key().eq(&other.key())
+        self.cmp(other) == std::cmp::Ordering::Equal
     }
 }
 impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}
@@ -210,11 +237,16 @@ pub struct KeySize<K> {
     pub key: K,
     pub num_values: u64,
     pub size: u64,
+    /// The lsns to partition at (if empty then no per-lsn partitioning)
+    pub partition_lsns: Vec<(Lsn, u64)>,
 }
 
-pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
+pub fn accum_key_values<'a, I, K, D, E>(
+    input: I,
+    target_size: u64,
+) -> impl Stream<Item = Result<KeySize<K>, E>>
 where
-    K: Eq,
+    K: Eq + PartialOrd + Display + Copy,
     I: Stream<Item = Result<D, E>>,
     D: CompactionDeltaEntry<'a, K>,
 {
@@ -224,25 +256,39 @@ where
 
         if let Some(first) = input.next().await {
             let first = first?;
+            let mut part_size = first.size();
             let mut accum: KeySize<K> = KeySize {
                 key: first.key(),
                 num_values: 1,
-                size: first.size(),
+                size: part_size,
+                partition_lsns: Vec::new(),
             };
+            let mut last_key = accum.key;
             while let Some(this) = input.next().await {
                 let this = this?;
                 if this.key() == accum.key {
-                    accum.size += this.size();
+                    let add_size = this.size();
+                    if part_size + add_size > target_size {
+                        accum.partition_lsns.push((this.lsn(), part_size));
+                        part_size = 0;
+                    }
+                    part_size += add_size;
+                    accum.size += add_size;
                     accum.num_values += 1;
                 } else {
+                    assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
+                    last_key = accum.key;
                     yield accum;
+                    part_size = this.size();
                     accum = KeySize {
                         key: this.key(),
                         num_values: 1,
-                        size: this.size(),
+                        size: part_size,
+                        partition_lsns: Vec::new(),
                     };
                 }
             }
+            assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
             yield accum;
         }
     }
diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs
index 98dd46925c..1853afffdd 100644
--- a/pageserver/compaction/src/identify_levels.rs
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -184,6 +184,12 @@ impl<L> Level<L> {
         }
         let mut events: Vec<Event<K>> = Vec::new();
         for (idx, l) in self.layers.iter().enumerate() {
+            let key_range = l.key_range();
+            if key_range.end == key_range.start.next() && l.is_delta() {
+                // Ignore single-key delta layers as they can be stacked on top of each other
+                // as that is the only way to cut further.
+                continue;
+            }
             events.push(Event {
                 key: l.key_range().start,
                 layer_idx: idx,
diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs
index 3543df64fa..a7c8bd5c1f 100644
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -14,6 +14,7 @@ use std::ops::Range;
 use std::sync::Arc;
 use std::sync::Mutex;
 
+use crate::helpers::PAGE_SZ;
 use crate::helpers::{merge_delta_keys, overlaps_with};
 
 use crate::interface;
@@ -509,7 +510,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
         let new_layer = Arc::new(MockImageLayer {
             key_range: key_range.clone(),
             lsn_range: lsn..lsn,
-            file_size: accum_size * 8192,
+            file_size: accum_size * PAGE_SZ,
             deleted: Mutex::new(false),
         });
         info!(
diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs
index 1cea2a20e1..bd8b54a286 100644
--- a/pageserver/compaction/tests/tests.rs
+++ b/pageserver/compaction/tests/tests.rs
@@ -1,23 +1,35 @@
+use once_cell::sync::OnceCell;
 use pageserver_compaction::interface::CompactionLayer;
 use pageserver_compaction::simulator::MockTimeline;
+use utils::logging;
+
+static LOG_HANDLE: OnceCell<()> = OnceCell::new();
+
+pub(crate) fn setup_logging() {
+    LOG_HANDLE.get_or_init(|| {
+        logging::init(
+            logging::LogFormat::Test,
+            logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+            logging::Output::Stdout,
+        )
+        .expect("Failed to init test logging")
+    });
+}
 
 /// Test the extreme case that there are so many updates for a single key that
 /// even if we produce an extremely narrow delta layer, spanning just that one
 /// key, we still too many records to fit in the target file size. We need to
 /// split in the LSN dimension too in that case.
-///
-/// TODO: The code to avoid this problem has not been implemented yet! So the
-/// assertion currently fails, but we need to make it not fail.
-#[ignore]
 #[tokio::test]
 async fn test_many_updates_for_single_key() {
+    setup_logging();
     let mut executor = MockTimeline::new();
-    executor.target_file_size = 10_000_000; // 10 MB
+    executor.target_file_size = 1_000_000; // 1 MB
 
-    // Ingest 100 MB of updates to a single key.
+    // Ingest 10 MB of updates to a single key.
     for _ in 1..1000 {
         executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
-        executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
+        executor.ingest_uniform(1000, 10, &(0..1)).unwrap();
         executor.compact().await.unwrap();
     }
 
@@ -27,9 +39,32 @@ async fn test_many_updates_for_single_key() {
     }
     for l in executor.live_layers.iter() {
         assert!(l.file_size() < executor.target_file_size * 2);
-        // sanity check that none of the delta layers are stupidly small either
+        // Sanity check that none of the delta layers are empty either.
         if l.is_delta() {
-            assert!(l.file_size() > executor.target_file_size / 2);
+            assert!(l.file_size() > 0);
         }
     }
 }
+
+#[tokio::test]
+async fn test_simple_updates() {
+    setup_logging();
+    let mut executor = MockTimeline::new();
+    executor.target_file_size = 500_000; // 500 KB
+
+    // Ingest some traffic.
+    for _ in 1..400 {
+        executor.ingest_uniform(100, 500, &(0..100_000)).unwrap();
+    }
+
+    for l in executor.live_layers.iter() {
+        println!("layer {}: {}", l.short_id(), l.file_size());
+    }
+
+    println!("Running compaction...");
+    executor.compact().await.unwrap();
+
+    for l in executor.live_layers.iter() {
+        println!("layer {}: {}", l.short_id(), l.file_size());
+    }
+}
diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs
index 9a556cb3d4..389519c65a 100644
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -28,6 +28,8 @@
 //! # From an `index_part.json` in S3
 //! (jq -r '.layer_metadata | keys[]' | cargo  run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
 //!
+//! # enrich with lines for gc_cutoff and a child branch point
+//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg
 //! ```
 //!
 //! ## Viewing
@@ -48,9 +50,8 @@
 //! ```
 //!
 
-use anyhow::Result;
+use anyhow::{Context, Result};
 use pageserver::repository::Key;
-use pageserver::METADATA_FILE_NAME;
 use std::cmp::Ordering;
 use std::io::{self, BufRead};
 use std::path::PathBuf;
@@ -81,6 +82,11 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
     let split: Vec<&str> = name.split("__").collect();
     let keys: Vec<&str> = split[0].split('-').collect();
     let mut lsns: Vec<&str> = split[1].split('-').collect();
+
+    if lsns.last().expect("should").len() == 8 {
+        lsns.pop();
+    }
+
     if lsns.len() == 1 {
         lsns.push(lsns[0]);
     }
@@ -90,6 +96,33 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
     (keys, lsns)
 }
 
+#[derive(Clone, Copy)]
+enum LineKind {
+    GcCutoff,
+    Branch,
+}
+
+impl From<LineKind> for Fill {
+    fn from(value: LineKind) -> Self {
+        match value {
+            LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)),
+            LineKind::Branch => Fill::Color(rgb(0, 255, 0)),
+        }
+    }
+}
+
+impl FromStr for LineKind {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> std::prelude::v1::Result<Self, Self::Err> {
+        Ok(match s {
+            "gc_cutoff" => LineKind::GcCutoff,
+            "branch" => LineKind::Branch,
+            _ => anyhow::bail!("unsupported linekind: {s}"),
+        })
+    }
+}
+
 pub fn main() -> Result<()> {
     // Parse layer filenames from stdin
     struct Layer {
@@ -99,15 +132,32 @@ pub fn main() -> Result<()> {
     }
     let mut files: Vec<Layer> = vec![];
     let stdin = io::stdin();
-    for line in stdin.lock().lines() {
+
+    let mut lines: Vec<(Lsn, LineKind)> = vec![];
+
+    for (lineno, line) in stdin.lock().lines().enumerate() {
+        let lineno = lineno + 1;
+
         let line = line.unwrap();
+        if let Some((kind, lsn)) = line.split_once(':') {
+            let (kind, lsn) = LineKind::from_str(kind)
+                .context("parse kind")
+                .and_then(|kind| {
+                    if lsn.contains('/') {
+                        Lsn::from_str(lsn)
+                    } else {
+                        Lsn::from_hex(lsn)
+                    }
+                    .map(|lsn| (kind, lsn))
+                    .context("parse lsn")
+                })
+                .with_context(|| format!("parse {line:?} on {lineno}"))?;
+            lines.push((lsn, kind));
+            continue;
+        }
         let line = PathBuf::from_str(&line).unwrap();
         let filename = line.file_name().unwrap();
         let filename = filename.to_str().unwrap();
-        if filename == METADATA_FILE_NAME {
-            // Don't try and parse "metadata" like a key-lsn range
-            continue;
-        }
         let (key_range, lsn_range) = parse_filename(filename);
         files.push(Layer {
             filename: filename.to_owned(),
@@ -117,8 +167,9 @@ pub fn main() -> Result<()> {
     }
 
     // Collect all coordinates
-    let mut keys: Vec<Key> = vec![];
-    let mut lsns: Vec<Lsn> = vec![];
+    let mut keys: Vec<Key> = Vec::with_capacity(files.len());
+    let mut lsns: Vec<Lsn> = Vec::with_capacity(files.len() + lines.len());
+
     for Layer {
         key_range: keyr,
         lsn_range: lsnr,
@@ -131,6 +182,8 @@ pub fn main() -> Result<()> {
         lsns.push(lsnr.end);
     }
 
+    lsns.extend(lines.iter().map(|(lsn, _)| *lsn));
+
     // Analyze
     let key_map = build_coordinate_compression_map(keys);
     let lsn_map = build_coordinate_compression_map(lsns);
@@ -144,10 +197,13 @@ pub fn main() -> Result<()> {
     println!(
         "{}",
         BeginSvg {
-            w: key_map.len() as f32,
+            w: (key_map.len() + 10) as f32,
             h: stretch * lsn_map.len() as f32
         }
     );
+
+    let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
+
     for Layer {
         filename,
         key_range: keyr,
@@ -169,7 +225,6 @@ pub fn main() -> Result<()> {
         let mut lsn_diff = (lsn_end - lsn_start) as f32;
         let mut fill = Fill::None;
         let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas
-        let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas
         let mut lsn_offset = 0.0;
 
         // Fill in and thicken rectangle if it's an
@@ -189,7 +244,7 @@ pub fn main() -> Result<()> {
         println!(
             "    {}",
             rectangle(
-                key_start as f32 + stretch * xmargin,
+                5.0 + key_start as f32 + stretch * xmargin,
                 stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
                 key_diff as f32 - stretch * 2.0 * xmargin,
                 stretch * (lsn_diff - 2.0 * ymargin)
@@ -200,6 +255,26 @@ pub fn main() -> Result<()> {
             .comment(filename)
         );
     }
+
+    for (lsn, kind) in lines {
+        let lsn_start = *lsn_map.get(&lsn).unwrap();
+        let lsn_end = lsn_start;
+        let stretch = 2.0;
+        let lsn_diff = 0.3;
+        let lsn_offset = -lsn_diff / 2.0;
+        let ymargin = 0.05;
+        println!(
+            "{}",
+            rectangle(
+                0.0f32 + stretch * xmargin,
+                stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)),
+                (key_map.len() + 10) as f32,
+                stretch * (lsn_diff - 2.0 * ymargin)
+            )
+            .fill(kind)
+        );
+    }
+
     println!("{}", EndSvg);
 
     eprintln!("num_images: {}", num_images);
diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs
index 20e5572914..0d010eb009 100644
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -3,7 +3,7 @@ use std::collections::HashMap;
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
-use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
 use utils::lsn::Lsn;
 
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
             let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
             #[derive(serde::Serialize)]
             struct Output<'a> {
-                layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
+                layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
                 disk_consistent_lsn: Lsn,
                 timeline_metadata: &'a TimelineMetadata,
             }
diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs
index c4c282f33d..b4bb239f44 100644
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -100,7 +100,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 
 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
 async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
-    let file = VirtualFile::open(path).await?;
+    let file = VirtualFile::open(path, ctx).await?;
     let file_id = page_cache::next_file_id();
     let block_reader = FileBlockReader::new(&file, file_id);
     let summary_blk = block_reader.read_blk(0, ctx).await?;
diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs
index be8f91675d..3611b0baab 100644
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -61,7 +61,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
     let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
     virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
     page_cache::init(100);
-    let file = VirtualFile::open(path).await?;
+    let file = VirtualFile::open(path, ctx).await?;
     let file_id = page_cache::next_file_id();
     let block_reader = FileBlockReader::new(&file, file_id);
     let summary_blk = block_reader.read_blk(0, ctx).await?;
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index 1fb75584fc..e92c352dab 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -219,6 +219,7 @@ fn handle_metadata(
     let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
     println!("Current metadata:\n{meta:?}");
     let mut update_meta = false;
+    // TODO: simplify this part
     if let Some(disk_consistent_lsn) = disk_consistent_lsn {
         meta = TimelineMetadata::new(
             *disk_consistent_lsn,
diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs
new file mode 100644
index 0000000000..eb5b242a5f
--- /dev/null
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -0,0 +1,98 @@
+use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
+use pageserver_api::shard::TenantShardId;
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+/// Ingest aux files into the pageserver.
+#[derive(clap::Parser)]
+pub(crate) struct Args {
+    #[clap(long, default_value = "http://localhost:9898")]
+    mgmt_api_endpoint: String,
+    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
+    page_service_connstring: String,
+    #[clap(long)]
+    pageserver_jwt: Option<String>,
+
+    targets: Option<Vec<TenantTimelineId>>,
+}
+
+pub(crate) fn main(args: Args) -> anyhow::Result<()> {
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+
+    let main_task = rt.spawn(main_impl(args));
+    rt.block_on(main_task).unwrap()
+}
+
+async fn main_impl(args: Args) -> anyhow::Result<()> {
+    let args: &'static Args = Box::leak(Box::new(args));
+
+    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
+        args.mgmt_api_endpoint.clone(),
+        args.pageserver_jwt.as_deref(),
+    ));
+
+    // discover targets
+    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+        &mgmt_api_client,
+        crate::util::cli::targets::Spec {
+            limit_to_first_n_targets: None,
+            targets: {
+                if let Some(targets) = &args.targets {
+                    if targets.len() != 1 {
+                        anyhow::bail!("must specify exactly one target");
+                    }
+                    Some(targets.clone())
+                } else {
+                    None
+                }
+            },
+        },
+    )
+    .await?;
+
+    let timeline = timelines[0];
+    let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
+    let timeline_id = timeline.timeline_id;
+
+    println!("operating on timeline {}", timeline);
+
+    mgmt_api_client
+        .tenant_config(&TenantConfigRequest {
+            tenant_id: timeline.tenant_id,
+            config: TenantConfig {
+                switch_aux_file_policy: Some(AuxFilePolicy::V2),
+                ..Default::default()
+            },
+        })
+        .await?;
+
+    for batch in 0..100 {
+        let items = (0..100)
+            .map(|id| {
+                (
+                    format!("pg_logical/mappings/{:03}.{:03}", batch, id),
+                    format!("{:08}", id),
+                )
+            })
+            .collect::<HashMap<_, _>>();
+        let file_cnt = items.len();
+        mgmt_api_client
+            .ingest_aux_files(tenant_shard_id, timeline_id, items)
+            .await?;
+        println!("ingested {file_cnt} files");
+    }
+
+    let files = mgmt_api_client
+        .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
+        .await?;
+
+    println!("{} files found", files.len());
+
+    anyhow::Ok(())
+}
diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
index 197e782dca..1bb71b9353 100644
--- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
+++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
@@ -2,9 +2,11 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
 
 use pageserver_client::mgmt_api;
 use rand::seq::SliceRandom;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, info};
 use utils::id::{TenantTimelineId, TimelineId};
 
+use std::{f64, sync::Arc};
 use tokio::{
     sync::{mpsc, OwnedSemaphorePermit},
     task::JoinSet,
@@ -12,10 +14,7 @@ use tokio::{
 
 use std::{
     num::NonZeroUsize,
-    sync::{
-        atomic::{AtomicU64, Ordering},
-        Arc,
-    },
+    sync::atomic::{AtomicU64, Ordering},
     time::{Duration, Instant},
 };
 
@@ -51,19 +50,31 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
     Ok(())
 }
 
+#[derive(serde::Serialize)]
+struct Output {
+    downloads_count: u64,
+    downloads_bytes: u64,
+    evictions_count: u64,
+    timeline_restarts: u64,
+    #[serde(with = "humantime_serde")]
+    runtime: Duration,
+}
+
 #[derive(Debug, Default)]
 struct LiveStats {
-    evictions: AtomicU64,
-    downloads: AtomicU64,
+    evictions_count: AtomicU64,
+    downloads_count: AtomicU64,
+    downloads_bytes: AtomicU64,
     timeline_restarts: AtomicU64,
 }
 
 impl LiveStats {
     fn eviction_done(&self) {
-        self.evictions.fetch_add(1, Ordering::Relaxed);
+        self.evictions_count.fetch_add(1, Ordering::Relaxed);
     }
-    fn download_done(&self) {
-        self.downloads.fetch_add(1, Ordering::Relaxed);
+    fn download_done(&self, size: u64) {
+        self.downloads_count.fetch_add(1, Ordering::Relaxed);
+        self.downloads_bytes.fetch_add(size, Ordering::Relaxed);
     }
     fn timeline_restart_done(&self) {
         self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
@@ -92,28 +103,49 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
     )
     .await?;
 
+    let token = CancellationToken::new();
     let mut tasks = JoinSet::new();
 
-    let live_stats = Arc::new(LiveStats::default());
+    let periodic_stats = Arc::new(LiveStats::default());
+    let total_stats = Arc::new(LiveStats::default());
+
+    let start = Instant::now();
     tasks.spawn({
-        let live_stats = Arc::clone(&live_stats);
+        let periodic_stats = Arc::clone(&periodic_stats);
+        let total_stats = Arc::clone(&total_stats);
+        let cloned_token = token.clone();
         async move {
             let mut last_at = Instant::now();
             loop {
+                if cloned_token.is_cancelled() {
+                    return;
+                }
                 tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
                 let now = Instant::now();
                 let delta: Duration = now - last_at;
                 last_at = now;
 
                 let LiveStats {
-                    evictions,
-                    downloads,
+                    evictions_count,
+                    downloads_count,
+                    downloads_bytes,
                     timeline_restarts,
-                } = &*live_stats;
-                let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
-                let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
+                } = &*periodic_stats;
+                let evictions_count = evictions_count.swap(0, Ordering::Relaxed);
+                let downloads_count = downloads_count.swap(0, Ordering::Relaxed);
+                let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed);
                 let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
-                info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
+
+                total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed);
+                total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed);
+                total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed);
+                total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed);
+
+                let evictions_per_s = evictions_count as f64 / delta.as_secs_f64();
+                let downloads_per_s = downloads_count as f64 / delta.as_secs_f64();
+                let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64);
+
+                info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}");
             }
         }
     });
@@ -124,14 +156,42 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
                 args,
                 Arc::clone(&mgmt_api_client),
                 tl,
-                Arc::clone(&live_stats),
+                Arc::clone(&periodic_stats),
+                token.clone(),
             ));
         }
     }
+    if let Some(runtime) = args.runtime {
+        tokio::spawn(async move {
+            tokio::time::sleep(runtime.into()).await;
+            token.cancel();
+        });
+    }
 
     while let Some(res) = tasks.join_next().await {
         res.unwrap();
     }
+    let end = Instant::now();
+    let duration: Duration = end - start;
+
+    let output = {
+        let LiveStats {
+            evictions_count,
+            downloads_count,
+            downloads_bytes,
+            timeline_restarts,
+        } = &*total_stats;
+        Output {
+            downloads_count: downloads_count.load(Ordering::Relaxed),
+            downloads_bytes: downloads_bytes.load(Ordering::Relaxed),
+            evictions_count: evictions_count.load(Ordering::Relaxed),
+            timeline_restarts: timeline_restarts.load(Ordering::Relaxed),
+            runtime: duration,
+        }
+    };
+    let output = serde_json::to_string_pretty(&output).unwrap();
+    println!("{output}");
+
     Ok(())
 }
 
@@ -140,6 +200,7 @@ async fn timeline_actor(
     mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
     timeline: TenantTimelineId,
     live_stats: Arc<LiveStats>,
+    token: CancellationToken,
 ) {
     // TODO: support sharding
     let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
@@ -149,7 +210,7 @@ async fn timeline_actor(
         layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
         concurrency: Arc<tokio::sync::Semaphore>,
     }
-    loop {
+    while !token.is_cancelled() {
         debug!("restarting timeline");
         let layer_map_info = mgmt_api_client
             .layer_map_info(tenant_shard_id, timeline.timeline_id)
@@ -185,7 +246,7 @@ async fn timeline_actor(
 
         live_stats.timeline_restart_done();
 
-        loop {
+        while !token.is_cancelled() {
             assert!(!timeline.joinset.is_empty());
             if let Some(res) = timeline.joinset.try_join_next() {
                 debug!(?res, "a layer actor exited, should not happen");
@@ -255,7 +316,7 @@ async fn layer_actor(
                     .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
                     .await
                     .unwrap();
-                live_stats.download_done();
+                live_stats.download_done(layer.layer_file_size());
                 did_it
             }
         };
diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs
index 743102d853..5527557450 100644
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -14,6 +14,7 @@ mod util {
 
 /// The pagebench CLI sub-commands, dispatched in [`main`] below.
 mod cmd {
+    pub(super) mod aux_files;
     pub(super) mod basebackup;
     pub(super) mod getpage_latest_lsn;
     pub(super) mod ondemand_download_churn;
@@ -27,6 +28,7 @@ enum Args {
     GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
     TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
     OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
+    AuxFiles(cmd::aux_files::Args),
 }
 
 fn main() {
@@ -46,6 +48,7 @@ fn main() {
             cmd::trigger_initial_size_calculation::main(args)
         }
         Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
+        Args::AuxFiles(args) => cmd::aux_files::main(args),
     }
     .unwrap()
 }
diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs
index a26ed84a0d..38e1875db1 100644
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -1,15 +1,39 @@
+use std::sync::Arc;
+
+use ::metrics::IntGauge;
 use bytes::{Buf, BufMut, Bytes};
 use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
 use tracing::warn;
 
-/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
+// BEGIN Copyright (c) 2017 Servo Contributors
+
+/// Const version of FNV hash.
+#[inline]
+#[must_use]
+pub const fn fnv_hash(bytes: &[u8]) -> u128 {
+    const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d;
+    const PRIME: u128 = 0x0000000001000000000000000000013B;
+
+    let mut hash = INITIAL_STATE;
+    let mut i = 0;
+    while i < bytes.len() {
+        hash ^= bytes[i] as u128;
+        hash = hash.wrapping_mul(PRIME);
+        i += 1;
+    }
+    hash
+}
+
+// END Copyright (c) 2017 Servo Contributors
+
+/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash].
 fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
-    let mut key = [0; METADATA_KEY_SIZE];
-    let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
+    let mut key: [u8; 16] = [0; METADATA_KEY_SIZE];
+    let hash = fnv_hash(data).to_be_bytes();
     key[0] = AUX_KEY_PREFIX;
     key[1] = dir_level1;
     key[2] = dir_level2;
-    key[3..16].copy_from_slice(&hash[0..13]);
+    key[3..16].copy_from_slice(&hash[3..16]);
     Key::from_metadata_key_fixed_size(&key)
 }
 
@@ -140,6 +164,55 @@ pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
     Ok(encoded)
 }
 
+/// An estimation of the size of aux files.
+pub struct AuxFileSizeEstimator {
+    aux_file_size_gauge: IntGauge,
+    size: Arc<std::sync::Mutex<Option<isize>>>,
+}
+
+impl AuxFileSizeEstimator {
+    pub fn new(aux_file_size_gauge: IntGauge) -> Self {
+        Self {
+            aux_file_size_gauge,
+            size: Arc::new(std::sync::Mutex::new(None)),
+        }
+    }
+
+    pub fn on_base_backup(&self, new_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        *guard = Some(new_size as isize);
+        self.report(new_size as isize);
+    }
+
+    pub fn on_add(&self, file_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        if let Some(size) = &mut *guard {
+            *size += file_size as isize;
+            self.report(*size);
+        }
+    }
+
+    pub fn on_remove(&self, file_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        if let Some(size) = &mut *guard {
+            *size -= file_size as isize;
+            self.report(*size);
+        }
+    }
+
+    pub fn on_update(&self, old_size: usize, new_size: usize) {
+        let mut guard = self.size.lock().unwrap();
+        if let Some(size) = &mut *guard {
+            *size += new_size as isize - old_size as isize;
+            self.report(*size);
+        }
+    }
+
+    pub fn report(&self, size: isize) {
+        self.aux_file_size_gauge.set(size as i64);
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -148,15 +221,19 @@ mod tests {
     fn test_hash_portable() {
         // AUX file encoding requires the hash to be portable across all platforms. This test case checks
         // if the algorithm produces the same hash across different environments.
+
         assert_eq!(
-            305317690835051308206966631765527126151,
-            twox_hash::xxh3::hash128("test1".as_bytes())
+            265160408618497461376862998434862070044,
+            super::fnv_hash("test1".as_bytes())
         );
         assert_eq!(
-            85104974691013376326742244813280798847,
-            twox_hash::xxh3::hash128("test/test2".as_bytes())
+            295486155126299629456360817749600553988,
+            super::fnv_hash("test/test2".as_bytes())
+        );
+        assert_eq!(
+            144066263297769815596495629667062367629,
+            super::fnv_hash("".as_bytes())
         );
-        assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
     }
 
     #[test]
@@ -164,28 +241,28 @@ mod tests {
         // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions
         // of the page server.
         assert_eq!(
-            "6200000101E5B20C5F8DD5AA3289D6D9EAFA",
-            encode_aux_file_key("pg_logical/mappings/test1").to_string()
+            "62000001017F8B83D94F7081693471ABF91C",
+            encode_aux_file_key("pg_logical/mappings/test1").to_string(),
         );
         assert_eq!(
-            "620000010239AAC544893139B26F501B97E6",
-            encode_aux_file_key("pg_logical/snapshots/test2").to_string()
+            "62000001027F8E83D94F7081693471ABFCCD",
+            encode_aux_file_key("pg_logical/snapshots/test2").to_string(),
         );
         assert_eq!(
-            "620000010300000000000000000000000000",
-            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
+            "62000001032E07BB014262B821756295C58D",
+            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(),
         );
         assert_eq!(
-            "62000001FF8635AF2134B7266EC5B4189FD6",
-            encode_aux_file_key("pg_logical/unsupported").to_string()
+            "62000001FF4F38E1C74754E7D03C1A660178",
+            encode_aux_file_key("pg_logical/unsupported").to_string(),
         );
         assert_eq!(
-            "6200000201772D0E5D71DE14DA86142A1619",
+            "62000002017F8D83D94F7081693471ABFB92",
             encode_aux_file_key("pg_replslot/test3").to_string()
         );
         assert_eq!(
-            "620000FFFF1866EBEB53B807B26A2416F317",
-            encode_aux_file_key("other_file_not_supported").to_string()
+            "620000FFFF2B6ECC8AEF93F643DC44F15E03",
+            encode_aux_file_key("other_file_not_supported").to_string(),
         );
     }
 
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 58b18dae7d..dca1510810 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -601,7 +601,7 @@ where
         // add zenith.signal file
         let mut zenith_signal = String::new();
         if self.prev_record_lsn == Lsn(0) {
-            if self.lsn == self.timeline.get_ancestor_lsn() {
+            if self.timeline.is_ancestor_lsn(self.lsn) {
                 write!(zenith_signal, "PREV LSN: none")
                     .map_err(|e| BasebackupError::Server(e.into()))?;
             } else {
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index eb4b8bb8bb..ba5b2608bd 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -284,7 +284,6 @@ fn start_pageserver(
     ))
     .unwrap();
     pageserver::preinitialize_metrics();
-    pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
 
     // If any failpoints were set from FAILPOINTS environment variable,
     // print them to the log for debugging purposes
@@ -516,16 +515,12 @@ fn start_pageserver(
         }
     });
 
-    let secondary_controller = if let Some(remote_storage) = &remote_storage {
-        secondary::spawn_tasks(
-            tenant_manager.clone(),
-            remote_storage.clone(),
-            background_jobs_barrier.clone(),
-            shutdown_pageserver.clone(),
-        )
-    } else {
-        secondary::null_controller()
-    };
+    let secondary_controller = secondary::spawn_tasks(
+        tenant_manager.clone(),
+        remote_storage.clone(),
+        background_jobs_barrier.clone(),
+        shutdown_pageserver.clone(),
+    );
 
     // shared state between the disk-usage backed eviction background task and the http endpoint
     // that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -533,15 +528,13 @@ fn start_pageserver(
     // been configured.
     let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
 
-    if let Some(remote_storage) = &remote_storage {
-        launch_disk_usage_global_eviction_task(
-            conf,
-            remote_storage.clone(),
-            disk_usage_eviction_state.clone(),
-            tenant_manager.clone(),
-            background_jobs_barrier.clone(),
-        )?;
-    }
+    launch_disk_usage_global_eviction_task(
+        conf,
+        remote_storage.clone(),
+        disk_usage_eviction_state.clone(),
+        tenant_manager.clone(),
+        background_jobs_barrier.clone(),
+    )?;
 
     // Start up the service to handle HTTP mgmt API request. We created the
     // listener earlier already.
@@ -654,17 +647,20 @@ fn start_pageserver(
             None,
             "libpq endpoint listener",
             true,
-            async move {
-                page_service::libpq_listener_main(
-                    conf,
-                    broker_client,
-                    pg_auth,
-                    pageserver_listener,
-                    conf.pg_auth_type,
-                    libpq_ctx,
-                    task_mgr::shutdown_token(),
-                )
-                .await
+            {
+                let tenant_manager = tenant_manager.clone();
+                async move {
+                    page_service::libpq_listener_main(
+                        tenant_manager,
+                        broker_client,
+                        pg_auth,
+                        pageserver_listener,
+                        conf.pg_auth_type,
+                        libpq_ctx,
+                        task_mgr::shutdown_token(),
+                    )
+                    .await
+                }
             },
         );
     }
@@ -693,14 +689,7 @@ fn start_pageserver(
             // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
             // The plan is to change that over time.
             shutdown_pageserver.take();
-            let bg_remote_storage = remote_storage.clone();
-            let bg_deletion_queue = deletion_queue.clone();
-            pageserver::shutdown_pageserver(
-                &tenant_manager,
-                bg_remote_storage.map(|_| bg_deletion_queue),
-                0,
-            )
-            .await;
+            pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
             unreachable!()
         })
     }
@@ -708,12 +697,11 @@ fn start_pageserver(
 
 fn create_remote_storage_client(
     conf: &'static PageServerConf,
-) -> anyhow::Result<Option<GenericRemoteStorage>> {
+) -> anyhow::Result<GenericRemoteStorage> {
     let config = if let Some(config) = &conf.remote_storage_config {
         config
     } else {
-        tracing::warn!("no remote storage configured, this is a deprecated configuration");
-        return Ok(None);
+        anyhow::bail!("no remote storage configured, this is a deprecated configuration");
     };
 
     // Create the client
@@ -733,7 +721,7 @@ fn create_remote_storage_client(
             GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
     }
 
-    Ok(Some(remote_storage))
+    Ok(remote_storage)
 }
 
 fn cli() -> Command {
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 258eed0b12..b0afb6414b 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -99,7 +99,7 @@ pub mod defaults {
 
     pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
 
-    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
+    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
 
     ///
     /// Default built-in configuration file.
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index e3c11cb299..8790a9b0a8 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -38,7 +38,7 @@ use deleter::DeleterMessage;
 use list_writer::ListWriterQueueMessage;
 use validator::ValidatorQueueMessage;
 
-use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
+use crate::{config::PageServerConf, tenant::storage_layer::LayerName};
 
 // TODO: configurable for how long to wait before executing deletions
 
@@ -479,7 +479,7 @@ impl DeletionQueueClient {
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         current_generation: Generation,
-        layers: Vec<(LayerFileName, LayerFileMetadata)>,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
     ) -> Result<(), DeletionQueueError> {
         if current_generation.is_none() {
             debug!("Enqueuing deletions in legacy mode, skipping queue");
@@ -511,7 +511,7 @@ impl DeletionQueueClient {
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         current_generation: Generation,
-        layers: Vec<(LayerFileName, LayerFileMetadata)>,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
     ) -> Result<(), DeletionQueueError> {
         metrics::DELETION_QUEUE
             .keys_submitted
@@ -632,7 +632,7 @@ impl DeletionQueue {
     ///
     /// If remote_storage is None, then the returned workers will also be None.
     pub fn new<C>(
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         control_plane_client: Option<C>,
         conf: &'static PageServerConf,
     ) -> (Self, Option<DeletionQueueWorkers<C>>)
@@ -658,23 +658,6 @@ impl DeletionQueue {
         // longer to flush after Tenants have all been torn down.
         let cancel = CancellationToken::new();
 
-        let remote_storage = match remote_storage {
-            None => {
-                return (
-                    Self {
-                        client: DeletionQueueClient {
-                            tx,
-                            executor_tx,
-                            lsn_table: lsn_table.clone(),
-                        },
-                        cancel,
-                    },
-                    None,
-                )
-            }
-            Some(r) => r,
-        };
-
         (
             Self {
                 client: DeletionQueueClient {
@@ -734,20 +717,20 @@ mod test {
     use crate::{
         control_plane_client::RetryForeverError,
         repository::Key,
-        tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
+        tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
     };
 
     use super::*;
     pub const TIMELINE_ID: TimelineId =
         TimelineId::from_array(hex!("11223344556677881122334455667788"));
 
-    pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName {
+    pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName {
         key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
         lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
     });
 
     // When you need a second layer in a test.
-    pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName {
+    pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName {
         key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
         lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
     });
@@ -765,7 +748,7 @@ mod test {
         /// Simulate a pageserver restart by destroying and recreating the deletion queue
         async fn restart(&mut self) {
             let (deletion_queue, workers) = DeletionQueue::new(
-                Some(self.storage.clone()),
+                self.storage.clone(),
                 Some(self.mock_control_plane.clone()),
                 self.harness.conf,
             );
@@ -797,7 +780,7 @@ mod test {
         /// Returns remote layer file name, suitable for use in assert_remote_files
         fn write_remote_layer(
             &self,
-            file_name: LayerFileName,
+            file_name: LayerName,
             gen: Generation,
         ) -> anyhow::Result<String> {
             let tenant_shard_id = self.harness.tenant_shard_id;
@@ -875,7 +858,7 @@ mod test {
         let mock_control_plane = MockControlPlane::new();
 
         let (deletion_queue, worker) = DeletionQueue::new(
-            Some(storage.clone()),
+            storage.clone(),
             Some(mock_control_plane.clone()),
             harness.conf,
         );
@@ -952,7 +935,7 @@ mod test {
         let client = ctx.deletion_queue.new_client();
         client.recover(HashMap::new())?;
 
-        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
         let tenant_shard_id = ctx.harness.tenant_shard_id;
 
         let content: Vec<u8> = "victim1 contents".into();
diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs
index 3a3d600ac2..ae3b2c9180 100644
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::LayerFileMetadata;
-use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::storage_layer::LayerName;
 use crate::virtual_file::on_fatal_io_error;
 use crate::virtual_file::MaybeFatalIo;
 
@@ -59,7 +59,7 @@ pub(super) struct DeletionOp {
     // `layers` and `objects` are both just lists of objects.  `layers` is used if you do not
     // have a config object handy to project it to a remote key, and need the consuming worker
     // to do it for you.
-    pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
+    pub(super) layers: Vec<(LayerName, LayerFileMetadata)>,
     pub(super) objects: Vec<RemotePath>,
 
     /// The _current_ generation of the Tenant shard attachment in which we are enqueuing
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 400930245b..7f25e49570 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -64,7 +64,7 @@ use crate::{
         mgr::TenantManager,
         remote_timeline_client::LayerFileMetadata,
         secondary::SecondaryTenant,
-        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
+        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
     },
 };
 
@@ -535,17 +535,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                 }
                 EvictionLayer::Secondary(layer) => {
                     let file_size = layer.metadata.file_size();
-                    let tenant_manager = tenant_manager.clone();
 
                     js.spawn(async move {
                         layer
                             .secondary_tenant
-                            .evict_layer(
-                                tenant_manager.get_conf(),
-                                layer.timeline_id,
-                                layer.name,
-                                layer.metadata,
-                            )
+                            .evict_layer(layer.timeline_id, layer.name)
                             .await;
                         Ok(file_size)
                     });
@@ -604,7 +598,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 pub(crate) struct EvictionSecondaryLayer {
     pub(crate) secondary_tenant: Arc<SecondaryTenant>,
     pub(crate) timeline_id: TimelineId,
-    pub(crate) name: LayerFileName,
+    pub(crate) name: LayerName,
     pub(crate) metadata: LayerFileMetadata,
 }
 
@@ -637,9 +631,9 @@ impl EvictionLayer {
         }
     }
 
-    pub(crate) fn get_name(&self) -> LayerFileName {
+    pub(crate) fn get_name(&self) -> LayerName {
         match self {
-            Self::Attached(l) => l.layer_desc().filename(),
+            Self::Attached(l) => l.layer_desc().layer_name(),
             Self::Secondary(sl) => sl.name.clone(),
         }
     }
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index c425f3e628..e5eafc51f4 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -257,6 +257,37 @@ paths:
               schema:
                 $ref: "#/components/schemas/LsnByTimestampResponse"
 
+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    post:
+      description: Obtain lease for the given LSN
+      parameters:
+        - name: lsn
+          in: query
+          required: true
+          schema:
+            type: string
+            format: hex
+          description: A LSN to obtain the lease for
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/LsnLease"
+
   /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
     parameters:
       - name: tenant_id
@@ -420,25 +451,6 @@ paths:
           description: Tenant scheduled to load successfully
 
   /v1/tenant/{tenant_id}/synthetic_size:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-    get:
-      description: |
-        Calculate tenant's synthetic size
-      responses:
-        "200":
-          description: Tenant's synthetic size
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/SyntheticSizeResponse"
-
-  # This route has no handler. TODO: remove?
-  /v1/tenant/{tenant_id}/size:
     parameters:
       - name: tenant_id
         in: path
@@ -468,19 +480,9 @@ paths:
           content:
             application/json:
               schema:
-                type: object
-                required:
-                  - id
-                  - size
-                properties:
-                  id:
-                    type: string
-                    format: hex
-                  size:
-                    type: integer
-                    nullable: true
-                    description: |
-                      Size metric in bytes or null if inputs_only=true was given.
+                $ref: "#/components/schemas/SyntheticSizeResponse"
+            text/html:
+              description: SVG representation of the tenant and it's timelines.
         "401":
           description: Unauthorized Error
           content:
@@ -610,6 +612,80 @@ paths:
               schema:
                 $ref: "#/components/schemas/Error"
 
+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        ŕequired: true
+        schema:
+          type: string
+
+    put:
+      description: |
+        Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`.
+        Current implementation might not be retryable across failure cases, but will be enhanced in future.
+        Detaching should be expected to be expensive operation. Timeouts should be retried.
+      responses:
+        "200":
+          description: |
+            The timeline has been detached from it's ancestor (now or earlier), and at least the returned timelines have been reparented.
+            If any timelines were deleted after reparenting, they might not be on this list.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/AncestorDetached"
+
+        "400":
+          description: |
+            Number of early checks meaning the timeline cannot be detached now:
+              - the ancestor of timeline has an ancestor: not supported, see RFC
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
+        "404":
+          description: Tenant or timeline not found.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+
+        "409":
+          description: |
+            The timeline can never be detached:
+              - timeline has no ancestor, implying that the timeline has never had an ancestor
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+
+        "500":
+          description: |
+            Transient error, for example, pageserver shutdown happened while
+            processing the request but we were unable to distinguish that. Must
+            be retried.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
+        "503":
+          description: |
+            Temporarily unavailable, please retry. Possible reasons:
+              - another timeline detach for the same tenant is underway, please retry later
+              - detected shutdown error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
+
   /v1/tenant/:
     get:
       description: Get tenants list
@@ -929,6 +1005,9 @@ components:
           format: hex
         size:
           type: integer
+          nullable: true
+          description: |
+            Size metric in bytes or null if inputs_only=true was given.
         segment_sizes:
           type: array
           items:
@@ -1006,6 +1085,15 @@ components:
           type: string
           enum: [past, present, future, nodata]
 
+    LsnLease:
+      type: object
+      required:
+        - valid_until
+      properties:
+        valid_until:
+          type: string
+          format: date-time
+
     PageserverUtilization:
       type: object
       required:
@@ -1063,6 +1151,19 @@ components:
           format: int64
           description: How many bytes of layer content were in the latest layer heatmap
 
+    AncestorDetached:
+      type: object
+      required:
+        - reparented_timelines
+      properties:
+        reparented_timelines:
+          type: array
+          description: Set of reparented timeline ids
+          properties:
+            type: string
+            format: hex
+            description: TimelineId
+
 
     Error:
       type: object
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 83b7b8a45e..7b55e88096 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1,6 +1,8 @@
 //!
 //! Management HTTP API
 //!
+use std::cmp::Reverse;
+use std::collections::BinaryHeap;
 use std::collections::HashMap;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -14,6 +16,9 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
+use pageserver_api::models::AuxFilePolicy;
+use pageserver_api::models::IngestAuxFilesRequest;
+use pageserver_api::models::ListAuxFilesRequest;
 use pageserver_api::models::LocationConfig;
 use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::ShardParameters;
@@ -24,7 +29,11 @@ use pageserver_api::models::TenantScanRemoteStorageShard;
 use pageserver_api::models::TenantShardLocation;
 use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
+use pageserver_api::models::TenantSorting;
 use pageserver_api::models::TenantState;
+use pageserver_api::models::TopTenantShardItem;
+use pageserver_api::models::TopTenantShardsRequest;
+use pageserver_api::models::TopTenantShardsResponse;
 use pageserver_api::models::{
     DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
     TenantLoadRequest, TenantLocationConfigRequest,
@@ -63,9 +72,10 @@ use crate::tenant::remote_timeline_client::list_remote_timelines;
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
-use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
+use crate::tenant::GetTimelineError;
 use crate::tenant::SpawnMode;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::{config::PageServerConf, tenant::mgr};
@@ -104,7 +114,7 @@ pub struct State {
     tenant_manager: Arc<TenantManager>,
     auth: Option<Arc<SwappableJwtAuth>>,
     allowlist_routes: Vec<Uri>,
-    remote_storage: Option<GenericRemoteStorage>,
+    remote_storage: GenericRemoteStorage,
     broker_client: storage_broker::BrokerClientChannel,
     disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
     deletion_queue_client: DeletionQueueClient,
@@ -118,7 +128,7 @@ impl State {
         conf: &'static PageServerConf,
         tenant_manager: Arc<TenantManager>,
         auth: Option<Arc<SwappableJwtAuth>>,
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         broker_client: storage_broker::BrokerClientChannel,
         disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
         deletion_queue_client: DeletionQueueClient,
@@ -271,6 +281,13 @@ impl From<GetTenantError> for ApiError {
     }
 }
 
+impl From<GetTimelineError> for ApiError {
+    fn from(gte: GetTimelineError) -> Self {
+        // Rationale: tenant is activated only after eligble timelines activate
+        ApiError::NotFound(gte.into())
+    }
+}
+
 impl From<GetActiveTenantError> for ApiError {
     fn from(e: GetActiveTenantError) -> ApiError {
         match e {
@@ -433,6 +450,8 @@ async fn build_timeline_info_common(
         state,
 
         walreceiver_status,
+
+        last_aux_file_policy: timeline.last_aux_file_policy.load(),
     };
     Ok(info)
 }
@@ -633,9 +652,7 @@ async fn timeline_preserve_initdb_handler(
             .tenant_manager
             .get_attached_tenant_shard(tenant_shard_id)?;
 
-        let timeline = tenant
-            .get_timeline(timeline_id, false)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
+        let timeline = tenant.get_timeline(timeline_id, false)?;
 
         timeline
             .preserve_initdb_archive()
@@ -677,9 +694,7 @@ async fn timeline_detail_handler(
 
         tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
 
-        let timeline = tenant
-            .get_timeline(timeline_id, false)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
+        let timeline = tenant.get_timeline(timeline_id, false)?;
 
         let timeline_info = build_timeline_info(
             &timeline,
@@ -813,12 +828,6 @@ async fn tenant_attach_handler(
 
     let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
 
-    if state.remote_storage.is_none() {
-        return Err(ApiError::BadRequest(anyhow!(
-            "attach_tenant is not possible because pageserver was configured without remote storage"
-        )));
-    }
-
     let tenant_shard_id = TenantShardId::unsharded(tenant_id);
     let shard_params = ShardParameters::default();
     let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
@@ -1229,7 +1238,7 @@ async fn layer_download_handler(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     let layer_file_name = get_request_param(&request, "layer_file_name")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let layer_name = LayerFileName::from_str(layer_file_name)
+    let layer_name = LayerName::from_str(layer_file_name)
         .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
     let state = get_state(&request);
 
@@ -1261,7 +1270,7 @@ async fn evict_timeline_layer_handler(
     let layer_file_name = get_request_param(&request, "layer_file_name")?;
     let state = get_state(&request);
 
-    let layer_name = LayerFileName::from_str(layer_file_name)
+    let layer_name = LayerName::from_str(layer_file_name)
         .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
 
     let timeline =
@@ -1643,12 +1652,6 @@ async fn tenant_time_travel_remote_storage_handler(
         )));
     }
 
-    let Some(storage) = state.remote_storage.as_ref() else {
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "remote storage not configured, cannot run time travel"
-        )));
-    };
-
     if timestamp > done_if_after {
         return Err(ApiError::BadRequest(anyhow!(
             "The done_if_after timestamp comes before the timestamp to recover to"
@@ -1658,7 +1661,7 @@ async fn tenant_time_travel_remote_storage_handler(
     tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}");
 
     remote_timeline_client::upload::time_travel_recover_tenant(
-        storage,
+        &state.remote_storage,
         &tenant_shard_id,
         timestamp,
         done_if_after,
@@ -1703,6 +1706,32 @@ async fn handle_tenant_break(
     json_response(StatusCode::OK, ())
 }
 
+// Obtains an lsn lease on the given timeline.
+async fn lsn_lease_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let lsn: Lsn = parse_query_param(&request, "lsn")?
+        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+
+    let state = get_state(&request);
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+    let result = timeline
+        .make_lsn_lease(lsn, &ctx)
+        .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
+
+    json_response(StatusCode::OK, result)
+}
+
 // Run GC immediately on given timeline.
 async fn timeline_gc_handler(
     mut request: Request<Body>,
@@ -1715,12 +1744,7 @@ async fn timeline_gc_handler(
     let gc_req: TimelineGcRequest = json_request(&mut request).await?;
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
-    let gc_result = wait_task_done
-        .await
-        .context("wait for gc task")
-        .map_err(ApiError::InternalServerError)?
-        .map_err(ApiError::InternalServerError)?;
+    let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
 
     json_response(StatusCode::OK, gc_result)
 }
@@ -1743,6 +1767,8 @@ async fn timeline_compact_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
         flags |= CompactFlags::ForceImageLayerCreation;
     }
+    let wait_until_uploaded =
+        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1751,6 +1777,9 @@ async fn timeline_compact_handler(
             .compact(&cancel, flags, &ctx)
             .await
             .map_err(|e| ApiError::InternalServerError(e.into()))?;
+        if wait_until_uploaded {
+            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
+        }
         json_response(StatusCode::OK, ())
     }
     .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1775,6 +1804,8 @@ async fn timeline_checkpoint_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
         flags |= CompactFlags::ForceImageLayerCreation;
     }
+    let wait_until_uploaded =
+        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1788,6 +1819,10 @@ async fn timeline_checkpoint_handler(
             .await
             .map_err(|e| ApiError::InternalServerError(e.into()))?;
 
+        if wait_until_uploaded {
+            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
+        }
+
         json_response(StatusCode::OK, ())
     }
     .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1871,14 +1906,11 @@ async fn timeline_detach_ancestor_handler(
         let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
         let ctx = &ctx;
 
-        let timeline = tenant
-            .get_timeline(timeline_id, true)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
+        let timeline = tenant.get_timeline(timeline_id, true)?;
 
         let (_guard, prepared) = timeline
             .prepare_to_detach_from_ancestor(&tenant, options, ctx)
-            .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            .await?;
 
         let res = state
             .tenant_manager
@@ -1908,11 +1940,6 @@ async fn deletion_queue_flush(
 ) -> Result<Response<Body>, ApiError> {
     let state = get_state(&r);
 
-    if state.remote_storage.is_none() {
-        // Nothing to do if remote storage is disabled.
-        return json_response(StatusCode::OK, ());
-    }
-
     let execute = parse_query_param(&r, "execute")?.unwrap_or(false);
 
     let flush = async {
@@ -2017,9 +2044,7 @@ async fn active_timeline_of_active_tenant(
 
     tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
 
-    tenant
-        .get_timeline(timeline_id, true)
-        .map_err(|e| ApiError::NotFound(e.into()))
+    Ok(tenant.get_timeline(timeline_id, true)?)
 }
 
 async fn always_panic_handler(
@@ -2077,18 +2102,11 @@ async fn disk_usage_eviction_run(
     };
 
     let state = get_state(&r);
-
-    let Some(storage) = state.remote_storage.as_ref() else {
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "remote storage not configured, cannot run eviction iteration"
-        )));
-    };
-
     let eviction_state = state.disk_usage_eviction_state.clone();
 
     let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
         &eviction_state,
-        storage,
+        &state.remote_storage,
         usage,
         &state.tenant_manager,
         config.eviction_order,
@@ -2125,29 +2143,23 @@ async fn tenant_scan_remote_handler(
     let state = get_state(&request);
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
 
-    let Some(remote_storage) = state.remote_storage.as_ref() else {
-        return Err(ApiError::BadRequest(anyhow::anyhow!(
-            "Remote storage not configured"
-        )));
-    };
-
     let mut response = TenantScanRemoteStorageResponse::default();
 
     let (shards, _other_keys) =
-        list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone())
+        list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone())
             .await
             .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
 
     for tenant_shard_id in shards {
         let (timeline_ids, _other_keys) =
-            list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone())
+            list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone())
                 .await
                 .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
 
         let mut generation = Generation::none();
         for timeline_id in timeline_ids {
             match download_index_part(
-                remote_storage,
+                &state.remote_storage,
                 &tenant_shard_id,
                 &timeline_id,
                 Generation::MAX,
@@ -2296,6 +2308,31 @@ async fn post_tracing_event_handler(
     json_response(StatusCode::OK, ())
 }
 
+async fn force_aux_policy_switch_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
+    let policy: AuxFilePolicy = json_request(&mut r).await?;
+
+    let state = get_state(&r);
+
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+    timeline
+        .do_switch_aux_policy(policy)
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn put_io_engine_handler(
     mut r: Request<Body>,
     _cancel: CancellationToken,
@@ -2358,6 +2395,150 @@ async fn get_utilization(
         .map_err(ApiError::InternalServerError)
 }
 
+async fn list_aux_files(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let body: ListAuxFilesRequest = json_request(&mut request).await?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let files = timeline.list_aux_files(body.lsn, &ctx).await?;
+    json_response(StatusCode::OK, files)
+}
+
+async fn ingest_aux_files(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let body: IngestAuxFilesRequest = json_request(&mut request).await?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    let mut modification = timeline.begin_modification(
+        Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
+    );
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    for (fname, content) in body.aux_files {
+        modification
+            .put_file(&fname, content.as_bytes(), &ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+    }
+    modification
+        .commit(&ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
+/// Report on the largest tenants on this pageserver, for the storage controller to identify
+/// candidates for splitting
+async fn post_top_tenants(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let request: TopTenantShardsRequest = json_request(&mut r).await?;
+    let state = get_state(&r);
+
+    fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 {
+        match order_by {
+            TenantSorting::ResidentSize => sizes.resident_size,
+            TenantSorting::MaxLogicalSize => sizes.max_logical_size,
+        }
+    }
+
+    #[derive(Eq, PartialEq)]
+    struct HeapItem {
+        metric: u64,
+        sizes: TopTenantShardItem,
+    }
+
+    impl PartialOrd for HeapItem {
+        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+            Some(self.cmp(other))
+        }
+    }
+
+    /// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which
+    /// supports popping the greatest item but not the smallest.
+    impl Ord for HeapItem {
+        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+            Reverse(self.metric).cmp(&Reverse(other.metric))
+        }
+    }
+
+    let mut top_n: BinaryHeap<HeapItem> = BinaryHeap::with_capacity(request.limit);
+
+    // FIXME: this is a lot of clones to take this tenant list
+    for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() {
+        if let Some(shards_lt) = request.where_shards_lt {
+            // Ignore tenants which already have >= this many shards
+            if tenant_shard_id.shard_count >= shards_lt {
+                continue;
+            }
+        }
+
+        let sizes = match tenant_slot {
+            TenantSlot::Attached(tenant) => tenant.get_sizes(),
+            TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
+                continue;
+            }
+        };
+        let metric = get_size_metric(&sizes, &request.order_by);
+
+        if let Some(gt) = request.where_gt {
+            // Ignore tenants whose metric is <= the lower size threshold, to do less sorting work
+            if metric <= gt {
+                continue;
+            }
+        };
+
+        match top_n.peek() {
+            None => {
+                // Top N list is empty: candidate becomes first member
+                top_n.push(HeapItem { metric, sizes });
+            }
+            Some(i) if i.metric > metric && top_n.len() < request.limit => {
+                // Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end
+                top_n.push(HeapItem { metric, sizes });
+            }
+            Some(i) if i.metric > metric => {
+                // List is at limit and lowest value is greater than our candidate, drop it.
+            }
+            Some(_) => top_n.push(HeapItem { metric, sizes }),
+        }
+
+        while top_n.len() > request.limit {
+            top_n.pop();
+        }
+    }
+
+    json_response(
+        StatusCode::OK,
+        TopTenantShardsResponse {
+            shards: top_n.into_iter().map(|i| i.sizes).collect(),
+        },
+    )
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2570,6 +2751,10 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
             |r| api_handler(r, get_timestamp_of_lsn_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
+            |r| api_handler(r, lsn_lease_handler),
+        )
         .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
             |r| api_handler(r, timeline_gc_handler),
@@ -2643,6 +2828,19 @@ pub fn make_router(
             |r| api_handler(r, timeline_collect_keyspace),
         )
         .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
+            |r| api_handler(r, force_aux_policy_switch_handler),
+        )
         .get("/v1/utilization", |r| api_handler(r, get_utilization))
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
+            |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
+        )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files",
+            |r| testing_api_handler("list_aux_files", r, list_aux_files),
+        )
+        .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
         .any(handler_404))
 }
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 930700e50c..c69fb8c83b 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -57,7 +57,7 @@ pub use crate::metrics::preinitialize_metrics;
 #[tracing::instrument(skip_all, fields(%exit_code))]
 pub async fn shutdown_pageserver(
     tenant_manager: &TenantManager,
-    deletion_queue: Option<DeletionQueue>,
+    mut deletion_queue: DeletionQueue,
     exit_code: i32,
 ) {
     use std::time::Duration;
@@ -89,9 +89,7 @@ pub async fn shutdown_pageserver(
     .await;
 
     // Best effort to persist any outstanding deletions, to avoid leaking objects
-    if let Some(mut deletion_queue) = deletion_queue {
-        deletion_queue.shutdown(Duration::from_secs(5)).await;
-    }
+    deletion_queue.shutdown(Duration::from_secs(5)).await;
 
     // Shut down the HTTP endpoint last, so that you can still check the server's
     // status while it's shutting down.
@@ -114,10 +112,6 @@ pub async fn shutdown_pageserver(
     std::process::exit(exit_code);
 }
 
-/// The name of the metadata file pageserver creates per timeline.
-/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
-pub const METADATA_FILE_NAME: &str = "metadata";
-
 /// Per-tenant configuration file.
 /// Full path: `tenants/<tenant_id>/config`.
 pub(crate) const TENANT_CONFIG_NAME: &str = "config";
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 256f2f334c..4f2c75d308 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -525,6 +525,15 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_standby_horizon",
+        "Standby apply LSN for which GC is hold off, by timeline.",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_resident_physical_size",
@@ -585,6 +594,15 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define current logical size metric")
 });
 
+static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_aux_file_estimated_size",
+        "The size of all aux files for a timeline in aux file v2 store.",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) mod initial_logical_size {
     use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
     use once_cell::sync::Lazy;
@@ -1849,7 +1867,6 @@ pub(crate) struct WalIngestMetrics {
     pub(crate) records_received: IntCounter,
     pub(crate) records_committed: IntCounter,
     pub(crate) records_filtered: IntCounter,
-    pub(crate) time_spent_on_ingest: Histogram,
 }
 
 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -1873,12 +1890,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
         "Number of WAL records filtered out due to sharding"
     )
     .expect("failed to define a metric"),
-    time_spent_on_ingest: register_histogram!(
-        "pageserver_wal_ingest_put_value_seconds",
-        "Actual time spent on ingesting a record",
-        redo_histogram_time_buckets!(),
-    )
-    .expect("failed to define a metric"),
 });
 
 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -1990,29 +2001,6 @@ impl Default for WalRedoProcessCounters {
 pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
     Lazy::new(WalRedoProcessCounters::default);
 
-#[cfg(not(test))]
-pub mod wal_redo {
-    use super::*;
-
-    static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
-        std::sync::Mutex::new(
-            register_uint_gauge_vec!(
-                "pageserver_wal_redo_process_kind",
-                "The configured process kind for walredo",
-                &["kind"],
-            )
-            .unwrap(),
-        )
-    });
-
-    pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
-        // use guard to avoid races around the next two steps
-        let guard = PROCESS_KIND.lock().unwrap();
-        guard.reset();
-        guard.with_label_values(&[&format!("{kind}")]).set(1);
-    }
-}
-
 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
 pub(crate) struct StorageTimeMetricsTimer {
     metrics: StorageTimeMetrics,
@@ -2112,9 +2100,11 @@ pub(crate) struct TimelineMetrics {
     pub garbage_collect_histo: StorageTimeMetrics,
     pub find_gc_cutoffs_histo: StorageTimeMetrics,
     pub last_record_gauge: IntGauge,
-    resident_physical_size_gauge: UIntGauge,
+    pub standby_horizon_gauge: IntGauge,
+    pub resident_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
+    pub aux_file_size_gauge: IntGauge,
     pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
     pub evictions: IntCounter,
     pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
@@ -2180,6 +2170,9 @@ impl TimelineMetrics {
         let last_record_gauge = LAST_RECORD_LSN
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
+        let standby_horizon_gauge = STANDBY_HORIZON
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
         let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
@@ -2187,6 +2180,9 @@ impl TimelineMetrics {
         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
+        let aux_file_size_gauge = AUX_FILE_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
         // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
         let directory_entries_count_gauge_closure = {
             let tenant_shard_id = *tenant_shard_id;
@@ -2222,8 +2218,10 @@ impl TimelineMetrics {
             find_gc_cutoffs_histo,
             load_layer_map_histo,
             last_record_gauge,
+            standby_horizon_gauge,
             resident_physical_size_gauge,
             current_logical_size_gauge,
+            aux_file_size_gauge,
             directory_entries_count_gauge,
             evictions,
             evictions_with_low_residence_duration: std::sync::RwLock::new(
@@ -2255,6 +2253,7 @@ impl TimelineMetrics {
         let timeline_id = &self.timeline_id;
         let shard_id = &self.shard_id;
         let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         {
             RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
             let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -2264,6 +2263,7 @@ impl TimelineMetrics {
             let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
         let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
 
         self.evictions_with_low_residence_duration
             .write()
@@ -2320,6 +2320,7 @@ use pin_project_lite::pin_project;
 use std::collections::HashMap;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
+use std::sync::atomic::AtomicU64;
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::{Duration, Instant};
@@ -2329,35 +2330,35 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
 
 /// Maintain a per timeline gauge in addition to the global gauge.
-struct PerTimelineRemotePhysicalSizeGauge {
-    last_set: u64,
+pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
+    last_set: AtomicU64,
     gauge: UIntGauge,
 }
 
 impl PerTimelineRemotePhysicalSizeGauge {
     fn new(per_timeline_gauge: UIntGauge) -> Self {
         Self {
-            last_set: per_timeline_gauge.get(),
+            last_set: AtomicU64::new(0),
             gauge: per_timeline_gauge,
         }
     }
-    fn set(&mut self, sz: u64) {
+    pub(crate) fn set(&self, sz: u64) {
         self.gauge.set(sz);
-        if sz < self.last_set {
-            REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
+        let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed);
+        if sz < prev {
+            REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz);
         } else {
-            REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
+            REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev);
         };
-        self.last_set = sz;
     }
-    fn get(&self) -> u64 {
+    pub(crate) fn get(&self) -> u64 {
         self.gauge.get()
     }
 }
 
 impl Drop for PerTimelineRemotePhysicalSizeGauge {
     fn drop(&mut self) {
-        REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
+        REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed));
     }
 }
 
@@ -2365,7 +2366,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
     tenant_id: String,
     shard_id: String,
     timeline_id: String,
-    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
+    pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge,
     calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
     bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
     bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -2373,38 +2374,27 @@ pub(crate) struct RemoteTimelineClientMetrics {
 
 impl RemoteTimelineClientMetrics {
     pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
+        let tenant_id_str = tenant_shard_id.tenant_id.to_string();
+        let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
+        let timeline_id_str = timeline_id.to_string();
+
+        let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new(
+            REMOTE_PHYSICAL_SIZE
+                .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
+                .unwrap(),
+        );
+
         RemoteTimelineClientMetrics {
-            tenant_id: tenant_shard_id.tenant_id.to_string(),
-            shard_id: format!("{}", tenant_shard_id.shard_slug()),
-            timeline_id: timeline_id.to_string(),
+            tenant_id: tenant_id_str,
+            shard_id: shard_id_str,
+            timeline_id: timeline_id_str,
             calls: Mutex::new(HashMap::default()),
             bytes_started_counter: Mutex::new(HashMap::default()),
             bytes_finished_counter: Mutex::new(HashMap::default()),
-            remote_physical_size_gauge: Mutex::new(None),
+            remote_physical_size_gauge,
         }
     }
 
-    pub(crate) fn remote_physical_size_set(&self, sz: u64) {
-        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
-        let gauge = guard.get_or_insert_with(|| {
-            PerTimelineRemotePhysicalSizeGauge::new(
-                REMOTE_PHYSICAL_SIZE
-                    .get_metric_with_label_values(&[
-                        &self.tenant_id,
-                        &self.shard_id,
-                        &self.timeline_id,
-                    ])
-                    .unwrap(),
-            )
-        });
-        gauge.set(sz);
-    }
-
-    pub(crate) fn remote_physical_size_get(&self) -> u64 {
-        let guard = self.remote_physical_size_gauge.lock().unwrap();
-        guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
-    }
-
     pub fn remote_operation_time(
         &self,
         file_kind: &RemoteOpFileKind,
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index f6b251283c..d250864fd6 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -19,6 +19,7 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::ShardIndex;
 use pageserver_api::shard::ShardNumber;
+use pageserver_api::shard::TenantShardId;
 use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
@@ -32,6 +33,8 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use std::time::Instant;
+use std::time::SystemTime;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
@@ -49,7 +52,6 @@ use utils::{
 use crate::auth::check_permission;
 use crate::basebackup;
 use crate::basebackup::BasebackupError;
-use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
@@ -59,13 +61,15 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
-use crate::tenant::mgr;
-use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
+use crate::tenant::mgr::GetTenantError;
+use crate::tenant::mgr::ShardResolveResult;
 use crate::tenant::mgr::ShardSelector;
+use crate::tenant::mgr::TenantManager;
 use crate::tenant::timeline::WaitLsnError;
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
+use crate::tenant::Tenant;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;
 use pageserver_api::key::rel_block_to_key;
@@ -135,7 +139,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 /// Listens for connections, and launches a new handler task for each.
 ///
 pub async fn libpq_listener_main(
-    conf: &'static PageServerConf,
+    tenant_manager: Arc<TenantManager>,
     broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     listener: TcpListener,
@@ -180,7 +184,7 @@ pub async fn libpq_listener_main(
                     "serving compute connection task",
                     false,
                     page_service_conn_main(
-                        conf,
+                        tenant_manager.clone(),
                         broker_client.clone(),
                         local_auth,
                         socket,
@@ -203,7 +207,7 @@ pub async fn libpq_listener_main(
 
 #[instrument(skip_all, fields(peer_addr))]
 async fn page_service_conn_main(
-    conf: &'static PageServerConf,
+    tenant_manager: Arc<TenantManager>,
     broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     socket: tokio::net::TcpStream,
@@ -260,7 +264,8 @@ async fn page_service_conn_main(
     // and create a child per-query context when it invokes process_query.
     // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
     // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
+    let mut conn_handler =
+        PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
     let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
 
     match pgbackend
@@ -291,11 +296,12 @@ struct HandlerTimeline {
 }
 
 struct PageServerHandler {
-    _conf: &'static PageServerConf,
     broker_client: storage_broker::BrokerClientChannel,
     auth: Option<Arc<SwappableJwtAuth>>,
     claims: Option<Claims>,
 
+    tenant_manager: Arc<TenantManager>,
+
     /// The context created for the lifetime of the connection
     /// services by this PageServerHandler.
     /// For each query received over the connection,
@@ -381,13 +387,13 @@ impl From<WaitLsnError> for QueryError {
 
 impl PageServerHandler {
     pub fn new(
-        conf: &'static PageServerConf,
+        tenant_manager: Arc<TenantManager>,
         broker_client: storage_broker::BrokerClientChannel,
         auth: Option<Arc<SwappableJwtAuth>>,
         connection_ctx: RequestContext,
     ) -> Self {
         PageServerHandler {
-            _conf: conf,
+            tenant_manager,
             broker_client,
             auth,
             claims: None,
@@ -552,13 +558,9 @@ impl PageServerHandler {
     {
         debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
 
-        let tenant = mgr::get_active_tenant_with_timeout(
-            tenant_id,
-            ShardSelector::First,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await?;
+        let tenant = self
+            .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
+            .await?;
 
         // Make request tracer if needed
         let mut tracer = if tenant.get_trace_read_requests() {
@@ -726,13 +728,9 @@ impl PageServerHandler {
 
         // Create empty timeline
         info!("creating new timeline");
-        let tenant = get_active_tenant_with_timeout(
-            tenant_id,
-            ShardSelector::Zero,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await?;
+        let tenant = self
+            .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
+            .await?;
         let timeline = tenant
             .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
             .await?;
@@ -909,6 +907,39 @@ impl PageServerHandler {
         }
     }
 
+    #[instrument(skip_all, fields(shard_id, %lsn))]
+    async fn handle_make_lsn_lease<IO>(
+        &self,
+        pgb: &mut PostgresBackend<IO>,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        let shard_selector = ShardSelector::Known(tenant_shard_id.to_index());
+        let timeline = self
+            .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
+            .await?;
+        let lease = timeline.make_lsn_lease(lsn, ctx)?;
+        let valid_until = lease
+            .valid_until
+            .duration_since(SystemTime::UNIX_EPOCH)
+            .map_err(|e| QueryError::Other(e.into()))?;
+
+        pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
+            b"valid_until",
+        )]))?
+        .write_message_noflush(&BeMessage::DataRow(&[Some(
+            &valid_until.as_millis().to_be_bytes(),
+        )]))?
+        .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+
+        Ok(())
+    }
+
     #[instrument(skip_all, fields(shard_id))]
     async fn handle_get_rel_exists_request(
         &mut self,
@@ -1370,18 +1401,69 @@ impl PageServerHandler {
         timeline_id: TimelineId,
         selector: ShardSelector,
     ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
-        let tenant = get_active_tenant_with_timeout(
-            tenant_id,
-            selector,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await
-        .map_err(GetActiveTimelineError::Tenant)?;
+        let tenant = self
+            .get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT)
+            .await
+            .map_err(GetActiveTimelineError::Tenant)?;
         let timeline = tenant.get_timeline(timeline_id, true)?;
         set_tracing_field_shard_id(&timeline);
         Ok(timeline)
     }
+
+    /// Get a shard's [`Tenant`] in its active state, if present.  If we don't find the shard and some
+    /// slots for this tenant are `InProgress` then we will wait.
+    /// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait.
+    ///
+    /// `timeout` is used as a total timeout for the whole wait operation.
+    async fn get_active_tenant_with_timeout(
+        &self,
+        tenant_id: TenantId,
+        shard_selector: ShardSelector,
+        timeout: Duration,
+    ) -> Result<Arc<Tenant>, GetActiveTenantError> {
+        let wait_start = Instant::now();
+        let deadline = wait_start + timeout;
+
+        // Resolve TenantId to TenantShardId.  This is usually a quick one-shot thing, the loop is
+        // for handling the rare case that the slot we're accessing is InProgress.
+        let tenant_shard = loop {
+            let resolved = self
+                .tenant_manager
+                .resolve_attached_shard(&tenant_id, shard_selector);
+            match resolved {
+                ShardResolveResult::Found(tenant_shard) => break tenant_shard,
+                ShardResolveResult::NotFound => {
+                    return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
+                        tenant_id,
+                    )));
+                }
+                ShardResolveResult::InProgress(barrier) => {
+                    // We can't authoritatively answer right now: wait for InProgress state
+                    // to end, then try again
+                    tokio::select! {
+                        _ = self.await_connection_cancelled() => {
+                            return Err(GetActiveTenantError::Cancelled)
+                        },
+                        _  = barrier.wait() => {
+                            // The barrier completed: proceed around the loop to try looking up again
+                        },
+                        _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
+                            return Err(GetActiveTenantError::WaitForActiveTimeout {
+                                latest_state: None,
+                                wait_time: timeout,
+                            });
+                        }
+                    }
+                }
+            };
+        };
+
+        tracing::debug!("Waiting for tenant to enter active state...");
+        tenant_shard
+            .wait_to_become_active(deadline.duration_since(Instant::now()))
+            .await?;
+        Ok(tenant_shard)
+    }
 }
 
 #[async_trait::async_trait]
@@ -1439,9 +1521,8 @@ where
 
         let ctx = self.connection_ctx.attached_child();
         debug!("process query {query_string:?}");
-        if query_string.starts_with("pagestream_v2 ") {
-            let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
-            let params = params_raw.split(' ').collect::<Vec<_>>();
+        let parts = query_string.split_whitespace().collect::<Vec<_>>();
+        if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
             if params.len() != 2 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for pagestream command"
@@ -1466,9 +1547,7 @@ where
                 ctx,
             )
             .await?;
-        } else if query_string.starts_with("pagestream ") {
-            let (_, params_raw) = query_string.split_at("pagestream ".len());
-            let params = params_raw.split(' ').collect::<Vec<_>>();
+        } else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
             if params.len() != 2 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for pagestream command"
@@ -1493,10 +1572,7 @@ where
                 ctx,
             )
             .await?;
-        } else if query_string.starts_with("basebackup ") {
-            let (_, params_raw) = query_string.split_at("basebackup ".len());
-            let params = params_raw.split_whitespace().collect::<Vec<_>>();
-
+        } else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
             if params.len() < 2 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for basebackup command"
@@ -1514,26 +1590,23 @@ where
 
             self.check_permission(Some(tenant_id))?;
 
-            let lsn = if params.len() >= 3 {
+            let lsn = if let Some(lsn_str) = params.get(2) {
                 Some(
-                    Lsn::from_str(params[2])
-                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
+                    Lsn::from_str(lsn_str)
+                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
                 )
             } else {
                 None
             };
 
-            let gzip = if params.len() >= 4 {
-                if params[3] == "--gzip" {
-                    true
-                } else {
+            let gzip = match params.get(3) {
+                Some(&"--gzip") => true,
+                None => false,
+                Some(third_param) => {
                     return Err(QueryError::Other(anyhow::anyhow!(
-                        "Parameter in position 3 unknown {}",
-                        params[3],
-                    )));
+                        "Parameter in position 3 unknown {third_param}",
+                    )))
                 }
-            } else {
-                false
             };
 
             let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
@@ -1557,10 +1630,7 @@ where
             res?;
         }
         // return pair of prev_lsn and last_lsn
-        else if query_string.starts_with("get_last_record_rlsn ") {
-            let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
-            let params = params_raw.split_whitespace().collect::<Vec<_>>();
-
+        else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
             if params.len() != 2 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for get_last_record_rlsn command"
@@ -1602,10 +1672,7 @@ where
             .await?;
         }
         // same as basebackup, but result includes relational data as well
-        else if query_string.starts_with("fullbackup ") {
-            let (_, params_raw) = query_string.split_at("fullbackup ".len());
-            let params = params_raw.split_whitespace().collect::<Vec<_>>();
-
+        else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
             if params.len() < 2 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for fullbackup command"
@@ -1622,18 +1689,18 @@ where
                 .record("timeline_id", field::display(timeline_id));
 
             // The caller is responsible for providing correct lsn and prev_lsn.
-            let lsn = if params.len() > 2 {
+            let lsn = if let Some(lsn_str) = params.get(2) {
                 Some(
-                    Lsn::from_str(params[2])
-                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
+                    Lsn::from_str(lsn_str)
+                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
                 )
             } else {
                 None
             };
-            let prev_lsn = if params.len() > 3 {
+            let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
                 Some(
-                    Lsn::from_str(params[3])
-                        .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
+                    Lsn::from_str(prev_lsn_str)
+                        .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
                 )
             } else {
                 None
@@ -1666,8 +1733,7 @@ where
             // 2. Run:
             // cat my_backup/base.tar | psql -h $PAGESERVER \
             //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
-            let (_, params_raw) = query_string.split_at("import basebackup ".len());
-            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+            let params = &parts[2..];
             if params.len() != 5 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for import basebackup command"
@@ -1716,8 +1782,7 @@ where
             //
             // Files are scheduled to be persisted to remote storage, and the
             // caller should poll the http api to check when that is done.
-            let (_, params_raw) = query_string.split_at("import wal ".len());
-            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+            let params = &parts[2..];
             if params.len() != 4 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for import wal command"
@@ -1755,10 +1820,45 @@ where
             // important because psycopg2 executes "SET datestyle TO 'ISO'"
             // on connect
             pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("show ") {
+        } else if query_string.starts_with("lease lsn ") {
+            let params = &parts[2..];
+            if params.len() != 3 {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "invalid param number {} for lease lsn command",
+                    params.len()
+                )));
+            }
+
+            let tenant_shard_id = TenantShardId::from_str(params[0])
+                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
+            let timeline_id = TimelineId::from_str(params[1])
+                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
+
+            tracing::Span::current()
+                .record("tenant_id", field::display(tenant_shard_id))
+                .record("timeline_id", field::display(timeline_id));
+
+            self.check_permission(Some(tenant_shard_id.tenant_id))?;
+
+            // The caller is responsible for providing correct lsn.
+            let lsn = Lsn::from_str(params[2])
+                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
+
+            match self
+                .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
+                .await
+            {
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Err(e) => {
+                    error!("error obtaining lsn lease for {lsn}: {e:?}");
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?
+                }
+            };
+        } else if let Some(params) = parts.strip_prefix(&["show"]) {
             // show <tenant_id>
-            let (_, params_raw) = query_string.split_at("show ".len());
-            let params = params_raw.split(' ').collect::<Vec<_>>();
             if params.len() != 1 {
                 return Err(QueryError::Other(anyhow::anyhow!(
                     "invalid param number for config command"
@@ -1771,13 +1871,13 @@ where
 
             self.check_permission(Some(tenant_id))?;
 
-            let tenant = get_active_tenant_with_timeout(
-                tenant_id,
-                ShardSelector::Zero,
-                ACTIVE_TENANT_TIMEOUT,
-                &task_mgr::shutdown_token(),
-            )
-            .await?;
+            let tenant = self
+                .get_active_tenant_with_timeout(
+                    tenant_id,
+                    ShardSelector::Zero,
+                    ACTIVE_TENANT_TIMEOUT,
+                )
+                .await?;
             pgb.write_message_noflush(&BeMessage::RowDescription(&[
                 RowDescriptor::int8_col(b"checkpoint_distance"),
                 RowDescriptor::int8_col(b"checkpoint_timeout"),
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index a4215ee107..afba34c6d1 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,7 +9,6 @@
 use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::metrics::WAL_INGEST;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use crate::{aux_file, repository::*};
@@ -35,12 +34,16 @@ use std::ops::ControlFlow;
 use std::ops::Range;
 use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, trace, warn};
+use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};
 
-const MAX_AUX_FILE_DELTAS: usize = 1024;
+/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
+pub const MAX_AUX_FILE_DELTAS: usize = 1024;
+
+/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
+pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
 
 #[derive(Debug)]
 pub enum LsnForTimestamp {
@@ -699,13 +702,17 @@ impl Timeline {
             .await
             .context("scan")?;
         let mut result = HashMap::new();
+        let mut sz = 0;
         for (_, v) in kv {
             let v = v.context("get value")?;
             let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
             for (fname, content) in v {
+                sz += fname.len();
+                sz += content.len();
                 result.insert(fname, content);
             }
         }
+        self.aux_file_size_estimator.on_base_backup(sz);
         Ok(result)
     }
 
@@ -714,10 +721,11 @@ impl Timeline {
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
-        match self.get_switch_aux_file_policy() {
-            AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
-            AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
-            AuxFilePolicy::CrossValidation => {
+        let current_policy = self.last_aux_file_policy.load();
+        match current_policy {
+            Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
+            Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
+            Some(AuxFilePolicy::CrossValidation) => {
                 let v1_result = self.list_aux_files_v1(lsn, ctx).await;
                 let v2_result = self.list_aux_files_v2(lsn, ctx).await;
                 match (v1_result, v2_result) {
@@ -1465,7 +1473,40 @@ impl<'a> DatadirModification<'a> {
         content: &[u8],
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        let policy = self.tline.get_switch_aux_file_policy();
+        let switch_policy = self.tline.get_switch_aux_file_policy();
+
+        let policy = {
+            let current_policy = self.tline.last_aux_file_policy.load();
+            // Allowed switch path:
+            // * no aux files -> v1/v2/cross-validation
+            // * cross-validation->v2
+
+            let current_policy = if current_policy.is_none() {
+                // This path will only be hit once per tenant: we will decide the final policy in this code block.
+                // The next call to `put_file` will always have `last_aux_file_policy != None`.
+                let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+                let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
+                if aux_files_key_v1.is_empty() {
+                    None
+                } else {
+                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
+                    Some(AuxFilePolicy::V1)
+                }
+            } else {
+                current_policy
+            };
+
+            if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
+                self.tline.do_switch_aux_policy(switch_policy)?;
+                info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
+                switch_policy
+            } else {
+                // This branch handles non-valid migration path, and the case that switch_policy == current_policy.
+                // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit.
+                current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
+            }
+        };
+
         if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
             let key = aux_file::encode_aux_file_key(path);
             // retrieve the key from the engine
@@ -1474,23 +1515,45 @@ impl<'a> DatadirModification<'a> {
                 Err(PageReconstructError::MissingKey(_)) => None,
                 Err(e) => return Err(e.into()),
             };
-            let files = if let Some(ref old_val) = old_val {
+            let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
                 aux_file::decode_file_value(old_val)?
             } else {
                 Vec::new()
             };
-            let new_files = if content.is_empty() {
-                files
-                    .into_iter()
-                    .filter(|(p, _)| &path != p)
-                    .collect::<Vec<_>>()
-            } else {
-                files
-                    .into_iter()
-                    .filter(|(p, _)| &path != p)
-                    .chain(std::iter::once((path, content)))
-                    .collect::<Vec<_>>()
-            };
+            let mut other_files = Vec::with_capacity(files.len());
+            let mut modifying_file = None;
+            for file @ (p, content) in files {
+                if path == p {
+                    assert!(
+                        modifying_file.is_none(),
+                        "duplicated entries found for {}",
+                        path
+                    );
+                    modifying_file = Some(content);
+                } else {
+                    other_files.push(file);
+                }
+            }
+            let mut new_files = other_files;
+            match (modifying_file, content.is_empty()) {
+                (Some(old_content), false) => {
+                    self.tline
+                        .aux_file_size_estimator
+                        .on_update(old_content.len(), content.len());
+                    new_files.push((path, content));
+                }
+                (Some(old_content), true) => {
+                    self.tline
+                        .aux_file_size_estimator
+                        .on_remove(old_content.len());
+                    // not adding the file key to the final `new_files` vec.
+                }
+                (None, false) => {
+                    self.tline.aux_file_size_estimator.on_add(content.len());
+                    new_files.push((path, content));
+                }
+                (None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
+            }
             let new_val = aux_file::encode_file_value(&new_files)?;
             self.put(key, Value::Image(new_val.into()));
         }
@@ -1651,8 +1714,6 @@ impl<'a> DatadirModification<'a> {
     pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
         let mut writer = self.tline.writer().await;
 
-        let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
-
         let pending_nblocks = self.pending_nblocks;
         self.pending_nblocks = 0;
 
@@ -1671,7 +1732,7 @@ impl<'a> DatadirModification<'a> {
         }
 
         if !self.pending_deletions.is_empty() {
-            writer.delete_batch(&self.pending_deletions).await?;
+            writer.delete_batch(&self.pending_deletions, ctx).await?;
             self.pending_deletions.clear();
         }
 
@@ -1692,8 +1753,6 @@ impl<'a> DatadirModification<'a> {
             writer.update_directory_entries_count(kind, count as u64);
         }
 
-        timer.observe_duration();
-
         Ok(())
     }
 
@@ -1729,6 +1788,12 @@ impl<'a> DatadirModification<'a> {
         self.tline.get(key, lsn, ctx).await
     }
 
+    /// Only used during unit tests, force putting a key into the modification.
+    #[cfg(test)]
+    pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
+        self.put(key, val);
+    }
+
     fn put(&mut self, key: Key, val: Value) {
         let values = self.pending_updates.entry(key).or_default();
         // Replace the previous value if it exists at the same lsn
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 010e56a899..540eb10ed2 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -20,7 +20,9 @@ use futures::stream::FuturesUnordered;
 use futures::FutureExt;
 use futures::StreamExt;
 use pageserver_api::models;
+use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::TimelineState;
+use pageserver_api::models::TopTenantShardItem;
 use pageserver_api::models::WalRedoManagerStatus;
 use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::ShardStripeSize;
@@ -190,7 +192,7 @@ pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
 #[derive(Clone)]
 pub struct TenantSharedResources {
     pub broker_client: storage_broker::BrokerClientChannel,
-    pub remote_storage: Option<GenericRemoteStorage>,
+    pub remote_storage: GenericRemoteStorage,
     pub deletion_queue_client: DeletionQueueClient,
 }
 
@@ -292,7 +294,7 @@ pub struct Tenant {
     walredo_mgr: Option<Arc<WalRedoManager>>,
 
     // provides access to timeline data sitting in the remote storage
-    pub(crate) remote_storage: Option<GenericRemoteStorage>,
+    pub(crate) remote_storage: GenericRemoteStorage,
 
     // Access to global deletion queue for when this tenant wants to schedule a deletion
     deletion_queue_client: DeletionQueueClient,
@@ -528,6 +530,7 @@ impl Tenant {
         index_part: Option<IndexPart>,
         metadata: TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
+        last_aux_file_policy: Option<AuxFilePolicy>,
         _ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let tenant_id = self.tenant_shard_id;
@@ -538,6 +541,10 @@ impl Tenant {
             ancestor.clone(),
             resources,
             CreateTimelineCause::Load,
+            // This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`,
+            // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence.
+            // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2.
+            last_aux_file_policy,
         )?;
         let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
         anyhow::ensure!(
@@ -551,21 +558,26 @@ impl Tenant {
         );
 
         if let Some(index_part) = index_part.as_ref() {
+            timeline.remote_client.init_upload_queue(index_part)?;
+
             timeline
-                .remote_client
-                .as_ref()
-                .unwrap()
-                .init_upload_queue(index_part)?;
-        } else if self.remote_storage.is_some() {
+                .last_aux_file_policy
+                .store(index_part.last_aux_file_policy());
+        } else {
             // No data on the remote storage, but we have local metadata file. We can end up
             // here with timeline_create being interrupted before finishing index part upload.
             // By doing what we do here, the index part upload is retried.
             // If control plane retries timeline creation in the meantime, the mgmt API handler
             // for timeline creation will coalesce on the upload we queue here.
+
             // FIXME: this branch should be dead code as we no longer write local metadata.
-            let rtc = timeline.remote_client.as_ref().unwrap();
-            rtc.init_upload_queue_for_empty_remote(&metadata)?;
-            rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
+
+            timeline
+                .remote_client
+                .init_upload_queue_for_empty_remote(&metadata)?;
+            timeline
+                .remote_client
+                .schedule_index_upload_for_full_metadata_update(&metadata)?;
         }
 
         timeline
@@ -777,14 +789,14 @@ impl Tenant {
                     AttachType::Normal
                 };
 
-                let preload = match (&mode, &remote_storage) {
-                    (SpawnMode::Create, _) => {
+                let preload = match &mode {
+                    SpawnMode::Create => {
                         None
                     },
-                    (SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
+                    SpawnMode::Eager | SpawnMode::Lazy => {
                         let _preload_timer = TENANT.preload.start_timer();
                         let res = tenant_clone
-                            .preload(remote_storage, task_mgr::shutdown_token())
+                            .preload(&remote_storage, task_mgr::shutdown_token())
                             .await;
                         match res {
                             Ok(p) => Some(p),
@@ -794,10 +806,7 @@ impl Tenant {
                             }
                         }
                     }
-                    (_, None) => {
-                        let _preload_timer = TENANT.preload.start_timer();
-                        None
-                    }
+
                 };
 
                 // Remote preload is complete.
@@ -1021,7 +1030,7 @@ impl Tenant {
                 index_part,
                 remote_metadata,
                 TimelineResources {
-                    remote_client: Some(remote_client),
+                    remote_client,
                     deletion_queue_client: self.deletion_queue_client.clone(),
                     timeline_get_throttle: self.timeline_get_throttle.clone(),
                 },
@@ -1047,7 +1056,7 @@ impl Tenant {
                 Arc::clone(self),
                 timeline_id,
                 &index_part.metadata,
-                Some(remote_timeline_client),
+                remote_timeline_client,
                 self.deletion_queue_client.clone(),
             )
             .instrument(tracing::info_span!("timeline_delete", %timeline_id))
@@ -1139,9 +1148,7 @@ impl Tenant {
         let mut size = 0;
 
         for timeline in self.list_timelines() {
-            if let Some(remote_client) = &timeline.remote_client {
-                size += remote_client.get_remote_physical_size();
-            }
+            size += timeline.remote_client.get_remote_physical_size();
         }
 
         size
@@ -1176,12 +1183,15 @@ impl Tenant {
             None
         };
 
+        let last_aux_file_policy = index_part.last_aux_file_policy();
+
         self.timeline_init_and_sync(
             timeline_id,
             resources,
             Some(index_part),
             remote_metadata,
             ancestor,
+            last_aux_file_policy,
             ctx,
         )
         .await
@@ -1191,6 +1201,7 @@ impl Tenant {
     pub fn create_broken_tenant(
         conf: &'static PageServerConf,
         tenant_shard_id: TenantShardId,
+        remote_storage: GenericRemoteStorage,
         reason: String,
     ) -> Arc<Tenant> {
         Arc::new(Tenant::new(
@@ -1205,7 +1216,7 @@ impl Tenant {
             ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
             None,
             tenant_shard_id,
-            None,
+            remote_storage,
             DeletionQueueClient::broken(),
         ))
     }
@@ -1360,6 +1371,7 @@ impl Tenant {
             create_guard,
             initdb_lsn,
             None,
+            None,
         )
         .await
     }
@@ -1398,13 +1410,7 @@ impl Tenant {
         tline.freeze_and_flush().await.context("freeze_and_flush")?;
 
         // Make sure the freeze_and_flush reaches remote storage.
-        tline
-            .remote_client
-            .as_ref()
-            .unwrap()
-            .wait_completion()
-            .await
-            .unwrap();
+        tline.remote_client.wait_completion().await.unwrap();
 
         let tl = uninit_tl.finish_creation()?;
         // The non-test code would call tl.activate() here.
@@ -1470,20 +1476,19 @@ impl Tenant {
                     return Err(CreateTimelineError::Conflict);
                 }
 
-                if let Some(remote_client) = existing.remote_client.as_ref() {
-                    // Wait for uploads to complete, so that when we return Ok, the timeline
-                    // is known to be durable on remote storage. Just like we do at the end of
-                    // this function, after we have created the timeline ourselves.
-                    //
-                    // We only really care that the initial version of `index_part.json` has
-                    // been uploaded. That's enough to remember that the timeline
-                    // exists. However, there is no function to wait specifically for that so
-                    // we just wait for all in-progress uploads to finish.
-                    remote_client
-                        .wait_completion()
-                        .await
-                        .context("wait for timeline uploads to complete")?;
-                }
+                // Wait for uploads to complete, so that when we return Ok, the timeline
+                // is known to be durable on remote storage. Just like we do at the end of
+                // this function, after we have created the timeline ourselves.
+                //
+                // We only really care that the initial version of `index_part.json` has
+                // been uploaded. That's enough to remember that the timeline
+                // exists. However, there is no function to wait specifically for that so
+                // we just wait for all in-progress uploads to finish.
+                existing
+                    .remote_client
+                    .wait_completion()
+                    .await
+                    .context("wait for timeline uploads to complete")?;
 
                 return Ok(existing);
             }
@@ -1559,14 +1564,14 @@ impl Tenant {
         // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet.  We must
         // not send a success to the caller until it is.  The same applies to handling retries,
         // see the handling of [`TimelineExclusionError::AlreadyExists`] above.
-        if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
-            let kind = ancestor_timeline_id
-                .map(|_| "branched")
-                .unwrap_or("bootstrapped");
-            remote_client.wait_completion().await.with_context(|| {
-                format!("wait for {} timeline initial uploads to complete", kind)
-            })?;
-        }
+        let kind = ancestor_timeline_id
+            .map(|_| "branched")
+            .unwrap_or("bootstrapped");
+        loaded_timeline
+            .remote_client
+            .wait_completion()
+            .await
+            .with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?;
 
         loaded_timeline.activate(self.clone(), broker_client, None, ctx);
 
@@ -2161,32 +2166,26 @@ impl Tenant {
     ) -> anyhow::Result<()> {
         let timelines = self.timelines.lock().unwrap().clone();
         for timeline in timelines.values() {
-            let Some(tl_client) = &timeline.remote_client else {
-                anyhow::bail!("Remote storage is mandatory");
-            };
-
-            let Some(remote_storage) = &self.remote_storage else {
-                anyhow::bail!("Remote storage is mandatory");
-            };
-
             // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
             // to ensure that they do not start a split if currently in the process of doing these.
 
             // Upload an index from the parent: this is partly to provide freshness for the
             // child tenants that will copy it, and partly for general ease-of-debugging: there will
             // always be a parent shard index in the same generation as we wrote the child shard index.
-            tl_client.schedule_index_upload_for_file_changes()?;
-            tl_client.wait_completion().await?;
+            timeline
+                .remote_client
+                .schedule_index_upload_for_file_changes()?;
+            timeline.remote_client.wait_completion().await?;
 
             // Shut down the timeline's remote client: this means that the indices we write
             // for child shards will not be invalidated by the parent shard deleting layers.
-            tl_client.shutdown().await;
+            timeline.remote_client.shutdown().await;
 
             // Download methods can still be used after shutdown, as they don't flow through the remote client's
             // queue.  In principal the RemoteTimelineClient could provide this without downloading it, but this
             // operation is rare, so it's simpler to just download it (and robustly guarantees that the index
             // we use here really is the remotely persistent one).
-            let result = tl_client
+            let result = timeline.remote_client
                 .download_index_file(&self.cancel)
                 .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
                 .await?;
@@ -2199,7 +2198,7 @@ impl Tenant {
 
             for child_shard in child_shards {
                 upload_index_part(
-                    remote_storage,
+                    &self.remote_storage,
                     child_shard,
                     &timeline.timeline_id,
                     self.generation,
@@ -2212,6 +2211,31 @@ impl Tenant {
 
         Ok(())
     }
+
+    pub(crate) fn get_sizes(&self) -> TopTenantShardItem {
+        let mut result = TopTenantShardItem {
+            id: self.tenant_shard_id,
+            resident_size: 0,
+            physical_size: 0,
+            max_logical_size: 0,
+        };
+
+        for timeline in self.timelines.lock().unwrap().values() {
+            result.resident_size += timeline.metrics.resident_physical_size_gauge.get();
+
+            result.physical_size += timeline
+                .remote_client
+                .metrics
+                .remote_physical_size_gauge
+                .get();
+            result.max_logical_size = std::cmp::max(
+                result.max_logical_size,
+                timeline.metrics.current_logical_size_gauge.get(),
+            );
+        }
+
+        result
+    }
 }
 
 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2431,6 +2455,7 @@ impl Tenant {
         ancestor: Option<Arc<Timeline>>,
         resources: TimelineResources,
         cause: CreateTimelineCause,
+        last_aux_file_policy: Option<AuxFilePolicy>,
     ) -> anyhow::Result<Arc<Timeline>> {
         let state = match cause {
             CreateTimelineCause::Load => {
@@ -2459,6 +2484,7 @@ impl Tenant {
             resources,
             pg_version,
             state,
+            last_aux_file_policy,
             self.cancel.child_token(),
         );
 
@@ -2475,7 +2501,7 @@ impl Tenant {
         shard_identity: ShardIdentity,
         walredo_mgr: Option<Arc<WalRedoManager>>,
         tenant_shard_id: TenantShardId,
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         deletion_queue_client: DeletionQueueClient,
     ) -> Tenant {
         let (state, mut rx) = watch::channel(state);
@@ -2800,7 +2826,7 @@ impl Tenant {
         // See comments in [`Tenant::branch_timeline`] for more information about why branch
         // creation task can run concurrently with timeline's GC iteration.
         for timeline in gc_timelines {
-            if task_mgr::is_shutdown_requested() || cancel.is_cancelled() {
+            if cancel.is_cancelled() {
                 // We were requested to shut down. Stop and return with the progress we
                 // made.
                 break;
@@ -3109,6 +3135,7 @@ impl Tenant {
                 timeline_create_guard,
                 start_lsn + 1,
                 Some(Arc::clone(src_timeline)),
+                src_timeline.last_aux_file_policy.load(),
             )
             .await?;
 
@@ -3119,11 +3146,10 @@ impl Tenant {
         // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC
         // could get incorrect information and remove more layers, than needed.
         // See also https://github.com/neondatabase/neon/issues/3865
-        if let Some(remote_client) = new_timeline.remote_client.as_ref() {
-            remote_client
-                .schedule_index_upload_for_full_metadata_update(&metadata)
-                .context("branch initial metadata upload")?;
-        }
+        new_timeline
+            .remote_client
+            .schedule_index_upload_for_full_metadata_update(&metadata)
+            .context("branch initial metadata upload")?;
 
         Ok(new_timeline)
     }
@@ -3155,11 +3181,6 @@ impl Tenant {
         pgdata_path: &Utf8PathBuf,
         timeline_id: &TimelineId,
     ) -> anyhow::Result<()> {
-        let Some(storage) = &self.remote_storage else {
-            // No remote storage?  No upload.
-            return Ok(());
-        };
-
         let temp_path = timelines_path.join(format!(
             "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
         ));
@@ -3183,7 +3204,7 @@ impl Tenant {
         backoff::retry(
             || async {
                 self::remote_timeline_client::upload_initdb_dir(
-                    storage,
+                    &self.remote_storage,
                     &self.tenant_shard_id.tenant_id,
                     timeline_id,
                     pgdata_zstd.try_clone().await?,
@@ -3240,9 +3261,6 @@ impl Tenant {
             }
         }
         if let Some(existing_initdb_timeline_id) = load_existing_initdb {
-            let Some(storage) = &self.remote_storage else {
-                bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
-            };
             if existing_initdb_timeline_id != timeline_id {
                 let source_path = &remote_initdb_archive_path(
                     &self.tenant_shard_id.tenant_id,
@@ -3252,7 +3270,7 @@ impl Tenant {
                     &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
 
                 // if this fails, it will get retried by retried control plane requests
-                storage
+                self.remote_storage
                     .copy_object(source_path, dest_path, &self.cancel)
                     .await
                     .context("copy initdb tar")?;
@@ -3260,7 +3278,7 @@ impl Tenant {
             let (initdb_tar_zst_path, initdb_tar_zst) =
                 self::remote_timeline_client::download_initdb_tar_zst(
                     self.conf,
-                    storage,
+                    &self.remote_storage,
                     &self.tenant_shard_id,
                     &existing_initdb_timeline_id,
                     &self.cancel,
@@ -3311,6 +3329,7 @@ impl Tenant {
                 timeline_create_guard,
                 pgdata_lsn,
                 None,
+                None,
             )
             .await?;
 
@@ -3355,20 +3374,14 @@ impl Tenant {
 
     /// Call this before constructing a timeline, to build its required structures
     fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
-        let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
-            let remote_client = RemoteTimelineClient::new(
-                remote_storage.clone(),
-                self.deletion_queue_client.clone(),
-                self.conf,
-                self.tenant_shard_id,
-                timeline_id,
-                self.generation,
-            );
-            Some(remote_client)
-        } else {
-            None
-        };
-
+        let remote_client = RemoteTimelineClient::new(
+            self.remote_storage.clone(),
+            self.deletion_queue_client.clone(),
+            self.conf,
+            self.tenant_shard_id,
+            timeline_id,
+            self.generation,
+        );
         TimelineResources {
             remote_client,
             deletion_queue_client: self.deletion_queue_client.clone(),
@@ -3388,13 +3401,14 @@ impl Tenant {
         create_guard: TimelineCreateGuard<'a>,
         start_lsn: Lsn,
         ancestor: Option<Arc<Timeline>>,
+        last_aux_file_policy: Option<AuxFilePolicy>,
     ) -> anyhow::Result<UninitializedTimeline> {
         let tenant_shard_id = self.tenant_shard_id;
 
         let resources = self.build_timeline_resources(new_timeline_id);
-        if let Some(remote_client) = &resources.remote_client {
-            remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
-        }
+        resources
+            .remote_client
+            .init_upload_queue_for_empty_remote(new_metadata)?;
 
         let timeline_struct = self
             .create_timeline_struct(
@@ -3403,6 +3417,7 @@ impl Tenant {
                 ancestor,
                 resources,
                 CreateTimelineCause::Load,
+                last_aux_file_policy,
             )
             .context("Failed to create timeline data structure")?;
 
@@ -3562,9 +3577,7 @@ impl Tenant {
             tracing::info!(timeline_id=%timeline.timeline_id, "Flushing...");
             timeline.freeze_and_flush().await?;
             tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads...");
-            if let Some(client) = &timeline.remote_client {
-                client.wait_completion().await?;
-            }
+            timeline.remote_client.wait_completion().await?;
 
             Ok(())
         }
@@ -3878,7 +3891,7 @@ pub(crate) mod harness {
                 ShardIdentity::unsharded(),
                 Some(walredo_mgr),
                 self.tenant_shard_id,
-                Some(self.remote_storage.clone()),
+                self.remote_storage.clone(),
                 self.deletion_queue.new_client(),
             ));
 
@@ -3951,18 +3964,20 @@ mod tests {
 
     use super::*;
     use crate::keyspace::KeySpaceAccum;
+    use crate::pgdatadir_mapping::AuxFilesDirectory;
     use crate::repository::{Key, Value};
     use crate::tenant::harness::*;
     use crate::tenant::timeline::CompactFlags;
     use crate::DEFAULT_PG_VERSION;
-    use bytes::BytesMut;
+    use bytes::{Bytes, BytesMut};
     use hex_literal::hex;
-    use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
+    use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
     use pageserver_api::keyspace::KeySpace;
-    use pageserver_api::models::CompactionAlgorithm;
+    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
     use rand::{thread_rng, Rng};
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
+    use utils::bin_ser::BeSer;
 
     static TEST_KEY: Lazy<Key> =
         Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4764,7 +4779,12 @@ mod tests {
             info!("Doing vectored read on {:?}", read);
 
             let vectored_res = tline
-                .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx)
+                .get_vectored_impl(
+                    read.clone(),
+                    reads_lsn,
+                    &mut ValuesReconstructState::new(),
+                    &ctx,
+                )
                 .await;
             tline
                 .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
@@ -4813,7 +4833,7 @@ mod tests {
             .get_vectored_impl(
                 aux_keyspace.clone(),
                 read_lsn,
-                ValuesReconstructState::new(),
+                &mut ValuesReconstructState::new(),
                 &ctx,
             )
             .await;
@@ -4958,7 +4978,7 @@ mod tests {
             .get_vectored_impl(
                 read.clone(),
                 current_lsn,
-                ValuesReconstructState::new(),
+                &mut ValuesReconstructState::new(),
                 &ctx,
             )
             .await?;
@@ -5093,7 +5113,7 @@ mod tests {
                         ranges: vec![child_gap_at_key..child_gap_at_key.next()],
                     },
                     query_lsn,
-                    ValuesReconstructState::new(),
+                    &mut ValuesReconstructState::new(),
                     &ctx,
                 )
                 .await;
@@ -5149,7 +5169,9 @@ mod tests {
         compaction_algorithm: CompactionAlgorithm,
     ) -> anyhow::Result<()> {
         let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
+            kind: compaction_algorithm,
+        };
         let (tenant, ctx) = harness.load().await;
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5506,7 +5528,9 @@ mod tests {
         compaction_algorithm: CompactionAlgorithm,
     ) -> anyhow::Result<()> {
         let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
+            kind: compaction_algorithm,
+        };
         let (tenant, ctx) = harness.load().await;
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
@@ -5534,7 +5558,7 @@ mod tests {
             .await?;
 
         const NUM_KEYS: usize = 1000;
-        const STEP: usize = 100; // random update + scan base_key + idx * STEP
+        const STEP: usize = 10000; // random update + scan base_key + idx * STEP
 
         let cancel = CancellationToken::new();
 
@@ -5567,7 +5591,7 @@ mod tests {
 
         let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
 
-        for _ in 0..10 {
+        for iter in 0..=10 {
             // Read all the blocks
             for (blknum, last_lsn) in updated.iter().enumerate() {
                 test_key.field6 = (blknum * STEP) as u32;
@@ -5582,7 +5606,7 @@ mod tests {
                 .get_vectored_impl(
                     keyspace.clone(),
                     lsn,
-                    ValuesReconstructState::default(),
+                    &mut ValuesReconstructState::default(),
                     &ctx,
                 )
                 .await?
@@ -5618,14 +5642,858 @@ mod tests {
                 updated[blknum] = lsn;
             }
 
-            // Perform a cycle of flush, compact, and GC
+            // Perform two cycles of flush, compact, and GC
+            for round in 0..2 {
+                tline.freeze_and_flush().await?;
+                tline
+                    .compact(
+                        &cancel,
+                        if iter % 5 == 0 && round == 0 {
+                            let mut flags = EnumSet::new();
+                            flags.insert(CompactFlags::ForceImageLayerCreation);
+                            flags.insert(CompactFlags::ForceRepartition);
+                            flags
+                        } else {
+                            EnumSet::empty()
+                        },
+                        &ctx,
+                    )
+                    .await?;
+                tenant
+                    .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+                    .await?;
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_metadata_compaction_trigger")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let cancel = CancellationToken::new();
+
+        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        base_key.field1 = AUX_KEY_PREFIX;
+        let test_key = base_key;
+        let mut lsn = Lsn(0x10);
+
+        for _ in 0..20 {
+            lsn = Lsn(lsn.0 + 0x10);
+            let mut writer = tline.writer().await;
+            writer
+                .put(
+                    test_key,
+                    lsn,
+                    &Value::Image(test_img(&format!("{} at {}", 0, lsn))),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+            tline.freeze_and_flush().await?; // force create a delta layer
+        }
+
+        let before_num_l0_delta_files = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .get_level0_deltas()?
+            .len();
+
+        tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
+
+        let after_num_l0_delta_files = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .get_level0_deltas()?
+            .len();
+
+        assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
+
+        assert_eq!(
+            tline.get(test_key, lsn, &ctx).await?,
+            test_img(&format!("{} at {}", 0, lsn))
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_branch_copies_dirty_aux_file_flag() {
+        let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();
+
+        // the default aux file policy to switch is v1 if not set by the admins
+        assert_eq!(
+            harness.tenant_conf.switch_aux_file_policy,
+            AuxFilePolicy::V1
+        );
+        let (tenant, ctx) = harness.load().await;
+
+        let mut lsn = Lsn(0x08);
+
+        let tline: Arc<Timeline> = tenant
+            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        // no aux file is written at this point, so the persistent flag should be unset
+        assert_eq!(tline.last_aux_file_policy.load(), None);
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test1", b"first", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        // there is no tenant manager to pass the configuration through, so lets mimic it
+        tenant.set_new_location_config(
+            AttachedTenantConf::try_from(LocationConf::attached_single(
+                TenantConfOpt {
+                    switch_aux_file_policy: Some(AuxFilePolicy::V2),
+                    ..Default::default()
+                },
+                tenant.generation,
+                &pageserver_api::models::ShardParameters::default(),
+            ))
+            .unwrap(),
+        );
+
+        assert_eq!(
+            tline.get_switch_aux_file_policy(),
+            AuxFilePolicy::V2,
+            "wanted state has been updated"
+        );
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V1),
+            "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1"
+        );
+
+        // we can read everything from the storage
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first"))
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test2", b"second", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V1),
+            "keep v1 storage format when new files are written"
+        );
+
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test2"),
+            Some(&bytes::Bytes::from_static(b"second"))
+        );
+
+        let child = tenant
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
+            .await
+            .unwrap();
+
+        // child copies the last flag even if that is not on remote storage yet
+        assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2);
+        assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1));
+
+        let files = child.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(files.get("pg_logical/mappings/test1"), None);
+        assert_eq!(files.get("pg_logical/mappings/test2"), None);
+
+        // even if we crash here without flushing parent timeline with it's new
+        // last_aux_file_policy we are safe, because child was never meant to access ancestor's
+        // files. the ancestor can even switch back to V1 because of a migration safely.
+    }
+
+    #[tokio::test]
+    async fn aux_file_policy_switch() {
+        let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap();
+        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode
+        let (tenant, ctx) = harness.load().await;
+
+        let mut lsn = Lsn(0x08);
+
+        let tline: Arc<Timeline> = tenant
+            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            None,
+            "no aux file is written so it should be unset"
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test1", b"first", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        // there is no tenant manager to pass the configuration through, so lets mimic it
+        tenant.set_new_location_config(
+            AttachedTenantConf::try_from(LocationConf::attached_single(
+                TenantConfOpt {
+                    switch_aux_file_policy: Some(AuxFilePolicy::V2),
+                    ..Default::default()
+                },
+                tenant.generation,
+                &pageserver_api::models::ShardParameters::default(),
+            ))
+            .unwrap(),
+        );
+
+        assert_eq!(
+            tline.get_switch_aux_file_policy(),
+            AuxFilePolicy::V2,
+            "wanted state has been updated"
+        );
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::CrossValidation),
+            "dirty index_part.json reflected state is yet to be updated"
+        );
+
+        // we can still read the auxfile v1 before we ingest anything new
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first"))
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test2", b"second", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V2),
+            "ingesting a file should apply the wanted switch state when applicable"
+        );
+
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first")),
+            "cross validation writes to both v1 and v2 so this should be available in v2"
+        );
+        assert_eq!(
+            files.get("pg_logical/mappings/test2"),
+            Some(&bytes::Bytes::from_static(b"second"))
+        );
+
+        // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file)
+        tenant.set_new_location_config(
+            AttachedTenantConf::try_from(LocationConf::attached_single(
+                TenantConfOpt {
+                    switch_aux_file_policy: Some(AuxFilePolicy::V1),
+                    ..Default::default()
+                },
+                tenant.generation,
+                &pageserver_api::models::ShardParameters::default(),
+            ))
+            .unwrap(),
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test2", b"third", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        assert_eq!(
+            tline.get_switch_aux_file_policy(),
+            AuxFilePolicy::V1,
+            "wanted state has been updated again, even if invalid request"
+        );
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V2),
+            "ingesting a file should apply the wanted switch state when applicable"
+        );
+
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first"))
+        );
+        assert_eq!(
+            files.get("pg_logical/mappings/test2"),
+            Some(&bytes::Bytes::from_static(b"third"))
+        );
+
+        // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file)
+        tenant.set_new_location_config(
+            AttachedTenantConf::try_from(LocationConf::attached_single(
+                TenantConfOpt {
+                    switch_aux_file_policy: Some(AuxFilePolicy::V2),
+                    ..Default::default()
+                },
+                tenant.generation,
+                &pageserver_api::models::ShardParameters::default(),
+            ))
+            .unwrap(),
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test3", b"last", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2);
+
+        assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2));
+
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first"))
+        );
+        assert_eq!(
+            files.get("pg_logical/mappings/test2"),
+            Some(&bytes::Bytes::from_static(b"third"))
+        );
+        assert_eq!(
+            files.get("pg_logical/mappings/test3"),
+            Some(&bytes::Bytes::from_static(b"last"))
+        );
+    }
+
+    #[tokio::test]
+    async fn aux_file_policy_force_switch() {
+        let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap();
+        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
+        let (tenant, ctx) = harness.load().await;
+
+        let mut lsn = Lsn(0x08);
+
+        let tline: Arc<Timeline> = tenant
+            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            None,
+            "no aux file is written so it should be unset"
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test1", b"first", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap();
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V2),
+            "dirty index_part.json reflected state is yet to be updated"
+        );
+
+        // lose all data from v1
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(files.get("pg_logical/mappings/test1"), None);
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test2", b"second", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        // read data ingested in v2
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test2"),
+            Some(&bytes::Bytes::from_static(b"second"))
+        );
+        // lose all data from v1
+        assert_eq!(files.get("pg_logical/mappings/test1"), None);
+    }
+
+    #[tokio::test]
+    async fn aux_file_policy_auto_detect() {
+        let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
+        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode
+        let (tenant, ctx) = harness.load().await;
+
+        let mut lsn = Lsn(0x08);
+
+        let tline: Arc<Timeline> = tenant
+            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            None,
+            "no aux file is written so it should be unset"
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+                files: vec![(
+                    "test_file".to_string(),
+                    Bytes::copy_from_slice(b"test_file"),
+                )]
+                .into_iter()
+                .collect(),
+            })
+            .unwrap();
+            modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test1", b"first", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V1),
+            "keep using v1 because there are aux files writting with v1"
+        );
+
+        // we can still read the auxfile v1
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first"))
+        );
+        assert_eq!(
+            files.get("test_file"),
+            Some(&bytes::Bytes::from_static(b"test_file"))
+        );
+    }
+
+    #[tokio::test]
+    async fn test_metadata_image_creation() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_metadata_image_creation")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        const NUM_KEYS: usize = 1000;
+        const STEP: usize = 10000; // random update + scan base_key + idx * STEP
+
+        let cancel = CancellationToken::new();
+
+        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        base_key.field1 = AUX_KEY_PREFIX;
+        let mut test_key = base_key;
+        let mut lsn = Lsn(0x10);
+
+        async fn scan_with_statistics(
+            tline: &Timeline,
+            keyspace: &KeySpace,
+            lsn: Lsn,
+            ctx: &RequestContext,
+        ) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
+            let mut reconstruct_state = ValuesReconstructState::default();
+            let res = tline
+                .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+                .await?;
+            Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
+        }
+
+        #[allow(clippy::needless_range_loop)]
+        for blknum in 0..NUM_KEYS {
+            lsn = Lsn(lsn.0 + 0x10);
+            test_key.field6 = (blknum * STEP) as u32;
+            let mut writer = tline.writer().await;
+            writer
+                .put(
+                    test_key,
+                    lsn,
+                    &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+        }
+
+        let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
+
+        for iter in 1..=10 {
+            for _ in 0..NUM_KEYS {
+                lsn = Lsn(lsn.0 + 0x10);
+                let blknum = thread_rng().gen_range(0..NUM_KEYS);
+                test_key.field6 = (blknum * STEP) as u32;
+                let mut writer = tline.writer().await;
+                writer
+                    .put(
+                        test_key,
+                        lsn,
+                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
+                        &ctx,
+                    )
+                    .await?;
+                writer.finish_write(lsn);
+                drop(writer);
+            }
+
             tline.freeze_and_flush().await?;
-            tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
+
+            if iter % 5 == 0 {
+                let (_, before_delta_file_accessed) =
+                    scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
+                tline
+                    .compact(
+                        &cancel,
+                        {
+                            let mut flags = EnumSet::new();
+                            flags.insert(CompactFlags::ForceImageLayerCreation);
+                            flags.insert(CompactFlags::ForceRepartition);
+                            flags
+                        },
+                        &ctx,
+                    )
+                    .await?;
+                let (_, after_delta_file_accessed) =
+                    scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
+                assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}");
+                // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances.
+                assert!(
+                    after_delta_file_accessed <= 2,
+                    "after_delta_file_accessed={after_delta_file_accessed}"
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let cancel = CancellationToken::new();
+
+        let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
+        let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
+
+        let mut lsn = Lsn(0x20);
+
+        {
+            let mut writer = tline.writer().await;
+            writer
+                .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx)
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            tline.freeze_and_flush().await?; // this will create a image layer
+        }
+
+        let child = tenant
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
+            .await
+            .unwrap();
+
+        lsn.0 += 0x10;
+
+        {
+            let mut writer = child.writer().await;
+            writer
+                .put(
+                    base_key_child,
+                    lsn,
+                    &Value::Image(test_img("data key 2")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            child.freeze_and_flush().await?; // this will create a delta
+
+            {
+                // update the partitioning to include the test key space, otherwise they
+                // will be dropped by image layer creation
+                let mut guard = child.partitioning.lock().await;
+                let ((partitioning, _), partition_lsn) = &mut *guard;
+                partitioning
+                    .parts
+                    .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key
+                *partition_lsn = lsn;
+            }
+
+            child
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for the keys, TODO: check if the image layer is created
+        }
+
+        async fn get_vectored_impl_wrapper(
+            tline: &Arc<Timeline>,
+            key: Key,
+            lsn: Lsn,
+            ctx: &RequestContext,
+        ) -> Result<Option<Bytes>, GetVectoredError> {
+            let mut reconstruct_state = ValuesReconstructState::new();
+            let mut res = tline
+                .get_vectored_impl(
+                    KeySpace::single(key..key.next()),
+                    lsn,
+                    &mut reconstruct_state,
+                    ctx,
+                )
+                .await?;
+            Ok(res.pop_last().map(|(k, v)| {
+                assert_eq!(k, key);
+                v.unwrap()
+            }))
+        }
+
+        // test vectored get on parent timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
+            Some(test_img("data key 1"))
+        );
+        assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx)
+            .await
+            .unwrap_err()
+            .is_missing_key_error());
+        assert!(
+            get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx)
+                .await
+                .unwrap_err()
+                .is_missing_key_error()
+        );
+
+        // test vectored get on child timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
+            Some(test_img("data key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
+            Some(test_img("data key 2"))
+        );
+        assert!(
+            get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx)
+                .await
+                .unwrap_err()
+                .is_missing_key_error()
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let cancel = CancellationToken::new();
+
+        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
+        let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
+        base_key.field1 = AUX_KEY_PREFIX;
+        base_key_child.field1 = AUX_KEY_PREFIX;
+        base_key_nonexist.field1 = AUX_KEY_PREFIX;
+
+        let mut lsn = Lsn(0x20);
+
+        {
+            let mut writer = tline.writer().await;
+            writer
+                .put(
+                    base_key,
+                    lsn,
+                    &Value::Image(test_img("metadata key 1")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            tline.freeze_and_flush().await?; // this will create an image layer
+
+            tline
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set.insert(CompactFlags::ForceRepartition);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for metadata keys
             tenant
                 .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
                 .await?;
         }
 
+        let child = tenant
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
+            .await
+            .unwrap();
+
+        lsn.0 += 0x10;
+
+        {
+            let mut writer = child.writer().await;
+            writer
+                .put(
+                    base_key_child,
+                    lsn,
+                    &Value::Image(test_img("metadata key 2")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            child.freeze_and_flush().await?;
+
+            child
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set.insert(CompactFlags::ForceRepartition);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for metadata keys
+            tenant
+                .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+                .await?;
+        }
+
+        async fn get_vectored_impl_wrapper(
+            tline: &Arc<Timeline>,
+            key: Key,
+            lsn: Lsn,
+            ctx: &RequestContext,
+        ) -> Result<Option<Bytes>, GetVectoredError> {
+            let mut reconstruct_state = ValuesReconstructState::new();
+            let mut res = tline
+                .get_vectored_impl(
+                    KeySpace::single(key..key.next()),
+                    lsn,
+                    &mut reconstruct_state,
+                    ctx,
+                )
+                .await?;
+            Ok(res.pop_last().map(|(k, v)| {
+                assert_eq!(k, key);
+                v.unwrap()
+            }))
+        }
+
+        // test vectored get on parent timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
+            Some(test_img("metadata key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+
+        // test vectored get on child timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
+            Some(test_img("metadata key 2"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+
         Ok(())
     }
 }
diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index 1dc451f5c9..24b4e4f3ea 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -299,7 +299,7 @@ mod tests {
         // Write part (in block to drop the file)
         let mut offsets = Vec::new();
         {
-            let file = VirtualFile::create(pathbuf.as_path()).await?;
+            let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
             let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
             for blob in blobs.iter() {
                 let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
@@ -314,7 +314,7 @@ mod tests {
             wtr.flush_buffer(&ctx).await?;
         }
 
-        let file = VirtualFile::open(pathbuf.as_path()).await?;
+        let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
         let rdr = BlockReaderRef::VirtualFile(&file);
         let rdr = BlockCursor::new(rdr);
         for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index 37c84be342..92928116c1 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -102,7 +102,7 @@ impl<'a> BlockReaderRef<'a> {
             #[cfg(test)]
             TestDisk(r) => r.read_blk(blknum),
             #[cfg(test)]
-            VirtualFile(r) => r.read_blk(blknum).await,
+            VirtualFile(r) => r.read_blk(blknum, ctx).await,
         }
     }
 }
@@ -177,10 +177,11 @@ impl<'a> FileBlockReader<'a> {
         &self,
         buf: PageWriteGuard<'static>,
         blkno: u32,
+        ctx: &RequestContext,
     ) -> Result<PageWriteGuard<'static>, std::io::Error> {
         assert!(buf.len() == PAGE_SZ);
         self.file
-            .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64)
+            .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64, ctx)
             .await
     }
     /// Read a block.
@@ -206,7 +207,7 @@ impl<'a> FileBlockReader<'a> {
             ReadBufResult::Found(guard) => Ok(guard.into()),
             ReadBufResult::NotFound(write_guard) => {
                 // Read the page from disk into the buffer
-                let write_guard = self.fill_buffer(write_guard, blknum).await?;
+                let write_guard = self.fill_buffer(write_guard, blknum, ctx).await?;
                 Ok(write_guard.mark_valid().into())
             }
         }
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index a743ce3c16..342d705954 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,6 +11,7 @@
 use anyhow::bail;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::CompactionAlgorithm;
+use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
@@ -320,7 +321,7 @@ pub struct TenantConf {
     pub compaction_period: Duration,
     // Level0 delta layer threshold for compaction.
     pub compaction_threshold: usize,
-    pub compaction_algorithm: CompactionAlgorithm,
+    pub compaction_algorithm: CompactionAlgorithmSettings,
     // Determines how much history is retained, to allow
     // branching and read replicas at an older point in time.
     // The unit is #of bytes of WAL.
@@ -373,6 +374,8 @@ pub struct TenantConf {
 
     /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
     /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
+    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
+    /// file is written.
     pub switch_aux_file_policy: AuxFilePolicy,
 }
 
@@ -404,7 +407,7 @@ pub struct TenantConfOpt {
 
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
-    pub compaction_algorithm: Option<CompactionAlgorithm>,
+    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
 
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(default)]
@@ -495,7 +498,9 @@ impl TenantConfOpt {
                 .unwrap_or(global_conf.compaction_threshold),
             compaction_algorithm: self
                 .compaction_algorithm
-                .unwrap_or(global_conf.compaction_algorithm),
+                .as_ref()
+                .unwrap_or(&global_conf.compaction_algorithm)
+                .clone(),
             gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
             gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
             image_creation_threshold: self
@@ -548,7 +553,9 @@ impl Default for TenantConf {
             compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
                 .expect("cannot parse default compaction period"),
             compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
-            compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
+            compaction_algorithm: CompactionAlgorithmSettings {
+                kind: DEFAULT_COMPACTION_ALGORITHM,
+            },
             gc_horizon: DEFAULT_GC_HORIZON,
             gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                 .expect("cannot parse default gc period"),
@@ -574,7 +581,7 @@ impl Default for TenantConf {
             lazy_slru_download: false,
             timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
             image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
-            switch_aux_file_policy: AuxFilePolicy::V1,
+            switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
         }
     }
 }
diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs
index 2e5259bfe2..3173a33dad 100644
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -181,25 +181,23 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
 
 async fn remove_tenant_remote_delete_mark(
     conf: &PageServerConf,
-    remote_storage: Option<&GenericRemoteStorage>,
+    remote_storage: &GenericRemoteStorage,
     tenant_shard_id: &TenantShardId,
     cancel: &CancellationToken,
 ) -> Result<(), DeleteTenantError> {
-    if let Some(remote_storage) = remote_storage {
-        let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
-        backoff::retry(
-            || async { remote_storage.delete(&path, cancel).await },
-            TimeoutOrCancel::caused_by_cancel,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "remove_tenant_remote_delete_mark",
-            cancel,
-        )
-        .await
-        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
-        .and_then(|x| x)
-        .context("remove_tenant_remote_delete_mark")?;
-    }
+    let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
+    backoff::retry(
+        || async { remote_storage.delete(&path, cancel).await },
+        TimeoutOrCancel::caused_by_cancel,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        "remove_tenant_remote_delete_mark",
+        cancel,
+    )
+    .await
+    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
+    .and_then(|x| x)
+    .context("remove_tenant_remote_delete_mark")?;
     Ok(())
 }
 
@@ -297,7 +295,7 @@ impl DeleteTenantFlow {
     #[instrument(skip_all)]
     pub(crate) async fn run(
         conf: &'static PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         tenants: &'static std::sync::RwLock<TenantsMap>,
         tenant: Arc<Tenant>,
         cancel: &CancellationToken,
@@ -308,9 +306,7 @@ impl DeleteTenantFlow {
 
         let mut guard = Self::prepare(&tenant).await?;
 
-        if let Err(e) =
-            Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
-        {
+        if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
             tenant.set_broken(format!("{e:#}")).await;
             return Err(e);
         }
@@ -327,7 +323,7 @@ impl DeleteTenantFlow {
     async fn run_inner(
         guard: &mut OwnedMutexGuard<Self>,
         conf: &'static PageServerConf,
-        remote_storage: Option<&GenericRemoteStorage>,
+        remote_storage: &GenericRemoteStorage,
         tenant: &Tenant,
         cancel: &CancellationToken,
     ) -> Result<(), DeleteTenantError> {
@@ -339,14 +335,9 @@ impl DeleteTenantFlow {
             ))?
         });
 
-        // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
-        // Though sounds scary, different mark name?
-        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
-        if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
-                .await
-                .context("remote_mark")?
-        }
+        create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
+            .await
+            .context("remote_mark")?;
 
         fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
             Err(anyhow::anyhow!(
@@ -483,7 +474,7 @@ impl DeleteTenantFlow {
     fn schedule_background(
         guard: OwnedMutexGuard<Self>,
         conf: &'static PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         tenants: &'static std::sync::RwLock<TenantsMap>,
         tenant: Arc<Tenant>,
     ) {
@@ -512,7 +503,7 @@ impl DeleteTenantFlow {
     async fn background(
         mut guard: OwnedMutexGuard<Self>,
         conf: &PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
+        remote_storage: GenericRemoteStorage,
         tenants: &'static std::sync::RwLock<TenantsMap>,
         tenant: &Arc<Tenant>,
     ) -> Result<(), DeleteTenantError> {
@@ -551,7 +542,7 @@ impl DeleteTenantFlow {
 
         remove_tenant_remote_delete_mark(
             conf,
-            remote_storage.as_ref(),
+            &remote_storage,
             &tenant.tenant_shard_id,
             &task_mgr::shutdown_token(),
         )
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index 8b815a1885..79cc7bf153 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -28,6 +28,7 @@ impl EphemeralFile {
         conf: &PageServerConf,
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
+        ctx: &RequestContext,
     ) -> Result<EphemeralFile, io::Error> {
         static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
         let filename_disambiguator =
@@ -45,6 +46,7 @@ impl EphemeralFile {
                 .read(true)
                 .write(true)
                 .create(true),
+            ctx,
         )
         .await?;
 
@@ -153,7 +155,7 @@ mod tests {
     async fn test_ephemeral_blobs() -> Result<(), io::Error> {
         let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
 
-        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?;
 
         let pos_foo = file.write_blob(b"foo", &ctx).await?;
         assert_eq!(
diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs
index 42def8858e..276ac87064 100644
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -78,7 +78,7 @@ impl RW {
                     page_cache::ReadBufResult::NotFound(write_guard) => {
                         let write_guard = writer
                             .file
-                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
+                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
                             .await?;
                         let read_guard = write_guard.mark_valid();
                         return Ok(BlockLease::PageReadGuard(read_guard));
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 8ba0775120..fc71ea7642 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -214,12 +214,12 @@ impl TimelineMetadata {
         self.body.ancestor_timeline = Some(*timeline);
     }
 
-    pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) {
+    pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
         if let Some(ancestor) = self.body.ancestor_timeline {
-            assert_eq!(ancestor, *timeline);
+            assert_eq!(ancestor, branchpoint.0);
         }
         if self.body.ancestor_lsn != Lsn(0) {
-            assert_eq!(self.body.ancestor_lsn, *ancestor_lsn);
+            assert_eq!(self.body.ancestor_lsn, branchpoint.1);
         }
         self.body.ancestor_timeline = None;
         self.body.ancestor_lsn = Lsn(0);
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 6be66e99ad..89fdf31849 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -7,7 +7,7 @@ use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
 use pageserver_api::shard::{
-    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
+    ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId,
 };
 use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
@@ -16,10 +16,9 @@ use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashMap};
 use std::ops::Deref;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+use std::time::Duration;
 use sysinfo::SystemExt;
 use tokio::fs;
-use utils::timeout::{timeout_cancellable, TimeoutCancellableError};
 
 use anyhow::Context;
 use once_cell::sync::Lazy;
@@ -47,7 +46,7 @@ use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
-use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
 
 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext::PathExt;
@@ -119,6 +118,7 @@ pub(crate) enum TenantsMapRemoveResult {
 
 /// When resolving a TenantId to a shard, we may be looking for the 0th
 /// shard, or we might be looking for whichever shard holds a particular page.
+#[derive(Copy, Clone)]
 pub(crate) enum ShardSelector {
     /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
     /// ignore it.
@@ -127,6 +127,8 @@ pub(crate) enum ShardSelector {
     First,
     /// Pick the shard that holds this key
     Page(Key),
+    /// The shard ID is known: pick the given shard
+    Known(ShardIndex),
 }
 
 /// A convenience for use with the re_attach ControlPlaneClient function: rather
@@ -169,6 +171,14 @@ impl TenantStartupMode {
     }
 }
 
+/// Result type for looking up a TenantId to a specific shard
+pub(crate) enum ShardResolveResult {
+    NotFound,
+    Found(Arc<Tenant>),
+    // Wait for this barrrier, then query again
+    InProgress(utils::completion::Barrier),
+}
+
 impl TenantsMap {
     /// Convenience function for typical usage, where we want to get a `Tenant` object, for
     /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
@@ -182,51 +192,6 @@ impl TenantsMap {
         }
     }
 
-    /// A page service client sends a TenantId, and to look up the correct Tenant we must
-    /// resolve this to a fully qualified TenantShardId.
-    fn resolve_attached_shard(
-        &self,
-        tenant_id: &TenantId,
-        selector: ShardSelector,
-    ) -> Option<TenantShardId> {
-        let mut want_shard = None;
-        match self {
-            TenantsMap::Initializing => None,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
-                    // Ignore all slots that don't contain an attached tenant
-                    let tenant = match &slot.1 {
-                        TenantSlot::Attached(t) => t,
-                        _ => continue,
-                    };
-
-                    match selector {
-                        ShardSelector::First => return Some(*slot.0),
-                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
-                            return Some(*slot.0)
-                        }
-                        ShardSelector::Page(key) => {
-                            // First slot we see for this tenant, calculate the expected shard number
-                            // for the key: we will use this for checking if this and subsequent
-                            // slots contain the key, rather than recalculating the hash each time.
-                            if want_shard.is_none() {
-                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                            }
-
-                            if Some(tenant.shard_identity.number) == want_shard {
-                                return Some(*slot.0);
-                            }
-                        }
-                        _ => continue,
-                    }
-                }
-
-                // Fall through: we didn't find an acceptable shard
-                None
-            }
-        }
-    }
-
     /// Only for use from DeleteTenantFlow.  This method directly removes a TenantSlot from the map.
     ///
     /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
@@ -391,22 +356,17 @@ async fn init_load_generations(
     // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
     // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
     // are processed, even though we don't block on recovery completing here.
-    //
-    // Must only do this if remote storage is enabled, otherwise deletion queue
-    // is not running and channel push will fail.
-    if resources.remote_storage.is_some() {
-        let attached_tenants = generations
-            .iter()
-            .flat_map(|(id, start_mode)| {
-                match start_mode {
-                    TenantStartupMode::Attached((_mode, generation)) => Some(generation),
-                    TenantStartupMode::Secondary => None,
-                }
-                .map(|gen| (*id, *gen))
-            })
-            .collect();
-        resources.deletion_queue_client.recover(attached_tenants)?;
-    }
+    let attached_tenants = generations
+        .iter()
+        .flat_map(|(id, start_mode)| {
+            match start_mode {
+                TenantStartupMode::Attached((_mode, generation)) => Some(generation),
+                TenantStartupMode::Secondary => None,
+            }
+            .map(|gen| (*id, *gen))
+        })
+        .collect();
+    resources.deletion_queue_client.recover(attached_tenants)?;
 
     Ok(Some(generations))
 }
@@ -460,53 +420,6 @@ fn load_tenant_config(
         }
     };
 
-    // Clean up legacy `metadata` files.
-    // Doing it here because every single tenant directory is visited here.
-    // In any later code, there's different treatment of tenant dirs
-    // ... depending on whether the tenant is in re-attach response or not
-    // ... epending on whether the tenant is ignored or not
-    assert_eq!(
-        &conf.tenant_path(&tenant_shard_id),
-        &tenant_dir_path,
-        "later use of conf....path() methods would be dubious"
-    );
-    let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
-        Ok(iter) => {
-            let mut timelines = Vec::new();
-            for res in iter {
-                let p = res?;
-                let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
-                    // skip any entries that aren't TimelineId, such as
-                    // - *.___temp dirs
-                    // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
-                    continue;
-                };
-                timelines.push(timeline_id);
-            }
-            timelines
-        }
-        Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
-        Err(e) => return Err(anyhow::anyhow!(e)),
-    };
-    for timeline_id in timelines {
-        let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
-        let metadata_path = timeline_path.join(METADATA_FILE_NAME);
-        match std::fs::remove_file(&metadata_path) {
-            Ok(()) => {
-                crashsafe::fsync(timeline_path)
-                    .context("fsync timeline dir after removing legacy metadata file")?;
-                info!("removed legacy metadata file at {metadata_path}");
-            }
-            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
-                // something removed the file earlier, or it was never there
-                // We don't care, this software version doesn't write it again, so, we're good.
-            }
-            Err(e) => {
-                anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
-            }
-        }
-    }
-
     let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
     if tenant_ignore_mark_file.exists() {
         info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
@@ -611,6 +524,7 @@ pub async fn init_tenant_mgr(
                     TenantSlot::Attached(Tenant::create_broken_tenant(
                         conf,
                         tenant_shard_id,
+                        resources.remote_storage.clone(),
                         format!("{}", e),
                     )),
                 );
@@ -803,6 +717,7 @@ fn tenant_spawn(
         "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
     );
 
+    let remote_storage = resources.remote_storage.clone();
     let tenant = match Tenant::spawn(
         conf,
         tenant_shard_id,
@@ -817,7 +732,7 @@ fn tenant_spawn(
         Ok(tenant) => tenant,
         Err(e) => {
             error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
-            Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}"))
+            Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}"))
         }
     };
 
@@ -2103,6 +2018,77 @@ impl TenantManager {
 
         Ok(reparented)
     }
+
+    /// A page service client sends a TenantId, and to look up the correct Tenant we must
+    /// resolve this to a fully qualified TenantShardId.
+    ///
+    /// During shard splits: we shall see parent shards in InProgress state and skip them, and
+    /// instead match on child shards which should appear in Attached state.  Very early in a shard
+    /// split, or in other cases where a shard is InProgress, we will return our own InProgress result
+    /// to instruct the caller to wait for that to finish before querying again.
+    pub(crate) fn resolve_attached_shard(
+        &self,
+        tenant_id: &TenantId,
+        selector: ShardSelector,
+    ) -> ShardResolveResult {
+        let tenants = self.tenants.read().unwrap();
+        let mut want_shard = None;
+        let mut any_in_progress = None;
+
+        match &*tenants {
+            TenantsMap::Initializing => ShardResolveResult::NotFound,
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
+                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
+                    // Ignore all slots that don't contain an attached tenant
+                    let tenant = match &slot.1 {
+                        TenantSlot::Attached(t) => t,
+                        TenantSlot::InProgress(barrier) => {
+                            // We might still find a usable shard, but in case we don't, remember that
+                            // we saw at least one InProgress slot, so that we can distinguish this case
+                            // from a simple NotFound in our return value.
+                            any_in_progress = Some(barrier.clone());
+                            continue;
+                        }
+                        _ => continue,
+                    };
+
+                    match selector {
+                        ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
+                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
+                            return ShardResolveResult::Found(tenant.clone())
+                        }
+                        ShardSelector::Page(key) => {
+                            // First slot we see for this tenant, calculate the expected shard number
+                            // for the key: we will use this for checking if this and subsequent
+                            // slots contain the key, rather than recalculating the hash each time.
+                            if want_shard.is_none() {
+                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                            }
+
+                            if Some(tenant.shard_identity.number) == want_shard {
+                                return ShardResolveResult::Found(tenant.clone());
+                            }
+                        }
+                        ShardSelector::Known(shard)
+                            if tenant.shard_identity.shard_index() == shard =>
+                        {
+                            return ShardResolveResult::Found(tenant.clone());
+                        }
+                        _ => continue,
+                    }
+                }
+
+                // Fall through: we didn't find a slot that was in Attached state & matched our selector.  If
+                // we found one or more InProgress slot, indicate to caller that they should retry later.  Otherwise
+                // this requested shard simply isn't found.
+                if let Some(barrier) = any_in_progress {
+                    ShardResolveResult::InProgress(barrier)
+                } else {
+                    ShardResolveResult::NotFound
+                }
+            }
+        }
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -2151,105 +2137,6 @@ pub(crate) enum GetActiveTenantError {
     Broken(String),
 }
 
-/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
-/// state, then wait for up to `timeout`.  If the [`Tenant`] is not currently in [`TenantState::Active`],
-/// then wait for up to `timeout` (minus however long we waited for the slot).
-pub(crate) async fn get_active_tenant_with_timeout(
-    tenant_id: TenantId,
-    shard_selector: ShardSelector,
-    timeout: Duration,
-    cancel: &CancellationToken,
-) -> Result<Arc<Tenant>, GetActiveTenantError> {
-    enum WaitFor {
-        Barrier(utils::completion::Barrier),
-        Tenant(Arc<Tenant>),
-    }
-
-    let wait_start = Instant::now();
-    let deadline = wait_start + timeout;
-
-    let (wait_for, tenant_shard_id) = {
-        let locked = TENANTS.read().unwrap();
-
-        // Resolve TenantId to TenantShardId
-        let tenant_shard_id = locked
-            .resolve_attached_shard(&tenant_id, shard_selector)
-            .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
-                tenant_id,
-            )))?;
-
-        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
-            .map_err(GetTenantError::MapState)?;
-        match peek_slot {
-            Some(TenantSlot::Attached(tenant)) => {
-                match tenant.current_state() {
-                    TenantState::Active => {
-                        // Fast path: we don't need to do any async waiting.
-                        return Ok(tenant.clone());
-                    }
-                    _ => {
-                        tenant.activate_now();
-                        (WaitFor::Tenant(tenant.clone()), tenant_shard_id)
-                    }
-                }
-            }
-            Some(TenantSlot::Secondary(_)) => {
-                return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
-                    tenant_shard_id,
-                )))
-            }
-            Some(TenantSlot::InProgress(barrier)) => {
-                (WaitFor::Barrier(barrier.clone()), tenant_shard_id)
-            }
-            None => {
-                return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
-                    tenant_id,
-                )))
-            }
-        }
-    };
-
-    let tenant = match wait_for {
-        WaitFor::Barrier(barrier) => {
-            tracing::debug!("Waiting for tenant InProgress state to pass...");
-            timeout_cancellable(
-                deadline.duration_since(Instant::now()),
-                cancel,
-                barrier.wait(),
-            )
-            .await
-            .map_err(|e| match e {
-                TimeoutCancellableError::Timeout => GetActiveTenantError::WaitForActiveTimeout {
-                    latest_state: None,
-                    wait_time: wait_start.elapsed(),
-                },
-                TimeoutCancellableError::Cancelled => GetActiveTenantError::Cancelled,
-            })?;
-            {
-                let locked = TENANTS.read().unwrap();
-                let peek_slot =
-                    tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
-                        .map_err(GetTenantError::MapState)?;
-                match peek_slot {
-                    Some(TenantSlot::Attached(tenant)) => tenant.clone(),
-                    _ => {
-                        return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
-                            tenant_shard_id,
-                        )))
-                    }
-                }
-            }
-        }
-        WaitFor::Tenant(tenant) => tenant,
-    };
-
-    tracing::debug!("Waiting for tenant to enter active state...");
-    tenant
-        .wait_to_become_active(deadline.duration_since(Instant::now()))
-        .await?;
-    Ok(tenant)
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum DeleteTimelineError {
     #[error("Tenant {0}")]
@@ -2276,7 +2163,7 @@ pub(crate) async fn load_tenant(
     tenant_id: TenantId,
     generation: Generation,
     broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
+    remote_storage: GenericRemoteStorage,
     deletion_queue_client: DeletionQueueClient,
     ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
@@ -2880,86 +2767,73 @@ use {
     utils::http::error::ApiError,
 };
 
-pub(crate) fn immediate_gc(
+#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
+pub(crate) async fn immediate_gc(
     tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
     gc_req: TimelineGcRequest,
     cancel: CancellationToken,
     ctx: &RequestContext,
-) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
-    let guard = TENANTS.read().unwrap();
-
-    let tenant = guard
-        .get(&tenant_shard_id)
-        .cloned()
-        .with_context(|| format!("tenant {tenant_shard_id}"))
-        .map_err(|e| ApiError::NotFound(e.into()))?;
+) -> Result<GcResult, ApiError> {
+    let tenant = {
+        let guard = TENANTS.read().unwrap();
+        guard
+            .get(&tenant_shard_id)
+            .cloned()
+            .with_context(|| format!("tenant {tenant_shard_id}"))
+            .map_err(|e| ApiError::NotFound(e.into()))?
+    };
 
     let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
     // Use tenant's pitr setting
     let pitr = tenant.get_pitr_interval();
 
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
     // Run in task_mgr to avoid race with tenant_detach operation
-    let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
-    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
-    let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
+    let ctx: RequestContext =
+        ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
 
-    // TODO: spawning is redundant now, need to hold the gate
-    task_mgr::spawn(
-        &tokio::runtime::Handle::current(),
-        TaskKind::GarbageCollector,
-        Some(tenant_shard_id),
-        Some(timeline_id),
-        &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
-        false,
-        async move {
-            fail::fail_point!("immediate_gc_task_pre");
+    let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
 
-            #[allow(unused_mut)]
-            let mut result = tenant
-                .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
-                .await;
-                // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
-                // better once the types support it.
+    fail::fail_point!("immediate_gc_task_pre");
 
-            #[cfg(feature = "testing")]
-            {
-                // we need to synchronize with drop completion for python tests without polling for
-                // log messages
-                if let Ok(result) = result.as_mut() {
-                    let mut js = tokio::task::JoinSet::new();
-                    for layer in std::mem::take(&mut result.doomed_layers) {
-                        js.spawn(layer.wait_drop());
-                    }
-                    tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped");
-                    while let Some(res) = js.join_next().await {
-                        res.expect("wait_drop should not panic");
-                    }
-                }
+    #[allow(unused_mut)]
+    let mut result = tenant
+        .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
+        .await;
+    // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
+    // better once the types support it.
 
-                let timeline = tenant.get_timeline(timeline_id, false).ok();
-                let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
-
-                if let Some(rtc) = rtc {
-                    // layer drops schedule actions on remote timeline client to actually do the
-                    // deletions; don't care about the shutdown error, just exit fast
-                    drop(rtc.wait_completion().await);
-                }
+    #[cfg(feature = "testing")]
+    {
+        // we need to synchronize with drop completion for python tests without polling for
+        // log messages
+        if let Ok(result) = result.as_mut() {
+            let mut js = tokio::task::JoinSet::new();
+            for layer in std::mem::take(&mut result.doomed_layers) {
+                js.spawn(layer.wait_drop());
             }
-
-            match task_done.send(result) {
-                Ok(_) => (),
-                Err(result) => error!("failed to send gc result: {result:?}"),
+            tracing::info!(
+                total = js.len(),
+                "starting to wait for the gc'd layers to be dropped"
+            );
+            while let Some(res) = js.join_next().await {
+                res.expect("wait_drop should not panic");
             }
-            Ok(())
         }
-        .instrument(span)
-    );
 
-    // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
-    drop(guard);
+        let timeline = tenant.get_timeline(timeline_id, false).ok();
+        let rtc = timeline.as_ref().map(|x| &x.remote_client);
 
-    Ok(wait_task_done)
+        if let Some(rtc) = rtc {
+            // layer drops schedule actions on remote timeline client to actually do the
+            // deletions; don't care about the shutdown error, just exit fast
+            drop(rtc.wait_completion().await);
+        }
+    }
+
+    result.map_err(ApiError::InternalServerError)
 }
 
 #[cfg(test)]
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 356a0dc51c..d3adae6841 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -189,6 +189,7 @@ use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};
 
 pub(crate) use download::download_initdb_tar_zst;
+use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
@@ -240,7 +241,7 @@ use utils::id::{TenantId, TimelineId};
 use self::index::IndexPart;
 
 use super::metadata::MetadataUpdate;
-use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
+use super::storage_layer::{Layer, LayerName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;
 
@@ -317,7 +318,7 @@ pub struct RemoteTimelineClient {
 
     upload_queue: Mutex<UploadQueue>,
 
-    metrics: Arc<RemoteTimelineClientMetrics>,
+    pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
 
     storage_impl: GenericRemoteStorage,
 
@@ -437,6 +438,19 @@ impl RemoteTimelineClient {
         }
     }
 
+    /// Returns true if this timeline was previously detached at this Lsn and the remote timeline
+    /// client is currently initialized.
+    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
+        // technically this is a dirty read, but given how timeline detach ancestor is implemented
+        // via tenant restart, the lineage has always been uploaded.
+        self.upload_queue
+            .lock()
+            .unwrap()
+            .initialized_mut()
+            .map(|uq| uq.latest_lineage.is_previous_ancestor_lsn(lsn))
+            .unwrap_or(false)
+    }
+
     fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
         let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
             current_remote_index_part
@@ -448,11 +462,11 @@ impl RemoteTimelineClient {
         } else {
             0
         };
-        self.metrics.remote_physical_size_set(size);
+        self.metrics.remote_physical_size_gauge.set(size);
     }
 
     pub fn get_remote_physical_size(&self) -> u64 {
-        self.metrics.remote_physical_size_get()
+        self.metrics.remote_physical_size_gauge.get()
     }
 
     //
@@ -503,8 +517,9 @@ impl RemoteTimelineClient {
     /// On success, returns the size of the downloaded file.
     pub async fn download_layer_file(
         &self,
-        layer_file_name: &LayerFileName,
+        layer_file_name: &LayerName,
         layer_metadata: &LayerFileMetadata,
+        local_path: &Utf8Path,
         cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<u64> {
@@ -523,6 +538,7 @@ impl RemoteTimelineClient {
                 self.timeline_id,
                 layer_file_name,
                 layer_metadata,
+                local_path,
                 cancel,
                 ctx,
             )
@@ -596,6 +612,17 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
+    pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
+        self: &Arc<Self>,
+        last_aux_file_policy: Option<AuxFilePolicy>,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        upload_queue.last_aux_file_policy = last_aux_file_policy;
+        self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
     ///
     /// Launch an index-file upload operation in the background, if necessary.
     ///
@@ -628,7 +655,7 @@ impl RemoteTimelineClient {
         );
 
         let index_part = IndexPart::from(&*upload_queue);
-        let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
+        let op = UploadOp::UploadMetadata(Box::new(index_part), disk_consistent_lsn);
         self.metric_begin(&op);
         upload_queue.queued_operations.push_back(op);
         upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;
@@ -647,7 +674,14 @@ impl RemoteTimelineClient {
             let mut guard = self.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut()?;
 
+            let Some(prev) = upload_queue.latest_metadata.ancestor_timeline() else {
+                return Err(anyhow::anyhow!(
+                    "cannot reparent without a current ancestor"
+                ));
+            };
+
             upload_queue.latest_metadata.reparent(new_parent);
+            upload_queue.latest_lineage.record_previous_ancestor(&prev);
 
             self.schedule_index_upload(upload_queue);
 
@@ -670,14 +704,13 @@ impl RemoteTimelineClient {
             let mut guard = self.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut()?;
 
-            upload_queue
-                .latest_metadata
-                .detach_from_ancestor(&adopted.0, &adopted.1);
+            upload_queue.latest_metadata.detach_from_ancestor(&adopted);
+            upload_queue.latest_lineage.record_detaching(&adopted);
 
             for layer in layers {
                 upload_queue
                     .latest_files
-                    .insert(layer.layer_desc().filename(), layer.metadata());
+                    .insert(layer.layer_desc().layer_name(), layer.metadata());
             }
 
             self.schedule_index_upload(upload_queue);
@@ -713,7 +746,7 @@ impl RemoteTimelineClient {
 
         upload_queue
             .latest_files
-            .insert(layer.layer_desc().filename(), metadata.clone());
+            .insert(layer.layer_desc().layer_name(), metadata.clone());
         upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
 
         info!(
@@ -737,7 +770,7 @@ impl RemoteTimelineClient {
     /// successfully.
     pub fn schedule_layer_file_deletion(
         self: &Arc<Self>,
-        names: &[LayerFileName],
+        names: &[LayerName],
     ) -> anyhow::Result<()> {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;
@@ -765,7 +798,7 @@ impl RemoteTimelineClient {
         // the layer files as "dangling". this is fine, at worst case we create work for the
         // scrubber.
 
-        let names = gc_layers.iter().map(|x| x.layer_desc().filename());
+        let names = gc_layers.iter().map(|x| x.layer_desc().layer_name());
 
         self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
 
@@ -780,9 +813,9 @@ impl RemoteTimelineClient {
         self: &Arc<Self>,
         upload_queue: &mut UploadQueueInitialized,
         names: I,
-    ) -> Vec<(LayerFileName, LayerFileMetadata)>
+    ) -> Vec<(LayerName, LayerFileMetadata)>
     where
-        I: IntoIterator<Item = LayerFileName>,
+        I: IntoIterator<Item = LayerName>,
     {
         // Decorate our list of names with each name's metadata, dropping
         // names that are unexpectedly missing from our metadata.  This metadata
@@ -832,7 +865,7 @@ impl RemoteTimelineClient {
     /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
     pub(crate) fn schedule_deletion_of_unlinked(
         self: &Arc<Self>,
-        layers: Vec<(LayerFileName, LayerFileMetadata)>,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
     ) -> anyhow::Result<()> {
         let mut guard = self.upload_queue.lock().unwrap();
         let upload_queue = guard.initialized_mut()?;
@@ -845,7 +878,7 @@ impl RemoteTimelineClient {
     fn schedule_deletion_of_unlinked0(
         self: &Arc<Self>,
         upload_queue: &mut UploadQueueInitialized,
-        mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
+        mut with_metadata: Vec<(LayerName, LayerFileMetadata)>,
     ) {
         // Filter out any layers which were not created by this tenant shard.  These are
         // layers that originate from some ancestor shard after a split, and may still
@@ -914,7 +947,7 @@ impl RemoteTimelineClient {
             self.schedule_layer_file_upload0(upload_queue, layer.clone());
         }
 
-        let names = compacted_from.iter().map(|x| x.layer_desc().filename());
+        let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());
 
         self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
         self.launch_queued_tasks(upload_queue);
@@ -1108,6 +1141,11 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    pub(crate) fn is_deleting(&self) -> bool {
+        let mut locked = self.upload_queue.lock().unwrap();
+        locked.stopped_mut().is_ok()
+    }
+
     pub(crate) async fn preserve_initdb_archive(
         self: &Arc<Self>,
         tenant_id: &TenantId,
@@ -1144,7 +1182,7 @@ impl RemoteTimelineClient {
             &self.tenant_shard_id.tenant_id,
             &self.timeline_id,
             self.tenant_shard_id.to_index(),
-            &uploaded.layer_desc().filename(),
+            &uploaded.layer_desc().layer_name(),
             uploaded.metadata().generation,
         );
 
@@ -1185,7 +1223,7 @@ impl RemoteTimelineClient {
                 .get_timeline_id()
                 .expect("Source timeline should be alive"),
             self.tenant_shard_id.to_index(),
-            &adopted.layer_desc().filename(),
+            &adopted.layer_desc().layer_name(),
             adopted.metadata().generation,
         );
 
@@ -1193,7 +1231,7 @@ impl RemoteTimelineClient {
             &self.tenant_shard_id.tenant_id,
             &self.timeline_id,
             self.tenant_shard_id.to_index(),
-            &adopted_as.layer_desc().filename(),
+            &adopted_as.layer_desc().layer_name(),
             adopted_as.metadata().generation,
         );
 
@@ -1527,7 +1565,7 @@ impl RemoteTimelineClient {
                         &self.tenant_shard_id.tenant_id,
                         &self.timeline_id,
                         layer_metadata.shard,
-                        &layer.layer_desc().filename(),
+                        &layer.layer_desc().layer_name(),
                         layer_metadata.generation,
                     );
 
@@ -1811,6 +1849,7 @@ impl RemoteTimelineClient {
                         latest_files: initialized.latest_files.clone(),
                         latest_files_changes_since_metadata_upload_scheduled: 0,
                         latest_metadata: initialized.latest_metadata.clone(),
+                        latest_lineage: initialized.latest_lineage.clone(),
                         projected_remote_consistent_lsn: None,
                         visible_remote_consistent_lsn: initialized
                             .visible_remote_consistent_lsn
@@ -1824,6 +1863,7 @@ impl RemoteTimelineClient {
                         dangling_files: HashMap::default(),
                         shutting_down: false,
                         shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+                        last_aux_file_policy: initialized.last_aux_file_policy,
                     };
 
                     let upload_queue = std::mem::replace(
@@ -1896,14 +1936,14 @@ pub fn remote_layer_path(
     tenant_id: &TenantId,
     timeline_id: &TimelineId,
     shard: ShardIndex,
-    layer_file_name: &LayerFileName,
+    layer_file_name: &LayerName,
     generation: Generation,
 ) -> RemotePath {
     // Generation-aware key format
     let path = format!(
         "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
         shard.get_suffix(),
-        layer_file_name.file_name(),
+        layer_file_name,
         generation.get_suffix()
     );
 
@@ -2000,8 +2040,8 @@ mod tests {
         TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
     }
 
-    fn assert_file_list(a: &HashSet<LayerFileName>, b: &[&str]) {
-        let mut avec: Vec<String> = a.iter().map(|x| x.file_name()).collect();
+    fn assert_file_list(a: &HashSet<LayerName>, b: &[&str]) {
+        let mut avec: Vec<String> = a.iter().map(|x| x.to_string()).collect();
         avec.sort();
 
         let mut bvec = b.to_vec();
@@ -2112,7 +2152,7 @@ mod tests {
             tenant_ctx: _tenant_ctx,
         } = test_setup;
 
-        let client = timeline.remote_client.as_ref().unwrap();
+        let client = &timeline.remote_client;
 
         // Download back the index.json, and check that the list of files is correct
         let initial_index_part = match client
@@ -2127,7 +2167,7 @@ mod tests {
             .layer_metadata
             .keys()
             .map(|f| f.to_owned())
-            .collect::<HashSet<LayerFileName>>();
+            .collect::<HashSet<LayerName>>();
         let initial_layer = {
             assert!(initial_layers.len() == 1);
             initial_layers.into_iter().next().unwrap()
@@ -2153,7 +2193,7 @@ mod tests {
             ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
         ]
         .into_iter()
-        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
+        .map(|(name, contents): (LayerName, Vec<u8>)| {
 
             let local_path = local_layer_path(
                 harness.conf,
@@ -2234,9 +2274,9 @@ mod tests {
                 .map(|f| f.to_owned())
                 .collect(),
             &[
-                &initial_layer.file_name(),
-                &layers[0].layer_desc().filename().file_name(),
-                &layers[1].layer_desc().filename().file_name(),
+                &initial_layer.to_string(),
+                &layers[0].layer_desc().layer_name().to_string(),
+                &layers[1].layer_desc().layer_name().to_string(),
             ],
         );
         assert_eq!(index_part.metadata, metadata);
@@ -2250,7 +2290,7 @@ mod tests {
         // keep using schedule_layer_file_deletion because we don't have a way to wait for the
         // spawn_blocking started by the drop.
         client
-            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
+            .schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()])
             .unwrap();
         {
             let mut guard = client.upload_queue.lock().unwrap();
@@ -2268,9 +2308,9 @@ mod tests {
         }
         assert_remote_files(
             &[
-                &initial_layer.file_name(),
-                &layers[0].layer_desc().filename().file_name(),
-                &layers[1].layer_desc().filename().file_name(),
+                &initial_layer.to_string(),
+                &layers[0].layer_desc().layer_name().to_string(),
+                &layers[1].layer_desc().layer_name().to_string(),
                 "index_part.json",
             ],
             &remote_timeline_dir,
@@ -2283,9 +2323,9 @@ mod tests {
 
         assert_remote_files(
             &[
-                &initial_layer.file_name(),
-                &layers[1].layer_desc().filename().file_name(),
-                &layers[2].layer_desc().filename().file_name(),
+                &initial_layer.to_string(),
+                &layers[1].layer_desc().layer_name().to_string(),
+                &layers[2].layer_desc().layer_name().to_string(),
                 "index_part.json",
             ],
             &remote_timeline_dir,
@@ -2303,9 +2343,9 @@ mod tests {
             timeline,
             ..
         } = TestSetup::new("metrics").await.unwrap();
-        let client = timeline.remote_client.as_ref().unwrap();
+        let client = &timeline.remote_client;
 
-        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
         let local_path = local_layer_path(
             harness.conf,
             &timeline.tenant_shard_id,
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index c86b22d481..70c5cae05e 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -21,8 +21,7 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
-use crate::tenant::storage_layer::layer::local_layer_path;
-use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
@@ -48,21 +47,15 @@ pub async fn download_layer_file<'a>(
     storage: &'a GenericRemoteStorage,
     tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
-    layer_file_name: &'a LayerFileName,
+    layer_file_name: &'a LayerName,
     layer_metadata: &'a LayerFileMetadata,
+    local_path: &Utf8Path,
     cancel: &CancellationToken,
     ctx: &RequestContext,
 ) -> Result<u64, DownloadError> {
     debug_assert_current_span_has_tenant_and_timeline_id();
 
     let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
-    let local_path = local_layer_path(
-        conf,
-        &tenant_shard_id,
-        &timeline_id,
-        layer_file_name,
-        &layer_metadata.generation,
-    );
 
     let remote_path = remote_layer_path(
         &tenant_shard_id.tenant_id,
@@ -82,7 +75,7 @@ pub async fn download_layer_file<'a>(
     // For more context about durable_rename check this email from postgres mailing list:
     // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
     // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
-    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
+    let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
 
     let bytes_amount = download_retry(
         || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
@@ -112,14 +105,17 @@ pub async fn download_layer_file<'a>(
     // We use fatal_err() below because the after the rename above,
     // the in-memory state of the filesystem already has the layer file in its final place,
     // and subsequent pageserver code could think it's durable while it really isn't.
-    let work = async move {
-        let timeline_dir = VirtualFile::open(&timeline_path)
-            .await
-            .fatal_err("VirtualFile::open for timeline dir fsync");
-        timeline_dir
-            .sync_all()
-            .await
-            .fatal_err("VirtualFile::sync_all timeline dir");
+    let work = {
+        let ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior());
+        async move {
+            let timeline_dir = VirtualFile::open(&timeline_path, &ctx)
+                .await
+                .fatal_err("VirtualFile::open for timeline dir fsync");
+            timeline_dir
+                .sync_all()
+                .await
+                .fatal_err("VirtualFile::sync_all timeline dir");
+        }
     };
     crate::virtual_file::io_engine::get()
         .spawn_blocking_and_block_on_if_std(work)
@@ -196,7 +192,7 @@ async fn download_object<'a>(
             use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
             use bytes::BytesMut;
             async {
-                let destination_file = VirtualFile::create(dst_path)
+                let destination_file = VirtualFile::create(dst_path, ctx)
                     .await
                     .with_context(|| format!("create a destination file for layer '{dst_path}'"))
                     .map_err(DownloadError::Other)?;
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 7e0619945f..032dda7ff3 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -5,10 +5,12 @@
 use std::collections::HashMap;
 
 use chrono::NaiveDateTime;
+use pageserver_api::models::AuxFilePolicy;
 use serde::{Deserialize, Serialize};
+use utils::id::TimelineId;
 
 use crate::tenant::metadata::TimelineMetadata;
-use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::storage_layer::LayerName;
 use crate::tenant::upload_queue::UploadQueueInitialized;
 use crate::tenant::Generation;
 use pageserver_api::shard::ShardIndex;
@@ -75,7 +77,7 @@ pub struct IndexPart {
     ///
     /// Older versions of `IndexPart` will not have this property or have only a part of metadata
     /// that latest version stores.
-    pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,
+    pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
 
     // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
     // It's duplicated for convenience when reading the serialized structure, but is
@@ -84,6 +86,19 @@ pub struct IndexPart {
 
     #[serde(rename = "metadata_bytes")]
     pub metadata: TimelineMetadata,
+
+    #[serde(default)]
+    pub(crate) lineage: Lineage,
+
+    /// Describes the kind of aux files stored in the timeline.
+    ///
+    /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
+    /// A V1 setting after V2 files have been committed is not accepted.
+    ///
+    /// None means no aux files have been written to the storage before the point
+    /// when this flag is introduced.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
 }
 
 impl IndexPart {
@@ -96,17 +111,21 @@ impl IndexPart {
     /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
     ///      is always generated from the keys of `layer_metadata`)
     /// - 4: timeline_layers is fully removed.
-    const LATEST_VERSION: usize = 4;
+    /// - 5: lineage was added
+    /// - 6: last_aux_file_policy is added.
+    const LATEST_VERSION: usize = 6;
 
     // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
 
     pub const FILE_NAME: &'static str = "index_part.json";
 
     fn new(
-        layers_and_metadata: &HashMap<LayerFileName, LayerFileMetadata>,
+        layers_and_metadata: &HashMap<LayerName, LayerFileMetadata>,
         disk_consistent_lsn: Lsn,
         metadata: TimelineMetadata,
+        lineage: Lineage,
+        last_aux_file_policy: Option<AuxFilePolicy>,
     ) -> Self {
         let layer_metadata = layers_and_metadata
             .iter()
@@ -119,6 +138,8 @@ impl IndexPart {
             disk_consistent_lsn,
             metadata,
             deleted_at: None,
+            lineage,
+            last_aux_file_policy,
         }
     }
 
@@ -147,16 +168,29 @@ impl IndexPart {
             &HashMap::new(),
             example_metadata.disk_consistent_lsn(),
             example_metadata,
+            Default::default(),
+            Some(AuxFilePolicy::V1),
         )
     }
+
+    pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
+        self.last_aux_file_policy
+    }
 }
 
 impl From<&UploadQueueInitialized> for IndexPart {
     fn from(uq: &UploadQueueInitialized) -> Self {
         let disk_consistent_lsn = uq.latest_metadata.disk_consistent_lsn();
         let metadata = uq.latest_metadata.clone();
+        let lineage = uq.latest_lineage.clone();
 
-        Self::new(&uq.latest_files, disk_consistent_lsn, metadata)
+        Self::new(
+            &uq.latest_files,
+            disk_consistent_lsn,
+            metadata,
+            lineage,
+            uq.last_aux_file_policy,
+        )
     }
 }
 
@@ -184,8 +218,76 @@ impl From<&LayerFileMetadata> for IndexLayerMetadata {
     }
 }
 
+/// Limited history of earlier ancestors.
+///
+/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly
+/// reparented by having an later timeline be detached from it's ancestor.
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
+pub(crate) struct Lineage {
+    /// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`].
+    #[serde(skip_serializing_if = "is_false", default)]
+    reparenting_history_truncated: bool,
+
+    /// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`]
+    ///
+    /// These are stored in case we want to support WAL based DR on the timeline. There can be many
+    /// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings
+    /// after [`Self::original_ancestor`] has been set.
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    reparenting_history: Vec<TimelineId>,
+
+    /// The ancestor from which this timeline has been detached from and when.
+    ///
+    /// If you are adding support for detaching from a hierarchy, consider changing the ancestry
+    /// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
+}
+
+fn is_false(b: &bool) -> bool {
+    !b
+}
+
+impl Lineage {
+    const REMEMBER_AT_MOST: usize = 100;
+
+    pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) {
+        if self.reparenting_history.last() == Some(old_ancestor) {
+            // do not re-record it
+            return;
+        }
+
+        let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST;
+
+        self.reparenting_history_truncated |= drop_oldest;
+        if drop_oldest {
+            self.reparenting_history.remove(0);
+        }
+        self.reparenting_history.push(*old_ancestor);
+    }
+
+    pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) {
+        assert!(self.original_ancestor.is_none());
+
+        self.original_ancestor =
+            Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc()));
+    }
+
+    /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed
+    /// to start a read/write primary at this lsn".
+    ///
+    /// Returns true if the Lsn was previously a branch point.
+    pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool {
+        self.original_ancestor
+            .as_ref()
+            .is_some_and(|(_, ancestor_lsn, _)| lsn == *ancestor_lsn)
+    }
+}
+
 #[cfg(test)]
 mod tests {
+    use std::str::FromStr;
+
     use super::*;
 
     #[test]
@@ -221,6 +323,8 @@ mod tests {
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
             deleted_at: None,
+            lineage: Lineage::default(),
+            last_aux_file_policy: None,
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -261,6 +365,8 @@ mod tests {
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
             deleted_at: None,
+            lineage: Lineage::default(),
+            last_aux_file_policy: None,
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -302,7 +408,9 @@ mod tests {
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
             deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
+            lineage: Lineage::default(),
+            last_aux_file_policy: None,
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -347,6 +455,8 @@ mod tests {
             ])
             .unwrap(),
             deleted_at: None,
+            lineage: Lineage::default(),
+            last_aux_file_policy: None,
         };
 
         let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -385,11 +495,110 @@ mod tests {
             ]),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
+            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
+            lineage: Lineage::default(),
+            last_aux_file_policy: None,
         };
 
         let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
         assert_eq!(part, expected);
     }
+
+    #[test]
+    fn v5_indexpart_is_parsed() {
+        let example = r#"{
+            "version":5,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1},
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}},
+                "disk_consistent_lsn":"0/15A7618",
+                "metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+                "lineage":{
+                    "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
+                    "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
+                }
+        }"#;
+
+        let expected = IndexPart {
+            version: 5,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
+                    file_size: 23289856,
+                    generation: Generation::new(1),
+                    shard: ShardIndex::unsharded(),
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
+                    file_size: 1015808,
+                    generation: Generation::new(1),
+                    shard: ShardIndex::unsharded(),
+                })
+            ]),
+            disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(),
+            metadata: TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            deleted_at: None,
+            lineage: Lineage {
+                reparenting_history_truncated: false,
+                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
+                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
+            },
+            last_aux_file_policy: None,
+        };
+
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        assert_eq!(part, expected);
+    }
+
+    #[test]
+    fn v6_indexpart_is_parsed() {
+        let example = r#"{
+            "version":6,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+            "deleted_at": "2023-07-31T09:00:00.123",
+            "lineage":{
+                "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
+                "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
+            },
+            "last_aux_file_policy": "V2"
+        }"#;
+
+        let expected = IndexPart {
+            version: 6,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                    file_size: 25600000,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                    // serde_json should always parse this but this might be a double with jq for
+                    // example.
+                    file_size: 9007199254741001,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
+            lineage: Lineage {
+                reparenting_history_truncated: false,
+                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
+                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
+            },
+            last_aux_file_policy: Some(AuxFilePolicy::V2),
+        };
+
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        assert_eq!(part, expected);
+    }
+
+    fn parse_naive_datetime(s: &str) -> NaiveDateTime {
+        chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
+    }
 }
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 0bb25f0ace..252b6eb11b 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -6,11 +6,9 @@ mod scheduler;
 use std::{sync::Arc, time::SystemTime};
 
 use crate::{
-    config::PageServerConf,
     context::RequestContext,
     disk_usage_eviction_task::DiskUsageEvictionInfo,
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
-    virtual_file::MaybeFatalIo,
 };
 
 use self::{
@@ -21,9 +19,8 @@ use self::{
 use super::{
     config::{SecondaryLocationConfig, TenantConfOpt},
     mgr::TenantManager,
-    remote_timeline_client::LayerFileMetadata,
     span::debug_assert_current_span_has_tenant_id,
-    storage_layer::{layer::local_layer_path, LayerFileName},
+    storage_layer::LayerName,
 };
 
 use pageserver_api::{
@@ -178,13 +175,7 @@ impl SecondaryTenant {
 
     /// Cancellation safe, but on cancellation the eviction will go through
     #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
-    pub(crate) async fn evict_layer(
-        self: &Arc<Self>,
-        conf: &PageServerConf,
-        timeline_id: TimelineId,
-        name: LayerFileName,
-        metadata: LayerFileMetadata,
-    ) {
+    pub(crate) async fn evict_layer(self: &Arc<Self>, timeline_id: TimelineId, name: LayerName) {
         debug_assert_current_span_has_tenant_id();
 
         let guard = match self.gate.enter() {
@@ -197,41 +188,11 @@ impl SecondaryTenant {
 
         let now = SystemTime::now();
 
-        let local_path = local_layer_path(
-            conf,
-            &self.tenant_shard_id,
-            &timeline_id,
-            &name,
-            &metadata.generation,
-        );
-
         let this = self.clone();
 
         // spawn it to be cancellation safe
         tokio::task::spawn_blocking(move || {
             let _guard = guard;
-            // We tolerate ENOENT, because between planning eviction and executing
-            // it, the secondary downloader could have seen an updated heatmap that
-            // resulted in a layer being deleted.
-            // Other local I/O errors are process-fatal: these should never happen.
-            let deleted = std::fs::remove_file(local_path);
-
-            let not_found = deleted
-                .as_ref()
-                .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
-
-            let deleted = if not_found {
-                false
-            } else {
-                deleted
-                    .map(|()| true)
-                    .fatal_err("Deleting layer during eviction")
-            };
-
-            if !deleted {
-                // skip updating accounting and putting perhaps later timestamp
-                return;
-            }
 
             // Update the timeline's state.  This does not have to be synchronized with
             // the download process, because:
@@ -250,8 +211,15 @@ impl SecondaryTenant {
             // of the cache.
             let mut detail = this.detail.lock().unwrap();
             if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
-                timeline_detail.on_disk_layers.remove(&name);
-                timeline_detail.evicted_at.insert(name, now);
+                let removed = timeline_detail.on_disk_layers.remove(&name);
+
+                // We might race with removal of the same layer during downloads, if it was removed
+                // from the heatmap.  If we see that the OnDiskState is gone, then no need to
+                // do a physical deletion or store in evicted_at.
+                if let Some(removed) = removed {
+                    removed.remove_blocking();
+                    timeline_detail.evicted_at.insert(name, now);
+                }
             }
         })
         .await
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 092630e74d..870475eb57 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -22,11 +22,11 @@ use crate::{
             FAILED_REMOTE_OP_RETRIES,
         },
         span::debug_assert_current_span_has_tenant_id,
-        storage_layer::{layer::local_layer_path, LayerFileName},
+        storage_layer::{layer::local_layer_path, LayerName},
         tasks::{warn_when_period_overrun, BackgroundLoopKind},
     },
     virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
-    METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
+    TEMP_FILE_SUFFIX,
 };
 
 use super::{
@@ -45,10 +45,10 @@ use crate::tenant::{
 
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
-use futures::Future;
+use futures::{Future, StreamExt};
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
+use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};
 
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
@@ -62,14 +62,16 @@ use super::{
     CommandRequest, DownloadCommand,
 };
 
-/// For each tenant, how long must have passed since the last download_tenant call before
-/// calling it again.  This is approximately the time by which local data is allowed
-/// to fall behind remote data.
-///
-/// TODO: this should just be a default, and the actual period should be controlled
-/// via the heatmap itself
-/// `<ttps://github.com/neondatabase/neon/issues/6200>`
-const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
+/// For each tenant, default period for how long must have passed since the last download_tenant call before
+/// calling it again.  This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first
+/// download, if the uploader populated it.
+const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);
+
+/// Range of concurrency we may use when downloading layers within a timeline.  This is independent
+/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
+/// `PageServerConf::secondary_download_concurrency`
+const MAX_LAYER_CONCURRENCY: usize = 16;
+const MIN_LAYER_CONCURRENCY: usize = 1;
 
 pub(super) async fn downloader_task(
     tenant_manager: Arc<TenantManager>,
@@ -79,14 +81,15 @@ pub(super) async fn downloader_task(
     cancel: CancellationToken,
     root_ctx: RequestContext,
 ) {
-    let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
+    // How many tenants' secondary download operations we will run concurrently
+    let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;
 
     let generator = SecondaryDownloader {
         tenant_manager,
         remote_storage,
         root_ctx,
     };
-    let mut scheduler = Scheduler::new(generator, concurrency);
+    let mut scheduler = Scheduler::new(generator, tenant_concurrency);
 
     scheduler
         .run(command_queue, background_jobs_can_start, cancel)
@@ -104,6 +107,7 @@ struct SecondaryDownloader {
 pub(super) struct OnDiskState {
     metadata: LayerFileMetadata,
     access_time: SystemTime,
+    local_path: Utf8PathBuf,
 }
 
 impl OnDiskState {
@@ -111,23 +115,46 @@ impl OnDiskState {
         _conf: &'static PageServerConf,
         _tenant_shard_id: &TenantShardId,
         _imeline_id: &TimelineId,
-        _ame: LayerFileName,
+        _ame: LayerName,
         metadata: LayerFileMetadata,
         access_time: SystemTime,
+        local_path: Utf8PathBuf,
     ) -> Self {
         Self {
             metadata,
             access_time,
+            local_path,
         }
     }
+
+    // This is infallible, because all errors are either acceptable (ENOENT), or totally
+    // unexpected (fatal).
+    pub(super) fn remove_blocking(&self) {
+        // We tolerate ENOENT, because between planning eviction and executing
+        // it, the secondary downloader could have seen an updated heatmap that
+        // resulted in a layer being deleted.
+        // Other local I/O errors are process-fatal: these should never happen.
+        std::fs::remove_file(&self.local_path)
+            .or_else(fs_ext::ignore_not_found)
+            .fatal_err("Deleting secondary layer")
+    }
 }
 
 #[derive(Debug, Clone, Default)]
 pub(super) struct SecondaryDetailTimeline {
-    pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
+    pub(super) on_disk_layers: HashMap<LayerName, OnDiskState>,
 
     /// We remember when layers were evicted, to prevent re-downloading them.
-    pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
+    pub(super) evicted_at: HashMap<LayerName, SystemTime>,
+}
+
+// Aspects of a heatmap that we remember after downloading it
+#[derive(Clone, Debug)]
+struct DownloadSummary {
+    etag: Etag,
+    #[allow(unused)]
+    mtime: SystemTime,
+    upload_period: Duration,
 }
 
 /// This state is written by the secondary downloader, it is opaque
@@ -136,8 +163,7 @@ pub(super) struct SecondaryDetailTimeline {
 pub(super) struct SecondaryDetail {
     pub(super) config: SecondaryLocationConfig,
 
-    last_download: Option<Instant>,
-    last_etag: Option<Etag>,
+    last_download: Option<DownloadSummary>,
     next_download: Option<Instant>,
     pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
 }
@@ -167,7 +193,6 @@ impl SecondaryDetail {
         Self {
             config,
             last_download: None,
-            last_etag: None,
             next_download: None,
             timelines: HashMap::new(),
         }
@@ -221,9 +246,8 @@ impl SecondaryDetail {
 
 struct PendingDownload {
     secondary_state: Arc<SecondaryTenant>,
-    last_download: Option<Instant>,
+    last_download: Option<DownloadSummary>,
     target_time: Option<Instant>,
-    period: Option<Duration>,
 }
 
 impl scheduler::PendingJob for PendingDownload {
@@ -273,10 +297,17 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
 
         tracing::debug!("Secondary tenant download completed");
 
-        // Update freshened_at even if there was an error: we don't want errored tenants to implicitly
-        // take priority to run again.
         let mut detail = secondary_state.detail.lock().unwrap();
-        detail.next_download = Some(Instant::now() + period_jitter(DOWNLOAD_FRESHEN_INTERVAL, 5));
+
+        let period = detail
+            .last_download
+            .as_ref()
+            .map(|d| d.upload_period)
+            .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
+
+        // We advance next_download irrespective of errors: we don't want error cases to result in
+        // expensive busy-polling.
+        detail.next_download = Some(Instant::now() + period_jitter(period, 5));
     }
 
     async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
@@ -309,11 +340,11 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                     if detail.next_download.is_none() {
                         // Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times.  Subsequent
                         // rounds will use a smaller jitter to avoid accidentally synchronizing later.
-                        detail.next_download = Some(now.checked_add(period_warmup(DOWNLOAD_FRESHEN_INTERVAL)).expect(
+                        detail.next_download = Some(now.checked_add(period_warmup(DEFAULT_DOWNLOAD_INTERVAL)).expect(
                         "Using our constant, which is known to be small compared with clock range",
                     ));
                     }
-                    (detail.last_download, detail.next_download.unwrap())
+                    (detail.last_download.clone(), detail.next_download.unwrap())
                 };
 
                 if now > next_download {
@@ -321,7 +352,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                         secondary_state: secondary_tenant,
                         last_download,
                         target_time: Some(next_download),
-                        period: Some(DOWNLOAD_FRESHEN_INTERVAL),
                     })
                 } else {
                     None
@@ -347,7 +377,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
 
         Ok(PendingDownload {
             target_time: None,
-            period: None,
             last_download: None,
             secondary_state: tenant,
         })
@@ -364,7 +393,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
             secondary_state,
             last_download,
             target_time,
-            period,
         } = job;
 
         let (completion, barrier) = utils::completion::channel();
@@ -401,20 +429,15 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
 
             // If the job had a target execution time, we may check our final execution
             // time against that for observability purposes.
-            if let (Some(target_time), Some(period)) = (target_time, period) {
-                // Only track execution lag if this isn't our first download: otherwise, it is expected
-                // that execution will have taken longer than our configured interval, for example
-                // when starting up a pageserver and
-                if last_download.is_some() {
-                    // Elapsed time includes any scheduling lag as well as the execution of the job
-                    let elapsed = Instant::now().duration_since(target_time);
+            if let (Some(target_time), Some(last_download)) = (target_time, last_download) {
+                // Elapsed time includes any scheduling lag as well as the execution of the job
+                let elapsed = Instant::now().duration_since(target_time);
 
-                    warn_when_period_overrun(
-                        elapsed,
-                        period,
-                        BackgroundLoopKind::SecondaryDownload,
-                    );
-                }
+                warn_when_period_overrun(
+                    elapsed,
+                    last_download.upload_period,
+                    BackgroundLoopKind::SecondaryDownload,
+                );
             }
 
             CompleteDownload {
@@ -503,12 +526,12 @@ impl<'a> TenantDownloader<'a> {
         let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
 
         // We will use the etag from last successful download to make the download conditional on changes
-        let last_etag = self
+        let last_download = self
             .secondary_state
             .detail
             .lock()
             .unwrap()
-            .last_etag
+            .last_download
             .clone();
 
         // Download the tenant's heatmap
@@ -517,7 +540,7 @@ impl<'a> TenantDownloader<'a> {
             etag: heatmap_etag,
             bytes: heatmap_bytes,
         } = match tokio::select!(
-            bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
+            bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?},
             _ = self.secondary_state.cancel.cancelled() => return Ok(())
         ) {
             HeatMapDownload::Unmodified => {
@@ -546,6 +569,39 @@ impl<'a> TenantDownloader<'a> {
             heatmap.timelines.len()
         );
 
+        // Get or initialize the local disk state for the timelines we will update
+        let mut timeline_states = HashMap::new();
+        for timeline in &heatmap.timelines {
+            let timeline_state = self
+                .secondary_state
+                .detail
+                .lock()
+                .unwrap()
+                .timelines
+                .get(&timeline.timeline_id)
+                .cloned();
+
+            let timeline_state = match timeline_state {
+                Some(t) => t,
+                None => {
+                    // We have no existing state: need to scan local disk for layers first.
+                    let timeline_state =
+                        init_timeline_state(self.conf, tenant_shard_id, timeline).await;
+
+                    // Re-acquire detail lock now that we're done with async load from local FS
+                    self.secondary_state
+                        .detail
+                        .lock()
+                        .unwrap()
+                        .timelines
+                        .insert(timeline.timeline_id, timeline_state.clone());
+                    timeline_state
+                }
+            };
+
+            timeline_states.insert(timeline.timeline_id, timeline_state);
+        }
+
         // Clean up any local layers that aren't in the heatmap.  We do this first for all timelines, on the general
         // principle that deletions should be done before writes wherever possible, and so that we can use this
         // phase to initialize our SecondaryProgress.
@@ -556,6 +612,10 @@ impl<'a> TenantDownloader<'a> {
 
         // Download the layers in the heatmap
         for timeline in heatmap.timelines {
+            let timeline_state = timeline_states
+                .remove(&timeline.timeline_id)
+                .expect("Just populated above");
+
             if self.secondary_state.cancel.is_cancelled() {
                 tracing::debug!(
                     "Cancelled before downloading timeline {}",
@@ -565,7 +625,7 @@ impl<'a> TenantDownloader<'a> {
             }
 
             let timeline_id = timeline.timeline_id;
-            self.download_timeline(timeline, ctx)
+            self.download_timeline(timeline, timeline_state, ctx)
                 .instrument(tracing::info_span!(
                     "secondary_download_timeline",
                     tenant_id=%tenant_shard_id.tenant_id,
@@ -577,7 +637,30 @@ impl<'a> TenantDownloader<'a> {
 
         // Only update last_etag after a full successful download: this way will not skip
         // the next download, even if the heatmap's actual etag is unchanged.
-        self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
+        self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
+            etag: heatmap_etag,
+            mtime: heatmap_mtime,
+            upload_period: heatmap
+                .upload_period_ms
+                .map(|ms| Duration::from_millis(ms as u64))
+                .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
+        });
+
+        // Robustness: we should have updated progress properly, but in case we didn't, make sure
+        // we don't leave the tenant in a state where we claim to have successfully downloaded
+        // everything, but our progress is incomplete.  The invariant here should be that if
+        // we have set `last_download` to this heatmap's etag, then the next time we see that
+        // etag we can safely do no work (i.e. we must be complete).
+        let mut progress = self.secondary_state.progress.lock().unwrap();
+        debug_assert!(progress.layers_downloaded == progress.layers_total);
+        debug_assert!(progress.bytes_downloaded == progress.bytes_total);
+        if progress.layers_downloaded != progress.layers_total
+            || progress.bytes_downloaded != progress.bytes_total
+        {
+            tracing::warn!("Correcting drift in progress stats ({progress:?})");
+            progress.layers_downloaded = progress.layers_total;
+            progress.bytes_downloaded = progress.bytes_total;
+        }
 
         Ok(())
     }
@@ -754,6 +837,7 @@ impl<'a> TenantDownloader<'a> {
     async fn download_timeline(
         &self,
         timeline: HeatMapTimeline,
+        timeline_state: SecondaryDetailTimeline,
         ctx: &RequestContext,
     ) -> Result<(), UpdateError> {
         debug_assert_current_span_has_tenant_and_timeline_id();
@@ -762,36 +846,10 @@ impl<'a> TenantDownloader<'a> {
         // Accumulate updates to the state
         let mut touched = Vec::new();
 
-        // Clone a view of what layers already exist on disk
-        let timeline_state = self
-            .secondary_state
-            .detail
-            .lock()
-            .unwrap()
-            .timelines
-            .get(&timeline.timeline_id)
-            .cloned();
-
-        let timeline_state = match timeline_state {
-            Some(t) => t,
-            None => {
-                // We have no existing state: need to scan local disk for layers first.
-                let timeline_state =
-                    init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
-
-                // Re-acquire detail lock now that we're done with async load from local FS
-                self.secondary_state
-                    .detail
-                    .lock()
-                    .unwrap()
-                    .timelines
-                    .insert(timeline.timeline_id, timeline_state.clone());
-                timeline_state
-            }
-        };
-
         tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
 
+        let mut download_futs = Vec::new();
+
         // Download heatmap layers that are not present on local disk, or update their
         // access time if they are already present.
         for layer in timeline.layers {
@@ -807,20 +865,12 @@ impl<'a> TenantDownloader<'a> {
                 if cfg!(debug_assertions) {
                     // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
                     // are already present on disk are really there.
-                    let local_path = local_layer_path(
-                        self.conf,
-                        tenant_shard_id,
-                        &timeline.timeline_id,
-                        &layer.name,
-                        &layer.metadata.generation,
-                    );
-
-                    match tokio::fs::metadata(&local_path).await {
+                    match tokio::fs::metadata(&on_disk.local_path).await {
                         Ok(meta) => {
                             tracing::debug!(
                                 "Layer {} present at {}, size {}",
                                 layer.name,
-                                local_path,
+                                on_disk.local_path,
                                 meta.len(),
                             );
                         }
@@ -828,7 +878,7 @@ impl<'a> TenantDownloader<'a> {
                             tracing::warn!(
                                 "Layer {} not found at {} ({})",
                                 layer.name,
-                                local_path,
+                                on_disk.local_path,
                                 e
                             );
                             debug_assert!(false);
@@ -874,67 +924,33 @@ impl<'a> TenantDownloader<'a> {
                 }
             }
 
-            // Failpoint for simulating slow remote storage
-            failpoint_support::sleep_millis_async!(
-                "secondary-layer-download-sleep",
-                &self.secondary_state.cancel
-            );
-
-            // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
-            let downloaded_bytes = match download_layer_file(
-                self.conf,
-                self.remote_storage,
-                *tenant_shard_id,
-                timeline.timeline_id,
-                &layer.name,
-                &LayerFileMetadata::from(&layer.metadata),
-                &self.secondary_state.cancel,
+            download_futs.push(self.download_layer(
+                tenant_shard_id,
+                &timeline.timeline_id,
+                layer,
                 ctx,
-            )
-            .await
-            {
-                Ok(bytes) => bytes,
-                Err(DownloadError::NotFound) => {
-                    // A heatmap might be out of date and refer to a layer that doesn't exist any more.
-                    // This is harmless: continue to download the next layer. It is expected during compaction
-                    // GC.
-                    tracing::debug!(
-                        "Skipped downloading missing layer {}, raced with compaction/gc?",
-                        layer.name
-                    );
-                    continue;
+            ));
+        }
+
+        // Break up layer downloads into chunks, so that for each chunk we can re-check how much
+        // concurrency to use based on activity level of remote storage.
+        while !download_futs.is_empty() {
+            let chunk =
+                download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
+
+            let concurrency = Self::layer_concurrency(self.remote_storage.activity());
+
+            let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
+            let mut result_stream = std::pin::pin!(result_stream);
+            while let Some(result) = result_stream.next().await {
+                match result {
+                    Err(e) => return Err(e),
+                    Ok(None) => {
+                        // No error, but we didn't download the layer.  Don't mark it touched
+                    }
+                    Ok(Some(layer)) => touched.push(layer),
                 }
-                Err(e) => return Err(e.into()),
-            };
-
-            if downloaded_bytes != layer.metadata.file_size {
-                let local_path = local_layer_path(
-                    self.conf,
-                    tenant_shard_id,
-                    &timeline.timeline_id,
-                    &layer.name,
-                    &layer.metadata.generation,
-                );
-
-                tracing::warn!(
-                    "Downloaded layer {} with unexpected size {} != {}.  Removing download.",
-                    layer.name,
-                    downloaded_bytes,
-                    layer.metadata.file_size
-                );
-
-                tokio::fs::remove_file(&local_path)
-                    .await
-                    .or_else(fs_ext::ignore_not_found)?;
-            } else {
-                tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
-                let mut progress = self.secondary_state.progress.lock().unwrap();
-                progress.bytes_downloaded += downloaded_bytes;
-                progress.layers_downloaded += 1;
             }
-
-            SECONDARY_MODE.download_layer.inc();
-            touched.push(layer)
         }
 
         // Write updates to state to record layers we just downloaded or touched.
@@ -951,6 +967,13 @@ impl<'a> TenantDownloader<'a> {
                         v.get_mut().access_time = t.access_time;
                     }
                     Entry::Vacant(e) => {
+                        let local_path = local_layer_path(
+                            self.conf,
+                            tenant_shard_id,
+                            &timeline.timeline_id,
+                            &t.name,
+                            &t.metadata.generation,
+                        );
                         e.insert(OnDiskState::new(
                             self.conf,
                             tenant_shard_id,
@@ -958,6 +981,7 @@ impl<'a> TenantDownloader<'a> {
                             t.name,
                             LayerFileMetadata::from(&t.metadata),
                             t.access_time,
+                            local_path,
                         ));
                     }
                 }
@@ -966,6 +990,107 @@ impl<'a> TenantDownloader<'a> {
 
         Ok(())
     }
+
+    async fn download_layer(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+        layer: HeatMapLayer,
+        ctx: &RequestContext,
+    ) -> Result<Option<HeatMapLayer>, UpdateError> {
+        // Failpoint for simulating slow remote storage
+        failpoint_support::sleep_millis_async!(
+            "secondary-layer-download-sleep",
+            &self.secondary_state.cancel
+        );
+
+        let local_path = local_layer_path(
+            self.conf,
+            tenant_shard_id,
+            timeline_id,
+            &layer.name,
+            &layer.metadata.generation,
+        );
+
+        // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
+        let downloaded_bytes = match download_layer_file(
+            self.conf,
+            self.remote_storage,
+            *tenant_shard_id,
+            *timeline_id,
+            &layer.name,
+            &LayerFileMetadata::from(&layer.metadata),
+            &local_path,
+            &self.secondary_state.cancel,
+            ctx,
+        )
+        .await
+        {
+            Ok(bytes) => bytes,
+            Err(DownloadError::NotFound) => {
+                // A heatmap might be out of date and refer to a layer that doesn't exist any more.
+                // This is harmless: continue to download the next layer. It is expected during compaction
+                // GC.
+                tracing::debug!(
+                    "Skipped downloading missing layer {}, raced with compaction/gc?",
+                    layer.name
+                );
+
+                // If the layer is 404, adjust the progress statistics to reflect that we will not download it.
+                let mut progress = self.secondary_state.progress.lock().unwrap();
+                progress.layers_total = progress.layers_total.saturating_sub(1);
+                progress.bytes_total = progress
+                    .bytes_total
+                    .saturating_sub(layer.metadata.file_size);
+
+                return Ok(None);
+            }
+            Err(e) => return Err(e.into()),
+        };
+
+        if downloaded_bytes != layer.metadata.file_size {
+            let local_path = local_layer_path(
+                self.conf,
+                tenant_shard_id,
+                timeline_id,
+                &layer.name,
+                &layer.metadata.generation,
+            );
+
+            tracing::warn!(
+                "Downloaded layer {} with unexpected size {} != {}.  Removing download.",
+                layer.name,
+                downloaded_bytes,
+                layer.metadata.file_size
+            );
+
+            tokio::fs::remove_file(&local_path)
+                .await
+                .or_else(fs_ext::ignore_not_found)?;
+        } else {
+            tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
+            let mut progress = self.secondary_state.progress.lock().unwrap();
+            progress.bytes_downloaded += downloaded_bytes;
+            progress.layers_downloaded += 1;
+        }
+
+        SECONDARY_MODE.download_layer.inc();
+
+        Ok(Some(layer))
+    }
+
+    /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage
+    fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
+        // When less than 75% of units are available, use minimum concurrency.  Else, do a linear mapping
+        // of our concurrency range to the units available within the remaining 25%.
+        let clamp_at = (activity.read_total * 3) / 4;
+        if activity.read_available > clamp_at {
+            (MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at))
+                / (activity.read_total - clamp_at)
+        } else {
+            MIN_LAYER_CONCURRENCY
+        }
+    }
 }
 
 /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -997,7 +1122,7 @@ async fn init_timeline_state(
 
     // As we iterate through layers found on disk, we will look up their metadata from this map.
     // Layers not present in metadata will be discarded.
-    let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
+    let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
         heatmap.layers.iter().map(|l| (&l.name, l)).collect();
 
     while let Some(dentry) = dir
@@ -1015,11 +1140,7 @@ async fn init_timeline_state(
             .fatal_err(&format!("Read metadata on {}", file_path));
 
         let file_name = file_path.file_name().expect("created it from the dentry");
-        if file_name == METADATA_FILE_NAME {
-            // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
-            warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
-            continue;
-        } else if crate::is_temporary(&file_path)
+        if crate::is_temporary(&file_path)
             || is_temp_download_file(&file_path)
             || is_ephemeral_file(file_name)
         {
@@ -1034,7 +1155,7 @@ async fn init_timeline_state(
             continue;
         }
 
-        match LayerFileName::from_str(file_name) {
+        match LayerName::from_str(file_name) {
             Ok(name) => {
                 let remote_meta = heatmap_metadata.get(&name);
                 match remote_meta {
@@ -1061,6 +1182,7 @@ async fn init_timeline_state(
                                     name,
                                     LayerFileMetadata::from(&remote_meta.metadata),
                                     remote_meta.access_time,
+                                    file_path,
                                 ),
                             );
                         }
@@ -1092,3 +1214,58 @@ async fn init_timeline_state(
 
     detail
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn layer_concurrency() {
+        // Totally idle
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 16,
+                read_total: 16,
+                write_available: 16,
+                write_total: 16
+            }),
+            MAX_LAYER_CONCURRENCY
+        );
+
+        // Totally busy
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 0,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MIN_LAYER_CONCURRENCY
+        );
+
+        // Edge of the range at which we interpolate
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 12,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MIN_LAYER_CONCURRENCY
+        );
+
+        // Midpoint of the range in which we interpolate
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 14,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MAX_LAYER_CONCURRENCY / 2
+        );
+    }
+}
diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs
index 73cdf6c6d4..2da4a3b9d5 100644
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,8 +1,6 @@
 use std::time::SystemTime;
 
-use crate::tenant::{
-    remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
-};
+use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
 
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -17,6 +15,14 @@ pub(super) struct HeatMapTenant {
     pub(super) generation: Generation,
 
     pub(super) timelines: Vec<HeatMapTimeline>,
+
+    /// Uploaders provide their own upload period in the heatmap, as a hint to downloaders
+    /// of how frequently it is worthwhile to check for updates.
+    ///
+    /// This is optional for backward compat, and because we sometimes might upload
+    /// a heatmap explicitly via API for a tenant that has no periodic upload configured.
+    #[serde(default)]
+    pub(super) upload_period_ms: Option<u128>,
 }
 
 #[serde_as]
@@ -31,7 +37,7 @@ pub(crate) struct HeatMapTimeline {
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
-    pub(super) name: LayerFileName,
+    pub(super) name: LayerName,
     pub(super) metadata: IndexLayerMetadata,
 
     #[serde_as(as = "TimestampSeconds<i64>")]
@@ -42,7 +48,7 @@ pub(crate) struct HeatMapLayer {
 
 impl HeatMapLayer {
     pub(crate) fn new(
-        name: LayerFileName,
+        name: LayerName,
         metadata: IndexLayerMetadata,
         access_time: SystemTime,
     ) -> Self {
@@ -83,4 +89,21 @@ impl HeatMapTenant {
 
         stats
     }
+
+    pub(crate) fn strip_atimes(self) -> Self {
+        Self {
+            timelines: self
+                .timelines
+                .into_iter()
+                .map(|mut tl| {
+                    for layer in &mut tl.layers {
+                        layer.access_time = SystemTime::UNIX_EPOCH;
+                    }
+                    tl
+                })
+                .collect(),
+            generation: self.generation,
+            upload_period_ms: self.upload_period_ms,
+        }
+    }
 }
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index 352409f5fc..fddced3ead 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -80,7 +80,7 @@ impl RunningJob for WriteInProgress {
 
 struct UploadPending {
     tenant: Arc<Tenant>,
-    last_digest: Option<md5::Digest>,
+    last_upload: Option<LastUploadState>,
     target_time: Option<Instant>,
     period: Option<Duration>,
 }
@@ -94,7 +94,7 @@ impl scheduler::PendingJob for UploadPending {
 struct WriteComplete {
     tenant_shard_id: TenantShardId,
     completed_at: Instant,
-    digest: Option<md5::Digest>,
+    uploaded: Option<LastUploadState>,
     next_upload: Option<Instant>,
 }
 
@@ -115,10 +115,7 @@ struct UploaderTenantState {
     tenant: Weak<Tenant>,
 
     /// Digest of the serialized heatmap that we last successfully uploaded
-    ///
-    /// md5 is generally a bad hash.  We use it because it's convenient for interop with AWS S3's ETag,
-    /// which is also an md5sum.
-    last_digest: Option<md5::Digest>,
+    last_upload_state: Option<LastUploadState>,
 
     /// When the last upload attempt completed (may have been successful or failed)
     last_upload: Option<Instant>,
@@ -187,7 +184,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                     tenant: Arc::downgrade(&tenant),
                     last_upload: None,
                     next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)),
-                    last_digest: None,
+                    last_upload_state: None,
                 });
 
             // Decline to do the upload if insufficient time has passed
@@ -195,10 +192,10 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                 return;
             }
 
-            let last_digest = state.last_digest;
+            let last_upload = state.last_upload_state.clone();
             result.jobs.push(UploadPending {
                 tenant,
-                last_digest,
+                last_upload,
                 target_time: state.next_upload,
                 period: Some(period),
             });
@@ -218,7 +215,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
     ) {
         let UploadPending {
             tenant,
-            last_digest,
+            last_upload,
             target_time,
             period,
         } = job;
@@ -231,16 +228,16 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
             let _completion = completion;
 
             let started_at = Instant::now();
-            let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await {
-                Ok(UploadHeatmapOutcome::Uploaded(digest)) => {
+            let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await {
+                Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => {
                     let duration = Instant::now().duration_since(started_at);
                     SECONDARY_MODE
                         .upload_heatmap_duration
                         .observe(duration.as_secs_f64());
                     SECONDARY_MODE.upload_heatmap.inc();
-                    Some(digest)
+                    Some(uploaded)
                 }
-                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
+                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload,
                 Err(UploadHeatmapError::Upload(e)) => {
                     tracing::warn!(
                         "Failed to upload heatmap for tenant {}: {e:#}",
@@ -251,11 +248,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                         .upload_heatmap_duration
                         .observe(duration.as_secs_f64());
                     SECONDARY_MODE.upload_heatmap_errors.inc();
-                    last_digest
+                    last_upload
                 }
                 Err(UploadHeatmapError::Cancelled) => {
                     tracing::info!("Cancelled heatmap upload, shutting down");
-                    last_digest
+                    last_upload
                 }
             };
 
@@ -277,7 +274,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
             WriteComplete {
                     tenant_shard_id: *tenant.get_tenant_shard_id(),
                     completed_at: now,
-                    digest,
+                    uploaded,
                     next_upload,
                 }
         }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
@@ -299,7 +296,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
 
         Ok(UploadPending {
             // Ignore our state for last digest: this forces an upload even if nothing has changed
-            last_digest: None,
+            last_upload: None,
             tenant,
             target_time: None,
             period: None,
@@ -312,7 +309,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
         let WriteComplete {
             tenant_shard_id,
             completed_at,
-            digest,
+            uploaded,
             next_upload,
         } = completion;
         use std::collections::hash_map::Entry;
@@ -322,7 +319,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
             }
             Entry::Occupied(mut entry) => {
                 entry.get_mut().last_upload = Some(completed_at);
-                entry.get_mut().last_digest = digest;
+                entry.get_mut().last_upload_state = uploaded;
                 entry.get_mut().next_upload = next_upload
             }
         }
@@ -331,7 +328,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
 
 enum UploadHeatmapOutcome {
     /// We successfully wrote to remote storage, with this digest.
-    Uploaded(md5::Digest),
+    Uploaded(LastUploadState),
     /// We did not upload because the heatmap digest was unchanged since the last upload
     NoChange,
     /// We skipped the upload for some reason, such as tenant/timeline not ready
@@ -347,12 +344,25 @@ enum UploadHeatmapError {
     Upload(#[from] anyhow::Error),
 }
 
+/// Digests describing the heatmap we most recently uploaded successfully.
+///
+/// md5 is generally a bad hash.  We use it because it's convenient for interop with AWS S3's ETag,
+/// which is also an md5sum.
+#[derive(Clone)]
+struct LastUploadState {
+    // Digest of json-encoded HeatMapTenant
+    uploaded_digest: md5::Digest,
+
+    // Digest without atimes set.
+    layers_only_digest: md5::Digest,
+}
+
 /// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
 /// of the object we would have uploaded.
 async fn upload_tenant_heatmap(
     remote_storage: GenericRemoteStorage,
     tenant: &Arc<Tenant>,
-    last_digest: Option<md5::Digest>,
+    last_upload: Option<LastUploadState>,
 ) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
     debug_assert_current_span_has_tenant_id();
 
@@ -368,6 +378,7 @@ async fn upload_tenant_heatmap(
     let mut heatmap = HeatMapTenant {
         timelines: Vec::new(),
         generation,
+        upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()),
     };
     let timelines = tenant.timelines.lock().unwrap().clone();
 
@@ -396,15 +407,31 @@ async fn upload_tenant_heatmap(
 
     // Serialize the heatmap
     let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
-    let bytes = bytes::Bytes::from(bytes);
-    let size = bytes.len();
 
     // Drop out early if nothing changed since our last upload
     let digest = md5::compute(&bytes);
-    if Some(digest) == last_digest {
+    if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) {
         return Ok(UploadHeatmapOutcome::NoChange);
     }
 
+    // Calculate a digest that omits atimes, so that we can distinguish actual changes in
+    // layers from changes only in atimes.
+    let heatmap_size_bytes = heatmap.get_stats().bytes;
+    let layers_only_bytes =
+        serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?;
+    let layers_only_digest = md5::compute(&layers_only_bytes);
+    if heatmap_size_bytes < tenant.get_checkpoint_distance() {
+        // For small tenants, skip upload if only atimes changed. This avoids doing frequent
+        // uploads from long-idle tenants whose atimes are just incremented by periodic
+        // size calculations.
+        if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) {
+            return Ok(UploadHeatmapOutcome::NoChange);
+        }
+    }
+
+    let bytes = bytes::Bytes::from(bytes);
+    let size = bytes.len();
+
     let path = remote_heatmap_path(tenant.get_tenant_shard_id());
 
     let cancel = &tenant.cancel;
@@ -436,5 +463,8 @@ async fn upload_tenant_heatmap(
 
     tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
 
-    Ok(UploadHeatmapOutcome::Uploaded(digest))
+    Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {
+        uploaded_digest: digest,
+        layers_only_digest,
+    }))
 }
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 4f1b56ef9f..9ccf20c0d4 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -1,11 +1,11 @@
 //! Common traits and structs for layers
 
 pub mod delta_layer;
-mod filename;
 pub mod image_layer;
 pub(crate) mod inmemory_layer;
 pub(crate) mod layer;
 mod layer_desc;
+mod layer_name;
 
 use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Value;
@@ -34,10 +34,10 @@ use utils::rate_limit::RateLimit;
 use utils::{id::TimelineId, lsn::Lsn};
 
 pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
-pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
+pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
 
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 
@@ -113,12 +113,20 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
     }
 }
 
-/// Bag of data accumulated during a vectored get
+/// Bag of data accumulated during a vectored get..
 pub(crate) struct ValuesReconstructState {
+    /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
+    /// should not expect to get anything from this hashmap.
     pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
-
+    /// The keys which are already retrieved
     keys_done: KeySpaceRandomAccum,
+
+    /// The keys covered by the image layers
+    keys_with_image_coverage: Option<Range<Key>>,
+
+    // Statistics that are still accessible as a caller of `get_vectored_impl`.
     layers_visited: u32,
+    delta_layers_visited: u32,
 }
 
 impl ValuesReconstructState {
@@ -126,7 +134,9 @@ impl ValuesReconstructState {
         Self {
             keys: HashMap::new(),
             keys_done: KeySpaceRandomAccum::new(),
+            keys_with_image_coverage: None,
             layers_visited: 0,
+            delta_layers_visited: 0,
         }
     }
 
@@ -140,8 +150,17 @@ impl ValuesReconstructState {
         }
     }
 
-    pub(crate) fn on_layer_visited(&mut self) {
+    pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
         self.layers_visited += 1;
+        if let ReadableLayer::PersistentLayer(layer) = layer {
+            if layer.layer_desc().is_delta() {
+                self.delta_layers_visited += 1;
+            }
+        }
+    }
+
+    pub(crate) fn get_delta_layers_visited(&self) -> u32 {
+        self.delta_layers_visited
     }
 
     pub(crate) fn get_layers_visited(&self) -> u32 {
@@ -171,6 +190,16 @@ impl ValuesReconstructState {
         }
     }
 
+    /// On hitting image layer, we can mark all keys in this range as done, because
+    /// if the image layer does not contain a key, it is deleted/never added.
+    pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
+        let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
+        assert_eq!(
+            prev_val, None,
+            "should consume the keyspace before the next iteration"
+        );
+    }
+
     /// Update the state collected for a given key.
     /// Returns true if this was the last value needed for the key and false otherwise.
     ///
@@ -233,8 +262,12 @@ impl ValuesReconstructState {
 
     /// Returns the key space describing the keys that have
     /// been marked as completed since the last call to this function.
-    pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
-        self.keys_done.consume_keyspace()
+    /// Returns individual keys done, and the image layer coverage.
+    pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
+        (
+            self.keys_done.consume_keyspace(),
+            self.keys_with_image_coverage.take(),
+        )
     }
 }
 
@@ -646,8 +679,8 @@ pub mod tests {
 
     use super::*;
 
-    impl From<DeltaFileName> for PersistentLayerDesc {
-        fn from(value: DeltaFileName) -> Self {
+    impl From<DeltaLayerName> for PersistentLayerDesc {
+        fn from(value: DeltaLayerName) -> Self {
             PersistentLayerDesc::new_delta(
                 TenantShardId::from([0; 18]),
                 TimelineId::from_array([0; 16]),
@@ -658,8 +691,8 @@ pub mod tests {
         }
     }
 
-    impl From<ImageFileName> for PersistentLayerDesc {
-        fn from(value: ImageFileName) -> Self {
+    impl From<ImageLayerName> for PersistentLayerDesc {
+        fn from(value: ImageLayerName) -> Self {
             PersistentLayerDesc::new_img(
                 TenantShardId::from([0; 18]),
                 TimelineId::from_array([0; 16]),
@@ -670,11 +703,11 @@ pub mod tests {
         }
     }
 
-    impl From<LayerFileName> for PersistentLayerDesc {
-        fn from(value: LayerFileName) -> Self {
+    impl From<LayerName> for PersistentLayerDesc {
+        fn from(value: LayerName) -> Self {
             match value {
-                LayerFileName::Delta(d) => Self::from(d),
-                LayerFileName::Image(i) => Self::from(i),
+                LayerName::Delta(d) => Self::from(d),
+                LayerName::Image(i) => Self::from(i),
             }
         }
     }
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 6fd96b0e2f..1b3802840f 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -57,6 +57,7 @@ use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
+use std::str::FromStr;
 use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tracing::*;
@@ -68,7 +69,8 @@ use utils::{
 };
 
 use super::{
-    AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
+    AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer,
+    ValuesReconstructState,
 };
 
 ///
@@ -309,13 +311,13 @@ impl DeltaLayer {
             .and_then(|res| res)?;
 
         // not production code
-        let actual_filename = path.file_name().unwrap().to_owned();
-        let expected_filename = self.layer_desc().filename().file_name();
+        let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
+        let expected_layer_name = self.layer_desc().layer_name();
 
-        if actual_filename != expected_filename {
+        if actual_layer_name != expected_layer_name {
             println!("warning: filename does not match what is expected from in-file summary");
-            println!("actual: {:?}", actual_filename);
-            println!("expected: {:?}", expected_filename);
+            println!("actual: {:?}", actual_layer_name.to_string());
+            println!("expected: {:?}", expected_layer_name.to_string());
         }
 
         Ok(Arc::new(loaded))
@@ -392,6 +394,7 @@ impl DeltaLayerWriterInner {
         tenant_shard_id: TenantShardId,
         key_start: Key,
         lsn_range: Range<Lsn>,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         // Create the file initially with a temporary filename. We don't know
         // the end key yet, so we cannot form the final filename yet. We will
@@ -402,7 +405,7 @@ impl DeltaLayerWriterInner {
         let path =
             DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range);
 
-        let mut file = VirtualFile::create(&path).await?;
+        let mut file = VirtualFile::create(&path, ctx).await?;
         // make room for the header block
         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
         let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
@@ -584,6 +587,7 @@ impl DeltaLayerWriter {
         tenant_shard_id: TenantShardId,
         key_start: Key,
         lsn_range: Range<Lsn>,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         Ok(Self {
             inner: Some(
@@ -593,6 +597,7 @@ impl DeltaLayerWriter {
                     tenant_shard_id,
                     key_start,
                     lsn_range,
+                    ctx,
                 )
                 .await?,
             ),
@@ -699,6 +704,7 @@ impl DeltaLayer {
         let mut file = VirtualFile::open_with_options(
             path,
             virtual_file::OpenOptions::new().read(true).write(true),
+            ctx,
         )
         .await
         .with_context(|| format!("Failed to open file '{}'", path))?;
@@ -732,7 +738,7 @@ impl DeltaLayerInner {
         max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
         ctx: &RequestContext,
     ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
-        let file = match VirtualFile::open(path).await {
+        let file = match VirtualFile::open(path, ctx).await {
             Ok(file) => file,
             Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
         };
@@ -906,7 +912,7 @@ impl DeltaLayerInner {
         .await
         .map_err(GetVectoredError::Other)?;
 
-        self.do_reads_and_update_state(reads, reconstruct_state)
+        self.do_reads_and_update_state(reads, reconstruct_state, ctx)
             .await;
 
         reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);
@@ -1010,6 +1016,7 @@ impl DeltaLayerInner {
         &self,
         reads: Vec<VectoredRead>,
         reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
     ) {
         let vectored_blob_reader = VectoredBlobReader::new(&self.file);
         let mut ignore_key_with_err = None;
@@ -1027,7 +1034,7 @@ impl DeltaLayerInner {
         // track when a key is done.
         for read in reads.into_iter().rev() {
             let res = vectored_blob_reader
-                .read_blobs(&read, buf.take().expect("Should have a buffer"))
+                .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx)
                 .await;
 
             let blobs_buf = match res {
@@ -1272,7 +1279,7 @@ impl DeltaLayerInner {
 
                 buf.clear();
                 buf.reserve(read.size());
-                let res = reader.read_blobs(&read, buf).await?;
+                let res = reader.read_blobs(&read, buf, ctx).await?;
 
                 for blob in res.blobs {
                     let key = blob.meta.key;
@@ -1789,6 +1796,7 @@ mod test {
             harness.tenant_shard_id,
             entries_meta.key_range.start,
             entries_meta.lsn_range.clone(),
+            &ctx,
         )
         .await?;
 
@@ -1846,7 +1854,7 @@ mod test {
 
             for read in vectored_reads {
                 let blobs_buf = vectored_blob_reader
-                    .read_blobs(&read, buf.take().expect("Should have a buffer"))
+                    .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx)
                     .await?;
                 for meta in blobs_buf.blobs.iter() {
                     let value = &blobs_buf.buf[meta.start..meta.end];
@@ -1976,6 +1984,7 @@ mod test {
                 tenant.tenant_shard_id,
                 Key::MIN,
                 Lsn(0x11)..truncate_at,
+                ctx,
             )
             .await
             .unwrap();
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 1477a1fc33..becd1e7a6d 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -54,6 +54,7 @@ use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
+use std::str::FromStr;
 use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
@@ -65,8 +66,10 @@ use utils::{
     lsn::Lsn,
 };
 
-use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState};
+use super::layer_name::ImageLayerName;
+use super::{
+    AsLayerDesc, Layer, LayerName, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
+};
 
 ///
 /// Header stored in the beginning of the file
@@ -155,6 +158,7 @@ pub struct ImageLayerInner {
     index_start_blk: u32,
     index_root_blk: u32,
 
+    key_range: Range<Key>,
     lsn: Lsn,
 
     file: VirtualFile,
@@ -231,7 +235,7 @@ impl ImageLayer {
         conf: &PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
-        fname: &ImageFileName,
+        fname: &ImageLayerName,
     ) -> Utf8PathBuf {
         let rand_string: String = rand::thread_rng()
             .sample_iter(&Alphanumeric)
@@ -267,13 +271,13 @@ impl ImageLayer {
             .and_then(|res| res)?;
 
         // not production code
-        let actual_filename = path.file_name().unwrap().to_owned();
-        let expected_filename = self.layer_desc().filename().file_name();
+        let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap();
+        let expected_layer_name = self.layer_desc().layer_name();
 
-        if actual_filename != expected_filename {
+        if actual_layer_name != expected_layer_name {
             println!("warning: filename does not match what is expected from in-file summary");
-            println!("actual: {:?}", actual_filename);
-            println!("expected: {:?}", expected_filename);
+            println!("actual: {:?}", actual_layer_name.to_string());
+            println!("expected: {:?}", expected_layer_name.to_string());
         }
 
         Ok(loaded)
@@ -340,6 +344,7 @@ impl ImageLayer {
         let mut file = VirtualFile::open_with_options(
             path,
             virtual_file::OpenOptions::new().read(true).write(true),
+            ctx,
         )
         .await
         .with_context(|| format!("Failed to open file '{}'", path))?;
@@ -374,7 +379,7 @@ impl ImageLayerInner {
         max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
         ctx: &RequestContext,
     ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
-        let file = match VirtualFile::open(path).await {
+        let file = match VirtualFile::open(path, ctx).await {
             Ok(file) => file,
             Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
         };
@@ -415,6 +420,7 @@ impl ImageLayerInner {
             file,
             file_id,
             max_vectored_read_bytes,
+            key_range: actual_summary.key_range,
         }))
     }
 
@@ -471,9 +477,11 @@ impl ImageLayerInner {
             .await
             .map_err(GetVectoredError::Other)?;
 
-        self.do_reads_and_update_state(reads, reconstruct_state)
+        self.do_reads_and_update_state(reads, reconstruct_state, ctx)
             .await;
 
+        reconstruct_state.on_image_layer_visited(&self.key_range);
+
         Ok(())
     }
 
@@ -534,6 +542,7 @@ impl ImageLayerInner {
         &self,
         reads: Vec<VectoredRead>,
         reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
     ) {
         let max_vectored_read_bytes = self
             .max_vectored_read_bytes
@@ -562,7 +571,7 @@ impl ImageLayerInner {
             }
 
             let buf = BytesMut::with_capacity(buf_size);
-            let res = vectored_blob_reader.read_blobs(&read, buf).await;
+            let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await;
 
             match res {
                 Ok(blobs_buf) => {
@@ -628,6 +637,7 @@ impl ImageLayerWriterInner {
         tenant_shard_id: TenantShardId,
         key_range: &Range<Key>,
         lsn: Lsn,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         // Create the file initially with a temporary filename.
         // We'll atomically rename it to the final name when we're done.
@@ -635,7 +645,7 @@ impl ImageLayerWriterInner {
             conf,
             timeline_id,
             tenant_shard_id,
-            &ImageFileName {
+            &ImageLayerName {
                 key_range: key_range.clone(),
                 lsn,
             },
@@ -647,6 +657,7 @@ impl ImageLayerWriterInner {
                 virtual_file::OpenOptions::new()
                     .write(true)
                     .create_new(true),
+                ctx,
             )
             .await?
         };
@@ -801,10 +812,11 @@ impl ImageLayerWriter {
         tenant_shard_id: TenantShardId,
         key_range: &Range<Key>,
         lsn: Lsn,
+        ctx: &RequestContext,
     ) -> anyhow::Result<ImageLayerWriter> {
         Ok(Self {
             inner: Some(
-                ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn)
+                ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx)
                     .await?,
             ),
         })
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 4dacbec2f3..9553f83026 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -473,10 +473,11 @@ impl InMemoryLayer {
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         start_lsn: Lsn,
+        ctx: &RequestContext,
     ) -> Result<InMemoryLayer> {
         trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
 
-        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
+        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?;
         let key = InMemoryLayerFileId(file.page_cache_file_id());
 
         Ok(InMemoryLayer {
@@ -642,6 +643,7 @@ impl InMemoryLayer {
             self.tenant_shard_id,
             Key::MIN,
             self.start_lsn..end_lsn,
+            ctx,
         )
         .await?;
 
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index b5e69db7f4..8c64621710 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime};
 use tracing::Instrument;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
-use utils::sync::heavier_once_cell;
+use utils::sync::{gate, heavier_once_cell};
 
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -25,7 +25,7 @@ use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};
 use super::delta_layer::{self, DeltaEntry};
 use super::image_layer;
 use super::{
-    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
+    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
     ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };
 
@@ -128,19 +128,17 @@ pub(crate) fn local_layer_path(
     conf: &PageServerConf,
     tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
-    layer_file_name: &LayerFileName,
-    _generation: &Generation,
+    layer_file_name: &LayerName,
+    generation: &Generation,
 ) -> Utf8PathBuf {
     let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);
 
-    timeline_path.join(layer_file_name.file_name())
-
-    // TOOD: include generation in the name in now+1 releases.
-    // timeline_path.join(format!(
-    //     "{}{}",
-    //     layer_file_name.file_name(),
-    //     generation.get_suffix()
-    // ))
+    if generation.is_none() {
+        // Without a generation, we may only use legacy path style
+        timeline_path.join(layer_file_name.to_string())
+    } else {
+        timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
+    }
 }
 
 impl Layer {
@@ -148,7 +146,7 @@ impl Layer {
     pub(crate) fn for_evicted(
         conf: &'static PageServerConf,
         timeline: &Arc<Timeline>,
-        file_name: LayerFileName,
+        file_name: LayerName,
         metadata: LayerFileMetadata,
     ) -> Self {
         let local_path = local_layer_path(
@@ -189,7 +187,7 @@ impl Layer {
         conf: &'static PageServerConf,
         timeline: &Arc<Timeline>,
         local_path: Utf8PathBuf,
-        file_name: LayerFileName,
+        file_name: LayerName,
         metadata: LayerFileMetadata,
     ) -> ResidentLayer {
         let desc = PersistentLayerDesc::from_filename(
@@ -261,7 +259,7 @@ impl Layer {
                 conf,
                 &timeline.tenant_shard_id,
                 &timeline.timeline_id,
-                &desc.filename(),
+                &desc.layer_name(),
                 &timeline.generation,
             );
 
@@ -587,9 +585,6 @@ struct LayerInner {
     /// [`Timeline::gate`] at the same time.
     timeline: Weak<Timeline>,
 
-    /// Cached knowledge of [`Timeline::remote_client`] being `Some`.
-    have_remote_client: bool,
-
     access_stats: LayerAccessStats,
 
     /// This custom OnceCell is backed by std mutex, but only held for short time periods.
@@ -689,7 +684,7 @@ impl Drop for LayerInner {
         let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
 
         let path = std::mem::take(&mut self.path);
-        let file_name = self.layer_desc().filename();
+        let file_name = self.layer_desc().layer_name();
         let file_size = self.layer_desc().file_size;
         let timeline = self.timeline.clone();
         let meta = self.metadata();
@@ -734,23 +729,23 @@ impl Drop for LayerInner {
             if removed {
                 timeline.metrics.resident_physical_size_sub(file_size);
             }
-            if let Some(remote_client) = timeline.remote_client.as_ref() {
-                let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
+            let res = timeline
+                .remote_client
+                .schedule_deletion_of_unlinked(vec![(file_name, meta)]);
 
-                if let Err(e) = res {
-                    // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
-                    // demonstrating this deadlock (without spawn_blocking): stop will drop
-                    // queued items, which will have ResidentLayer's, and those drops would try
-                    // to re-entrantly lock the RemoteTimelineClient inner state.
-                    if !timeline.is_active() {
-                        tracing::info!("scheduling deletion on drop failed: {e:#}");
-                    } else {
-                        tracing::warn!("scheduling deletion on drop failed: {e:#}");
-                    }
-                    LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
+            if let Err(e) = res {
+                // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
+                // demonstrating this deadlock (without spawn_blocking): stop will drop
+                // queued items, which will have ResidentLayer's, and those drops would try
+                // to re-entrantly lock the RemoteTimelineClient inner state.
+                if !timeline.is_active() {
+                    tracing::info!("scheduling deletion on drop failed: {e:#}");
                 } else {
-                    LAYER_IMPL_METRICS.inc_completed_deletes();
+                    tracing::warn!("scheduling deletion on drop failed: {e:#}");
                 }
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
+            } else {
+                LAYER_IMPL_METRICS.inc_completed_deletes();
             }
         });
     }
@@ -782,11 +777,12 @@ impl LayerInner {
 
         LayerInner {
             conf,
-            debug_str: { format!("timelines/{}/{}", timeline.timeline_id, desc.filename()).into() },
+            debug_str: {
+                format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into()
+            },
             path: local_path,
             desc,
             timeline: Arc::downgrade(timeline),
-            have_remote_client: timeline.remote_client.is_some(),
             access_stats,
             wanted_deleted: AtomicBool::new(false),
             inner,
@@ -815,8 +811,6 @@ impl LayerInner {
     /// in a new attempt to evict OR join the previously started attempt.
     #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
     pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
-        assert!(self.have_remote_client);
-
         let mut rx = self.status.as_ref().unwrap().subscribe();
 
         {
@@ -973,10 +967,6 @@ impl LayerInner {
             return Err(DownloadError::NotFile(ft));
         }
 
-        if timeline.remote_client.as_ref().is_none() {
-            return Err(DownloadError::NoRemoteStorage);
-        }
-
         if let Some(ctx) = ctx {
             self.check_expected_download(ctx)?;
         }
@@ -1113,15 +1103,12 @@ impl LayerInner {
         permit: heavier_once_cell::InitPermit,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<DownloadedLayer>> {
-        let client = timeline
+        let result = timeline
             .remote_client
-            .as_ref()
-            .expect("checked before download_init_and_wait");
-
-        let result = client
             .download_layer_file(
-                &self.desc.filename(),
+                &self.desc.layer_name(),
                 &self.metadata(),
+                &self.path,
                 &timeline.cancel,
                 ctx,
             )
@@ -1257,7 +1244,7 @@ impl LayerInner {
     }
 
     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
-        let layer_file_name = self.desc.filename().file_name();
+        let layer_name = self.desc.layer_name().to_string();
 
         let resident = self
             .inner
@@ -1271,18 +1258,19 @@ impl LayerInner {
             let lsn_range = &self.desc.lsn_range;
 
             HistoricLayerInfo::Delta {
-                layer_file_name,
+                layer_file_name: layer_name,
                 layer_file_size: self.desc.file_size,
                 lsn_start: lsn_range.start,
                 lsn_end: lsn_range.end,
                 remote: !resident,
                 access_stats,
+                l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
             }
         } else {
             let lsn = self.desc.image_layer_lsn();
 
             HistoricLayerInfo::Image {
-                layer_file_name,
+                layer_file_name: layer_name,
                 layer_file_size: self.desc.file_size,
                 lsn_start: lsn,
                 remote: !resident,
@@ -1293,20 +1281,10 @@ impl LayerInner {
 
     /// `DownloadedLayer` is being dropped, so it calls this method.
     fn on_downloaded_layer_drop(self: Arc<LayerInner>, only_version: usize) {
-        let can_evict = self.have_remote_client;
-
         // we cannot know without inspecting LayerInner::inner if we should evict or not, even
         // though here it is very likely
         let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version);
 
-        if !can_evict {
-            // it would be nice to assert this case out, but we are in drop
-            span.in_scope(|| {
-                tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage");
-            });
-            return;
-        }
-
         // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
         // drop while the `self.inner` is being locked, leading to a deadlock.
 
@@ -1355,7 +1333,7 @@ impl LayerInner {
 
         is_good_to_continue(&rx.borrow_and_update())?;
 
-        let Ok(_gate) = timeline.gate.enter() else {
+        let Ok(gate) = timeline.gate.enter() else {
             return Err(EvictionCancelled::TimelineGone);
         };
 
@@ -1443,7 +1421,7 @@ impl LayerInner {
         Self::spawn_blocking(move || {
             let _span = span.entered();
 
-            let res = self.evict_blocking(&timeline, &permit);
+            let res = self.evict_blocking(&timeline, &gate, &permit);
 
             let waiters = self.inner.initializer_count();
 
@@ -1469,6 +1447,7 @@ impl LayerInner {
     fn evict_blocking(
         &self,
         timeline: &Timeline,
+        _gate: &gate::GateGuard,
         _permit: &heavier_once_cell::InitPermit,
     ) -> Result<(), EvictionCancelled> {
         // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
@@ -1578,8 +1557,6 @@ pub(crate) enum EvictionError {
 pub(crate) enum DownloadError {
     #[error("timeline has already shutdown")]
     TimelineShutdown,
-    #[error("no remote storage configured")]
-    NoRemoteStorage,
     #[error("context denies downloading")]
     ContextAndConfigReallyDeniesDownloads,
     #[error("downloading is really required but not allowed by this method")]
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index 52f62faa8d..fa9142d5e9 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -145,7 +145,7 @@ async fn smoke_test() {
         .await
         .expect("the local layer file still exists");
 
-    let rtc = timeline.remote_client.as_ref().unwrap();
+    let rtc = &timeline.remote_client;
 
     {
         let layers = &[layer];
@@ -761,13 +761,7 @@ async fn eviction_cancellation_on_drop() {
     timeline.freeze_and_flush().await.unwrap();
 
     // wait for the upload to complete so our Arc::strong_count assertion holds
-    timeline
-        .remote_client
-        .as_ref()
-        .unwrap()
-        .wait_completion()
-        .await
-        .unwrap();
+    timeline.remote_client.wait_completion().await.unwrap();
 
     let (evicted_layer, not_evicted) = {
         let mut layers = {
diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs
index c375923e81..a89b66e4a1 100644
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -5,7 +5,7 @@ use utils::{id::TimelineId, lsn::Lsn};
 
 use crate::repository::Key;
 
-use super::{DeltaFileName, ImageFileName, LayerFileName};
+use super::{DeltaLayerName, ImageLayerName, LayerName};
 
 use serde::{Deserialize, Serialize};
 
@@ -51,7 +51,7 @@ impl PersistentLayerDesc {
     }
 
     pub fn short_id(&self) -> impl Display {
-        self.filename()
+        self.layer_name()
     }
 
     #[cfg(test)]
@@ -103,14 +103,14 @@ impl PersistentLayerDesc {
     pub fn from_filename(
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
-        filename: LayerFileName,
+        filename: LayerName,
         file_size: u64,
     ) -> Self {
         match filename {
-            LayerFileName::Image(i) => {
+            LayerName::Image(i) => {
                 Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size)
             }
-            LayerFileName::Delta(d) => Self::new_delta(
+            LayerName::Delta(d) => Self::new_delta(
                 tenant_shard_id,
                 timeline_id,
                 d.key_range,
@@ -132,34 +132,34 @@ impl PersistentLayerDesc {
         lsn..(lsn + 1)
     }
 
-    /// Get a delta file name for this layer.
+    /// Get a delta layer name for this layer.
     ///
     /// Panic: if this is not a delta layer.
-    pub fn delta_file_name(&self) -> DeltaFileName {
+    pub fn delta_layer_name(&self) -> DeltaLayerName {
         assert!(self.is_delta);
-        DeltaFileName {
+        DeltaLayerName {
             key_range: self.key_range.clone(),
             lsn_range: self.lsn_range.clone(),
         }
     }
 
-    /// Get a delta file name for this layer.
+    /// Get a image layer name for this layer.
     ///
     /// Panic: if this is not an image layer, or the lsn range is invalid
-    pub fn image_file_name(&self) -> ImageFileName {
+    pub fn image_layer_name(&self) -> ImageLayerName {
         assert!(!self.is_delta);
         assert!(self.lsn_range.start + 1 == self.lsn_range.end);
-        ImageFileName {
+        ImageLayerName {
             key_range: self.key_range.clone(),
             lsn: self.lsn_range.start,
         }
     }
 
-    pub fn filename(&self) -> LayerFileName {
+    pub fn layer_name(&self) -> LayerName {
         if self.is_delta {
-            self.delta_file_name().into()
+            self.delta_layer_name().into()
         } else {
-            self.image_file_name().into()
+            self.image_layer_name().into()
         }
     }
 
diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
similarity index 72%
rename from pageserver/src/tenant/storage_layer/filename.rs
rename to pageserver/src/tenant/storage_layer/layer_name.rs
index fff66a9d07..da26e1eeb7 100644
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -15,29 +15,29 @@ use super::PersistentLayerDesc;
 
 // Note: Timeline::load_layer_map() relies on this sort order
 #[derive(PartialEq, Eq, Clone, Hash)]
-pub struct DeltaFileName {
+pub struct DeltaLayerName {
     pub key_range: Range<Key>,
     pub lsn_range: Range<Lsn>,
 }
 
-impl std::fmt::Debug for DeltaFileName {
+impl std::fmt::Debug for DeltaLayerName {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         use super::RangeDisplayDebug;
 
-        f.debug_struct("DeltaFileName")
+        f.debug_struct("DeltaLayerName")
             .field("key_range", &RangeDisplayDebug(&self.key_range))
             .field("lsn_range", &self.lsn_range)
             .finish()
     }
 }
 
-impl PartialOrd for DeltaFileName {
+impl PartialOrd for DeltaLayerName {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl Ord for DeltaFileName {
+impl Ord for DeltaLayerName {
     fn cmp(&self, other: &Self) -> Ordering {
         let mut cmp = self.key_range.start.cmp(&other.key_range.start);
         if cmp != Ordering::Equal {
@@ -57,16 +57,14 @@ impl Ord for DeltaFileName {
     }
 }
 
-/// Represents the filename of a DeltaLayer
+/// Represents the region of the LSN-Key space covered by a DeltaLayer
 ///
 /// ```text
 ///    <key start>-<key end>__<LSN start>-<LSN end>
 /// ```
-impl DeltaFileName {
-    ///
-    /// Parse a string as a delta file name. Returns None if the filename does not
-    /// match the expected pattern.
-    ///
+impl DeltaLayerName {
+    /// Parse the part of a delta layer's file name that represents the LayerName. Returns None
+    /// if the filename does not match the expected pattern.
     pub fn parse_str(fname: &str) -> Option<Self> {
         let mut parts = fname.split("__");
         let mut key_parts = parts.next()?.split('-');
@@ -105,14 +103,14 @@ impl DeltaFileName {
             // or panic?
         }
 
-        Some(DeltaFileName {
+        Some(DeltaLayerName {
             key_range: key_start..key_end,
             lsn_range: start_lsn..end_lsn,
         })
     }
 }
 
-impl fmt::Display for DeltaFileName {
+impl fmt::Display for DeltaLayerName {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(
             f,
@@ -126,29 +124,29 @@ impl fmt::Display for DeltaFileName {
 }
 
 #[derive(PartialEq, Eq, Clone, Hash)]
-pub struct ImageFileName {
+pub struct ImageLayerName {
     pub key_range: Range<Key>,
     pub lsn: Lsn,
 }
 
-impl std::fmt::Debug for ImageFileName {
+impl std::fmt::Debug for ImageLayerName {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         use super::RangeDisplayDebug;
 
-        f.debug_struct("ImageFileName")
+        f.debug_struct("ImageLayerName")
             .field("key_range", &RangeDisplayDebug(&self.key_range))
             .field("lsn", &self.lsn)
             .finish()
     }
 }
 
-impl PartialOrd for ImageFileName {
+impl PartialOrd for ImageLayerName {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl Ord for ImageFileName {
+impl Ord for ImageLayerName {
     fn cmp(&self, other: &Self) -> Ordering {
         let mut cmp = self.key_range.start.cmp(&other.key_range.start);
         if cmp != Ordering::Equal {
@@ -164,7 +162,7 @@ impl Ord for ImageFileName {
     }
 }
 
-impl ImageFileName {
+impl ImageLayerName {
     pub fn lsn_as_range(&self) -> Range<Lsn> {
         // Saves from having to copypaste this all over
         PersistentLayerDesc::image_layer_lsn_range(self.lsn)
@@ -172,16 +170,14 @@ impl ImageFileName {
 }
 
 ///
-/// Represents the filename of an ImageLayer
+/// Represents the part of the Key-LSN space covered by an ImageLayer
 ///
 /// ```text
 ///    <key start>-<key end>__<LSN>
 /// ```
-impl ImageFileName {
-    ///
-    /// Parse a string as an image file name. Returns None if the filename does not
-    /// match the expected pattern.
-    ///
+impl ImageLayerName {
+    /// Parse a string as then LayerName part of an image layer file name. Returns None if the
+    /// filename does not match the expected pattern.
     pub fn parse_str(fname: &str) -> Option<Self> {
         let mut parts = fname.split("__");
         let mut key_parts = parts.next()?.split('-');
@@ -202,14 +198,14 @@ impl ImageFileName {
 
         let lsn = Lsn::from_hex(lsn_str).ok()?;
 
-        Some(ImageFileName {
+        Some(ImageLayerName {
             key_range: key_start..key_end,
             lsn,
         })
     }
 }
 
-impl fmt::Display for ImageFileName {
+impl fmt::Display for ImageLayerName {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(
             f,
@@ -220,21 +216,24 @@ impl fmt::Display for ImageFileName {
         )
     }
 }
+
+/// LayerName is the logical identity of a layer within a LayerMap at a moment in time.  The
+/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations
+/// over time (e.g. across shard splits or compression). The physical filenames of layers in local
+/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
+/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
+/// and [`crate::tenant::storage_layer::layer::local_layer_path`])
 #[derive(Debug, PartialEq, Eq, Hash, Clone)]
-pub enum LayerFileName {
-    Image(ImageFileName),
-    Delta(DeltaFileName),
+pub enum LayerName {
+    Image(ImageLayerName),
+    Delta(DeltaLayerName),
 }
 
-impl LayerFileName {
-    pub fn file_name(&self) -> String {
-        self.to_string()
-    }
-
+impl LayerName {
     /// Determines if this layer file is considered to be in future meaning we will discard these
     /// layers during timeline initialization from the given disk_consistent_lsn.
     pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool {
-        use LayerFileName::*;
+        use LayerName::*;
         match self {
             Image(file_name) if file_name.lsn > disk_consistent_lsn => true,
             Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true,
@@ -243,7 +242,7 @@ impl LayerFileName {
     }
 
     pub(crate) fn kind(&self) -> &'static str {
-        use LayerFileName::*;
+        use LayerName::*;
         match self {
             Delta(_) => "delta",
             Image(_) => "image",
@@ -251,7 +250,7 @@ impl LayerFileName {
     }
 }
 
-impl fmt::Display for LayerFileName {
+impl fmt::Display for LayerName {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
             Self::Image(fname) => write!(f, "{fname}"),
@@ -260,25 +259,25 @@ impl fmt::Display for LayerFileName {
     }
 }
 
-impl From<ImageFileName> for LayerFileName {
-    fn from(fname: ImageFileName) -> Self {
+impl From<ImageLayerName> for LayerName {
+    fn from(fname: ImageLayerName) -> Self {
         Self::Image(fname)
     }
 }
-impl From<DeltaFileName> for LayerFileName {
-    fn from(fname: DeltaFileName) -> Self {
+impl From<DeltaLayerName> for LayerName {
+    fn from(fname: DeltaLayerName) -> Self {
         Self::Delta(fname)
     }
 }
 
-impl FromStr for LayerFileName {
+impl FromStr for LayerName {
     type Err = String;
 
     /// Conversion from either a physical layer filename, or the string-ization of
     /// Self. When loading a physical layer filename, we drop any extra information
     /// not needed to build Self.
     fn from_str(value: &str) -> Result<Self, Self::Err> {
-        let gen_suffix_regex = Regex::new("^(?<base>.+)-(?<gen>[0-9a-f]{8})$").unwrap();
+        let gen_suffix_regex = Regex::new("^(?<base>.+)(?<gen>-v1-[0-9a-f]{8})$").unwrap();
         let file_name: Cow<str> = match gen_suffix_regex.captures(value) {
             Some(captures) => captures
                 .name("base")
@@ -288,8 +287,8 @@ impl FromStr for LayerFileName {
             None => value.into(),
         };
 
-        let delta = DeltaFileName::parse_str(&file_name);
-        let image = ImageFileName::parse_str(&file_name);
+        let delta = DeltaLayerName::parse_str(&file_name);
+        let image = ImageLayerName::parse_str(&file_name);
         let ok = match (delta, image) {
             (None, None) => {
                 return Err(format!(
@@ -304,7 +303,7 @@ impl FromStr for LayerFileName {
     }
 }
 
-impl serde::Serialize for LayerFileName {
+impl serde::Serialize for LayerName {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where
         S: serde::Serializer,
@@ -316,19 +315,19 @@ impl serde::Serialize for LayerFileName {
     }
 }
 
-impl<'de> serde::Deserialize<'de> for LayerFileName {
+impl<'de> serde::Deserialize<'de> for LayerName {
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
     where
         D: serde::Deserializer<'de>,
     {
-        deserializer.deserialize_string(LayerFileNameVisitor)
+        deserializer.deserialize_string(LayerNameVisitor)
     }
 }
 
-struct LayerFileNameVisitor;
+struct LayerNameVisitor;
 
-impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
-    type Value = LayerFileName;
+impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
+    type Value = LayerName;
 
     fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
         write!(
@@ -348,37 +347,33 @@ impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor {
 mod test {
     use super::*;
     #[test]
-    fn image_layer_parse() -> anyhow::Result<()> {
-        let expected = LayerFileName::Image(ImageFileName {
+    fn image_layer_parse() {
+        let expected = LayerName::Image(ImageLayerName {
             key_range: Key::from_i128(0)
                 ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
             lsn: Lsn::from_hex("00000000014FED58").unwrap(),
         });
-        let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
         assert_eq!(parsed, expected,);
 
         // Omitting generation suffix is valid
-        let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
         assert_eq!(parsed, expected,);
-
-        Ok(())
     }
 
     #[test]
-    fn delta_layer_parse() -> anyhow::Result<()> {
-        let expected = LayerFileName::Delta(DeltaFileName {
+    fn delta_layer_parse() {
+        let expected = LayerName::Delta(DeltaLayerName {
             key_range: Key::from_i128(0)
                 ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
             lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
                 ..Lsn::from_hex("000000000154C481").unwrap(),
         });
-        let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
         assert_eq!(parsed, expected);
 
         // Omitting generation suffix is valid
-        let parsed = LayerFileName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
         assert_eq!(parsed, expected);
-
-        Ok(())
     }
 }
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index f153719f98..ba2b8afd03 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -41,7 +41,7 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore
         tokio::sync::Semaphore::new(permits)
     });
 
-#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)]
+#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr, enum_map::Enum)]
 #[strum(serialize_all = "snake_case")]
 pub(crate) enum BackgroundLoopKind {
     Compaction,
@@ -57,19 +57,25 @@ pub(crate) enum BackgroundLoopKind {
 
 impl BackgroundLoopKind {
     fn as_static_str(&self) -> &'static str {
-        let s: &'static str = self.into();
-        s
+        self.into()
     }
 }
 
+static PERMIT_GAUGES: once_cell::sync::Lazy<
+    enum_map::EnumMap<BackgroundLoopKind, metrics::IntCounterPair>,
+> = once_cell::sync::Lazy::new(|| {
+    enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+        let kind = <BackgroundLoopKind as enum_map::Enum>::from_usize(i);
+        crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()])
+    }))
+});
+
 /// Cancellation safe.
 pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
     loop_kind: BackgroundLoopKind,
     _ctx: &RequestContext,
 ) -> tokio::sync::SemaphorePermit<'static> {
-    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
-        .with_label_values(&[loop_kind.as_static_str()])
-        .guard();
+    let _guard = PERMIT_GAUGES[loop_kind].guard();
 
     pausable_failpoint!(
         "initial-size-calculation-permit-pause",
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index d6d012c70c..881e7f8f3c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -18,14 +18,14 @@ use fail::fail_point;
 use once_cell::sync::Lazy;
 use pageserver_api::{
     key::{
-        AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
-        NON_INHERITED_SPARSE_RANGE,
+        AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
+        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
     },
-    keyspace::{KeySpaceAccum, SparseKeyPartitioning},
+    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
     models::{
-        AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
-        DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
-        TimelineState,
+        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
+        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
+        InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
     },
     reltag::BlockNumber,
     shard::{ShardIdentity, ShardNumber, TenantShardId},
@@ -60,10 +60,12 @@ use std::{
     ops::ControlFlow,
 };
 
-use crate::tenant::storage_layer::layer::local_layer_path;
-use crate::tenant::{
-    layer_map::{LayerMap, SearchResult},
-    metadata::TimelineMetadata,
+use crate::{
+    aux_file::AuxFileSizeEstimator,
+    tenant::{
+        layer_map::{LayerMap, SearchResult},
+        metadata::TimelineMetadata,
+    },
 };
 use crate::{
     context::{DownloadBehavior, RequestContext},
@@ -75,7 +77,7 @@ use crate::{
     disk_usage_eviction_task::finite_f32,
     tenant::storage_layer::{
         AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
-        LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
+        LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult,
         ValueReconstructState, ValuesReconstructState,
     },
 };
@@ -86,6 +88,9 @@ use crate::{
     metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
+use crate::{
+    pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata,
+};
 use crate::{
     pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
     virtual_file::{MaybeFatalIo, VirtualFile},
@@ -197,7 +202,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
-    pub remote_client: Option<RemoteTimelineClient>,
+    pub remote_client: RemoteTimelineClient,
     pub deletion_queue_client: DeletionQueueClient,
     pub timeline_get_throttle: Arc<
         crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
@@ -264,12 +269,14 @@ pub struct Timeline {
     // Atomic would be more appropriate here.
     last_freeze_ts: RwLock<Instant>,
 
+    pub(crate) standby_horizon: AtomicLsn,
+
     // WAL redo manager. `None` only for broken tenants.
     walredo_mgr: Option<Arc<super::WalRedoManager>>,
 
     /// Remote storage client.
     /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
-    pub remote_client: Option<Arc<RemoteTimelineClient>>,
+    pub remote_client: Arc<RemoteTimelineClient>,
 
     // What page versions do we hold in the repository? If we get a
     // request > last_record_lsn, we need to wait until we receive all
@@ -343,8 +350,8 @@ pub struct Timeline {
     // though let's keep them both for better error visibility.
     pub initdb_lsn: Lsn,
 
-    /// When did we last calculate the partitioning?
-    partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
+    /// When did we last calculate the partitioning? Make it pub to test cases.
+    pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
 
     /// Configuration: how often should the partitioning be recalculated.
     repartition_threshold: u64,
@@ -409,6 +416,12 @@ pub struct Timeline {
 
     /// Keep aux directory cache to avoid it's reconstruction on each update
     pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
+
+    /// Size estimator for aux file v2
+    pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
+
+    /// Indicate whether aux file v2 storage is enabled.
+    pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
 }
 
 pub struct WalReceiverInfo {
@@ -472,6 +485,11 @@ impl GcCutoffs {
     }
 }
 
+pub(crate) struct TimelineVisitOutcome {
+    completed_keyspace: KeySpace,
+    image_covered_keyspace: KeySpace,
+}
+
 /// An error happened in a get() operation.
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum PageReconstructError {
@@ -496,6 +514,13 @@ pub(crate) enum PageReconstructError {
     MissingKey(MissingKeyError),
 }
 
+impl GetVectoredError {
+    #[cfg(test)]
+    pub(crate) fn is_missing_key_error(&self) -> bool {
+        matches!(self, Self::MissingKey(_))
+    }
+}
+
 #[derive(Debug)]
 pub struct MissingKeyError {
     key: Key,
@@ -773,6 +798,11 @@ pub(crate) enum ShutdownMode {
     Hard,
 }
 
+struct ImageLayerCreationOutcome {
+    image: Option<ResidentLayer>,
+    next_start_key: Key,
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -874,7 +904,7 @@ impl Timeline {
                 }
 
                 let vectored_res = self
-                    .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
+                    .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
                     .await;
 
                 if self.conf.validate_vectored_get {
@@ -1019,7 +1049,12 @@ impl Timeline {
             }
             GetVectoredImpl::Vectored => {
                 let vectored_res = self
-                    .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx)
+                    .get_vectored_impl(
+                        keyspace.clone(),
+                        lsn,
+                        &mut ValuesReconstructState::new(),
+                        ctx,
+                    )
                     .await;
 
                 if self.conf.validate_vectored_get {
@@ -1107,7 +1142,7 @@ impl Timeline {
             .get_vectored_impl(
                 keyspace.clone(),
                 lsn,
-                ValuesReconstructState::default(),
+                &mut ValuesReconstructState::default(),
                 ctx,
             )
             .await;
@@ -1184,7 +1219,7 @@ impl Timeline {
         &self,
         keyspace: KeySpace,
         lsn: Lsn,
-        mut reconstruct_state: ValuesReconstructState,
+        reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
         let get_kind = if keyspace.total_raw_size() == 1 {
@@ -1196,7 +1231,7 @@ impl Timeline {
         let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
             .for_get_kind(get_kind)
             .start_timer();
-        self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
+        self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx)
             .await?;
         get_data_timer.stop_and_record();
 
@@ -1205,7 +1240,8 @@ impl Timeline {
             .start_timer();
         let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
         let layers_visited = reconstruct_state.get_layers_visited();
-        for (key, res) in reconstruct_state.keys {
+
+        for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
             match res {
                 Err(err) => {
                     results.insert(key, Err(err));
@@ -1220,11 +1256,17 @@ impl Timeline {
         }
         reconstruct_timer.stop_and_record();
 
-        // Note that this is an approximation. Tracking the exact number of layers visited
-        // per key requires virtually unbounded memory usage and is inefficient
-        // (i.e. segment tree tracking each range queried from a layer)
-        crate::metrics::VEC_READ_NUM_LAYERS_VISITED
-            .observe(layers_visited as f64 / results.len() as f64);
+        // For aux file keys (v1 or v2) the vectored read path does not return an error
+        // when they're missing. Instead they are omitted from the resulting btree
+        // (this is a requirement, not a bug). Skip updating the metric in these cases
+        // to avoid infinite results.
+        if !results.is_empty() {
+            // Note that this is an approximation. Tracking the exact number of layers visited
+            // per key requires virtually unbounded memory usage and is inefficient
+            // (i.e. segment tree tracking each range queried from a layer)
+            crate::metrics::VEC_READ_NUM_LAYERS_VISITED
+                .observe(layers_visited as f64 / results.len() as f64);
+        }
 
         Ok(results)
     }
@@ -1364,22 +1406,14 @@ impl Timeline {
     /// not validated with control plane yet.
     /// See [`Self::get_remote_consistent_lsn_visible`].
     pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
-        if let Some(remote_client) = &self.remote_client {
-            remote_client.remote_consistent_lsn_projected()
-        } else {
-            None
-        }
+        self.remote_client.remote_consistent_lsn_projected()
     }
 
     /// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
     /// i.e. a value of remote_consistent_lsn_projected which has undergone
     /// generation validation in the deletion queue.
     pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
-        if let Some(remote_client) = &self.remote_client {
-            remote_client.remote_consistent_lsn_visible()
-        } else {
-            None
-        }
+        self.remote_client.remote_consistent_lsn_visible()
     }
 
     /// The sum of the file size of all historic layers in the layer map.
@@ -1498,6 +1532,20 @@ impl Timeline {
         Ok(())
     }
 
+    /// Obtains a temporary lease blocking garbage collection for the given LSN
+    pub(crate) fn make_lsn_lease(
+        &self,
+        _lsn: Lsn,
+        _ctx: &RequestContext,
+    ) -> anyhow::Result<LsnLease> {
+        const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60);
+        let lease = LsnLease {
+            valid_until: SystemTime::now() + LEASE_LENGTH,
+        };
+        // TODO: dummy implementation
+        Ok(lease)
+    }
+
     /// Flush to disk all data that was written with the put_* functions
     #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
     pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
@@ -1652,7 +1700,7 @@ impl Timeline {
             return Ok(());
         }
 
-        match self.get_compaction_algorithm() {
+        match self.get_compaction_algorithm_settings().kind {
             CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
             CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
         }
@@ -1749,16 +1797,14 @@ impl Timeline {
             match self.freeze_and_flush().await {
                 Ok(_) => {
                     // drain the upload queue
-                    if let Some(client) = self.remote_client.as_ref() {
-                        // if we did not wait for completion here, it might be our shutdown process
-                        // didn't wait for remote uploads to complete at all, as new tasks can forever
-                        // be spawned.
-                        //
-                        // what is problematic is the shutting down of RemoteTimelineClient, because
-                        // obviously it does not make sense to stop while we wait for it, but what
-                        // about corner cases like s3 suddenly hanging up?
-                        client.shutdown().await;
-                    }
+                    // if we did not wait for completion here, it might be our shutdown process
+                    // didn't wait for remote uploads to complete at all, as new tasks can forever
+                    // be spawned.
+                    //
+                    // what is problematic is the shutting down of RemoteTimelineClient, because
+                    // obviously it does not make sense to stop while we wait for it, but what
+                    // about corner cases like s3 suddenly hanging up?
+                    self.remote_client.shutdown().await;
                 }
                 Err(e) => {
                     // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
@@ -1774,18 +1820,16 @@ impl Timeline {
 
         // Transition the remote_client into a state where it's only useful for timeline deletion.
         // (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
-        if let Some(remote_client) = self.remote_client.as_ref() {
-            remote_client.stop();
-            // As documented in remote_client.stop()'s doc comment, it's our responsibility
-            // to shut down the upload queue tasks.
-            // TODO: fix that, task management should be encapsulated inside remote_client.
-            task_mgr::shutdown_tasks(
-                Some(TaskKind::RemoteUploadTask),
-                Some(self.tenant_shard_id),
-                Some(self.timeline_id),
-            )
-            .await;
-        }
+        self.remote_client.stop();
+        // As documented in remote_client.stop()'s doc comment, it's our responsibility
+        // to shut down the upload queue tasks.
+        // TODO: fix that, task management should be encapsulated inside remote_client.
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::RemoteUploadTask),
+            Some(self.tenant_shard_id),
+            Some(self.timeline_id),
+        )
+        .await;
 
         // TODO: work toward making this a no-op. See this funciton's doc comment for more context.
         tracing::debug!("Waiting for tasks...");
@@ -1905,16 +1949,12 @@ impl Timeline {
     #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
     pub(crate) async fn download_layer(
         &self,
-        layer_file_name: &LayerFileName,
+        layer_file_name: &LayerName,
     ) -> anyhow::Result<Option<bool>> {
         let Some(layer) = self.find_layer(layer_file_name).await else {
             return Ok(None);
         };
 
-        if self.remote_client.is_none() {
-            return Ok(Some(false));
-        }
-
         layer.download().await?;
 
         Ok(Some(true))
@@ -1925,7 +1965,7 @@ impl Timeline {
     /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
     pub(crate) async fn evict_layer(
         &self,
-        layer_file_name: &LayerFileName,
+        layer_file_name: &LayerName,
     ) -> anyhow::Result<Option<bool>> {
         let _gate = self
             .gate
@@ -2056,12 +2096,14 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
-    fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
+    fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
         let tenant_conf = &self.tenant_conf.load();
         tenant_conf
             .tenant_conf
             .compaction_algorithm
-            .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
+            .as_ref()
+            .unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm)
+            .clone()
     }
 
     fn get_eviction_policy(&self) -> EvictionPolicy {
@@ -2138,6 +2180,7 @@ impl Timeline {
         resources: TimelineResources,
         pg_version: u32,
         state: TimelineState,
+        aux_file_policy: Option<AuxFilePolicy>,
         cancel: CancellationToken,
     ) -> Arc<Self> {
         let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2155,6 +2198,16 @@ impl Timeline {
         };
 
         Arc::new_cyclic(|myself| {
+            let metrics = TimelineMetrics::new(
+                &tenant_shard_id,
+                &timeline_id,
+                crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
+                    "mtime",
+                    evictions_low_residence_duration_metric_threshold,
+                ),
+            );
+            let aux_file_metrics = metrics.aux_file_size_gauge.clone();
+
             let mut result = Timeline {
                 conf,
                 tenant_conf,
@@ -2169,7 +2222,7 @@ impl Timeline {
                 walredo_mgr,
                 walreceiver: Mutex::new(None),
 
-                remote_client: resources.remote_client.map(Arc::new),
+                remote_client: Arc::new(resources.remote_client),
 
                 // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
                 last_record_lsn: SeqWait::new(RecordLsn {
@@ -2186,14 +2239,7 @@ impl Timeline {
                 ancestor_timeline: ancestor,
                 ancestor_lsn: metadata.ancestor_lsn(),
 
-                metrics: TimelineMetrics::new(
-                    &tenant_shard_id,
-                    &timeline_id,
-                    crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
-                        "mtime",
-                        evictions_low_residence_duration_metric_threshold,
-                    ),
-                ),
+                metrics,
 
                 query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
                     &tenant_shard_id,
@@ -2251,12 +2297,18 @@ impl Timeline {
                 compaction_lock: tokio::sync::Mutex::default(),
                 gc_lock: tokio::sync::Mutex::default(),
 
+                standby_horizon: AtomicLsn::new(0),
+
                 timeline_get_throttle: resources.timeline_get_throttle,
 
                 aux_files: tokio::sync::Mutex::new(AuxFilesState {
                     dir: None,
                     n_deltas: 0,
                 }),
+
+                aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
+
+                last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
             };
             result.repartition_threshold =
                 result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2387,13 +2439,13 @@ impl Timeline {
         index_part: Option<IndexPart>,
     ) -> anyhow::Result<()> {
         use init::{Decision::*, Discovered, DismissedLayer};
-        use LayerFileName::*;
+        use LayerName::*;
 
         let mut guard = self.layers.write().await;
 
         let timer = self.metrics.load_layer_map_histo.start_timer();
 
-        // Scan timeline directory and create ImageFileName and DeltaFilename
+        // Scan timeline directory and create ImageLayerName and DeltaFilename
         // structs representing all files on disk
         let timeline_path = self
             .conf
@@ -2421,10 +2473,6 @@ impl Timeline {
                             discovered_layers.push((layer_file_name, local_path, file_size));
                             continue;
                         }
-                        Discovered::Metadata => {
-                            warn!("found legacy metadata file, these should have been removed in load_tenant_config");
-                            continue;
-                        }
                         Discovered::IgnoredBackup => {
                             continue;
                         }
@@ -2463,33 +2511,37 @@ impl Timeline {
                 let mut needs_cleanup = Vec::new();
                 let mut total_physical_size = 0;
 
-                for (name, local_path, decision) in decided {
+                for (name, decision) in decided {
                     let decision = match decision {
                         Ok(UseRemote { local, remote }) => {
                             // Remote is authoritative, but we may still choose to retain
                             // the local file if the contents appear to match
-                            if local.file_size() == remote.file_size() {
+                            if local.metadata.file_size() == remote.file_size() {
                                 // Use the local file, but take the remote metadata so that we pick up
                                 // the correct generation.
-                                UseLocal(remote)
+                                UseLocal(LocalLayerFileMetadata {
+                                    metadata: remote,
+                                    local_path: local.local_path,
+                                })
                             } else {
-                                let local_path = local_path.as_ref().expect("Locally found layer must have path");
-                                init::cleanup_local_file_for_remote(local_path, &local, &remote)?;
+                                init::cleanup_local_file_for_remote(&local, &remote)?;
                                 UseRemote { local, remote }
                             }
                         }
                         Ok(decision) => decision,
                         Err(DismissedLayer::Future { local }) => {
-                            if local.is_some() {
-                                let local_path = local_path.expect("Locally found layer must have path");
-                                init::cleanup_future_layer(&local_path, &name, disk_consistent_lsn)?;
+                            if let Some(local) = local {
+                                init::cleanup_future_layer(
+                                    &local.local_path,
+                                    &name,
+                                    disk_consistent_lsn,
+                                )?;
                             }
                             needs_cleanup.push(name);
                             continue;
                         }
                         Err(DismissedLayer::LocalOnly(local)) => {
-                            let local_path = local_path.expect("Locally found layer must have path");
-                            init::cleanup_local_only_file(&local_path, &name, &local)?;
+                            init::cleanup_local_only_file(&name, &local)?;
                             // this file never existed remotely, we will have to do rework
                             continue;
                         }
@@ -2503,20 +2555,10 @@ impl Timeline {
                     tracing::debug!(layer=%name, ?decision, "applied");
 
                     let layer = match decision {
-                        UseLocal(m) => {
-                            total_physical_size += m.file_size();
-
-                            let local_path = local_path.unwrap_or_else(|| {
-                                local_layer_path(
-                                    conf,
-                                    &this.tenant_shard_id,
-                                    &this.timeline_id,
-                                    &name,
-                                    &m.generation,
-                                )
-                            });
-
-                            Layer::for_resident(conf, &this, local_path, name, m).drop_eviction_guard()
+                        UseLocal(local) => {
+                            total_physical_size += local.metadata.file_size();
+                            Layer::for_resident(conf, &this, local.local_path, name, local.metadata)
+                                .drop_eviction_guard()
                         }
                         Evicted(remote) | UseRemote { remote, .. } => {
                             Layer::for_evicted(conf, &this, name, remote)
@@ -2536,36 +2578,36 @@ impl Timeline {
 
         guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
 
-        if let Some(rtc) = self.remote_client.as_ref() {
-            rtc.schedule_layer_file_deletion(&needs_cleanup)?;
-            rtc.schedule_index_upload_for_file_changes()?;
-            // This barrier orders above DELETEs before any later operations.
-            // This is critical because code executing after the barrier might
-            // create again objects with the same key that we just scheduled for deletion.
-            // For example, if we just scheduled deletion of an image layer "from the future",
-            // later compaction might run again and re-create the same image layer.
-            // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
-            // "same" here means same key range and LSN.
-            //
-            // Without a barrier between above DELETEs and the re-creation's PUTs,
-            // the upload queue may execute the PUT first, then the DELETE.
-            // In our example, we will end up with an IndexPart referencing a non-existent object.
-            //
-            // 1. a future image layer is created and uploaded
-            // 2. ps restart
-            // 3. the future layer from (1) is deleted during load layer map
-            // 4. image layer is re-created and uploaded
-            // 5. deletion queue would like to delete (1) but actually deletes (4)
-            // 6. delete by name works as expected, but it now deletes the wrong (later) version
-            //
-            // See https://github.com/neondatabase/neon/issues/5878
-            //
-            // NB: generation numbers naturally protect against this because they disambiguate
-            //     (1) and (4)
-            rtc.schedule_barrier()?;
-            // Tenant::create_timeline will wait for these uploads to happen before returning, or
-            // on retry.
-        }
+        self.remote_client
+            .schedule_layer_file_deletion(&needs_cleanup)?;
+        self.remote_client
+            .schedule_index_upload_for_file_changes()?;
+        // This barrier orders above DELETEs before any later operations.
+        // This is critical because code executing after the barrier might
+        // create again objects with the same key that we just scheduled for deletion.
+        // For example, if we just scheduled deletion of an image layer "from the future",
+        // later compaction might run again and re-create the same image layer.
+        // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn.
+        // "same" here means same key range and LSN.
+        //
+        // Without a barrier between above DELETEs and the re-creation's PUTs,
+        // the upload queue may execute the PUT first, then the DELETE.
+        // In our example, we will end up with an IndexPart referencing a non-existent object.
+        //
+        // 1. a future image layer is created and uploaded
+        // 2. ps restart
+        // 3. the future layer from (1) is deleted during load layer map
+        // 4. image layer is re-created and uploaded
+        // 5. deletion queue would like to delete (1) but actually deletes (4)
+        // 6. delete by name works as expected, but it now deletes the wrong (later) version
+        //
+        // See https://github.com/neondatabase/neon/issues/5878
+        //
+        // NB: generation numbers naturally protect against this because they disambiguate
+        //     (1) and (4)
+        self.remote_client.schedule_barrier()?;
+        // Tenant::create_timeline will wait for these uploads to happen before returning, or
+        // on retry.
 
         info!(
             "loaded layer map with {} layers at {}, total physical size: {}",
@@ -2624,6 +2666,7 @@ impl Timeline {
                             // Don't make noise.
                         } else {
                             warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
+                            debug_assert!(false);
                         }
                     }
                 };
@@ -2997,10 +3040,10 @@ impl Timeline {
         }
     }
 
-    async fn find_layer(&self, layer_name: &LayerFileName) -> Option<Layer> {
+    async fn find_layer(&self, layer_name: &LayerName) -> Option<Layer> {
         let guard = self.layers.read().await;
         for historic_layer in guard.layer_map().iter_historic_layers() {
-            let historic_layer_name = historic_layer.filename();
+            let historic_layer_name = historic_layer.layer_name();
             if layer_name == &historic_layer_name {
                 return Some(guard.get_from_desc(&historic_layer));
             }
@@ -3017,9 +3060,6 @@ impl Timeline {
     /// should treat this as a cue to simply skip doing any heatmap uploading
     /// for this timeline.
     pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
-        // no point in heatmaps without remote client
-        let _remote_client = self.remote_client.as_ref()?;
-
         if !self.is_active() {
             return None;
         }
@@ -3030,7 +3070,7 @@ impl Timeline {
             let last_activity_ts = layer.access_stats().latest_activity_or_now();
 
             HeatMapLayer::new(
-                layer.layer_desc().filename(),
+                layer.layer_desc().layer_name(),
                 (&layer.metadata()).into(),
                 last_activity_ts,
             )
@@ -3040,6 +3080,15 @@ impl Timeline {
 
         Some(HeatMapTimeline::new(self.timeline_id, layers))
     }
+
+    /// Returns true if the given lsn is or was an ancestor branchpoint.
+    pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool {
+        // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original
+        // branchpoint in the value in IndexPart::lineage
+        self.ancestor_lsn == lsn
+            || (self.ancestor_lsn == Lsn::INVALID
+                && self.remote_client.is_previous_ancestor_lsn(lsn))
+    }
 }
 
 type TraversalId = Arc<str>;
@@ -3177,7 +3226,7 @@ impl Timeline {
             if let Some(open_layer) = &layers.open_layer {
                 let start_lsn = open_layer.get_lsn_range().start;
                 if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display());
+                    //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display());
                     // Get all the data needed to reconstruct the page version from this layer.
                     // But if we have an older cached page image, no need to go past that.
                     let lsn_floor = max(cached_lsn + 1, start_lsn);
@@ -3206,7 +3255,7 @@ impl Timeline {
             for frozen_layer in layers.frozen_layers.iter().rev() {
                 let start_lsn = frozen_layer.get_lsn_range().start;
                 if cont_lsn > start_lsn {
-                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
+                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display());
                     let lsn_floor = max(cached_lsn + 1, start_lsn);
 
                     let frozen_layer = frozen_layer.clone();
@@ -3283,12 +3332,15 @@ impl Timeline {
 
         let mut cont_lsn = Lsn(request_lsn.0 + 1);
 
-        loop {
+        let missing_keyspace = loop {
             if self.cancel.is_cancelled() {
                 return Err(GetVectoredError::Cancelled);
             }
 
-            let completed = Self::get_vectored_reconstruct_data_timeline(
+            let TimelineVisitOutcome {
+                completed_keyspace: completed,
+                image_covered_keyspace,
+            } = Self::get_vectored_reconstruct_data_timeline(
                 timeline,
                 keyspace.clone(),
                 cont_lsn,
@@ -3307,12 +3359,31 @@ impl Timeline {
                 ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
             });
 
-            // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look
-            // into ancestor timelines). TODO: is there any other metadata which we want to inherit?
-            if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() {
-                break;
+            // Keyspace is fully retrieved
+            if keyspace.is_empty() {
+                break None;
             }
 
+            // Not fully retrieved but no ancestor timeline.
+            if timeline.ancestor_timeline.is_none() {
+                break Some(keyspace);
+            }
+
+            // Now we see if there are keys covered by the image layer but does not exist in the
+            // image layer, which means that the key does not exist.
+
+            // The block below will stop the vectored search if any of the keys encountered an image layer
+            // which did not contain a snapshot for said key. Since we have already removed all completed
+            // keys from `keyspace`, we expect there to be no overlap between it and the image covered key
+            // space. If that's not the case, we had at least one key encounter a gap in the image layer
+            // and stop the search as a result of that.
+            let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            if !removed.is_empty() {
+                break Some(removed);
+            }
+            // If we reached this point, `remove_overlapping_with` should not have made any change to the
+            // keyspace.
+
             // Take the min to avoid reconstructing a page with data newer than request Lsn.
             cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
             timeline_owned = timeline
@@ -3320,14 +3391,14 @@ impl Timeline {
                 .await
                 .map_err(GetVectoredError::GetReadyAncestorError)?;
             timeline = &*timeline_owned;
-        }
+        };
 
-        if keyspace.total_raw_size() != 0 {
+        if let Some(missing_keyspace) = missing_keyspace {
             return Err(GetVectoredError::MissingKey(MissingKeyError {
-                key: keyspace.start().unwrap(), /* better if we can store the full keyspace */
+                key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
                 shard: self
                     .shard_identity
-                    .get_shard_number(&keyspace.start().unwrap()),
+                    .get_shard_number(&missing_keyspace.start().unwrap()),
                 cont_lsn,
                 request_lsn,
                 ancestor_lsn: Some(timeline.ancestor_lsn),
@@ -3352,6 +3423,9 @@ impl Timeline {
     ///
     /// At each iteration pop the top of the fringe (the layer with the highest Lsn)
     /// and get all the required reconstruct data from the layer in one go.
+    ///
+    /// Returns the completed keyspace and the keyspaces with image coverage. The caller
+    /// decides how to deal with these two keyspaces.
     async fn get_vectored_reconstruct_data_timeline(
         timeline: &Timeline,
         keyspace: KeySpace,
@@ -3359,20 +3433,27 @@ impl Timeline {
         reconstruct_state: &mut ValuesReconstructState,
         cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> Result<KeySpace, GetVectoredError> {
+    ) -> Result<TimelineVisitOutcome, GetVectoredError> {
         let mut unmapped_keyspace = keyspace.clone();
         let mut fringe = LayerFringe::new();
 
         let mut completed_keyspace = KeySpace::default();
+        let mut image_covered_keyspace = KeySpaceRandomAccum::new();
 
         loop {
             if cancel.is_cancelled() {
                 return Err(GetVectoredError::Cancelled);
             }
 
-            let keys_done_last_step = reconstruct_state.consume_done_keys();
+            let (keys_done_last_step, keys_with_image_coverage) =
+                reconstruct_state.consume_done_keys();
             unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
             completed_keyspace.merge(&keys_done_last_step);
+            if let Some(keys_with_image_coverage) = keys_with_image_coverage {
+                unmapped_keyspace
+                    .remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone()));
+                image_covered_keyspace.add_range(keys_with_image_coverage);
+            }
 
             // Do not descent any further if the last layer we visited
             // completed all keys in the keyspace it inspected. This is not
@@ -3444,13 +3525,16 @@ impl Timeline {
                 unmapped_keyspace = keyspace_to_read;
                 cont_lsn = next_cont_lsn;
 
-                reconstruct_state.on_layer_visited();
+                reconstruct_state.on_layer_visited(&layer_to_read);
             } else {
                 break;
             }
         }
 
-        Ok(completed_keyspace)
+        Ok(TimelineVisitOutcome {
+            completed_keyspace,
+            image_covered_keyspace: image_covered_keyspace.consume_keyspace(),
+        })
     }
 
     /// # Cancel-safety
@@ -3551,7 +3635,11 @@ impl Timeline {
     ///
     /// Get a handle to the latest layer for appending.
     ///
-    async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result<Arc<InMemoryLayer>> {
+    async fn get_layer_for_write(
+        &self,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Arc<InMemoryLayer>> {
         let mut guard = self.layers.write().await;
         let layer = guard
             .get_layer_for_write(
@@ -3560,6 +3648,7 @@ impl Timeline {
                 self.conf,
                 self.timeline_id,
                 self.tenant_shard_id,
+                ctx,
             )
             .await?;
         Ok(layer)
@@ -3824,8 +3913,8 @@ impl Timeline {
                 );
                 self.create_delta_layer(
                     &frozen_layer,
-                    ctx,
                     Some(metadata_keyspace.0.ranges[0].clone()),
+                    ctx,
                 )
                 .await?
             } else {
@@ -3854,7 +3943,7 @@ impl Timeline {
             // Normal case, write out a L0 delta layer file.
             // `create_delta_layer` will not modify the layer map.
             // We will remove frozen layer and add delta layer in one atomic operation later.
-            let Some(layer) = self.create_delta_layer(&frozen_layer, ctx, None).await? else {
+            let Some(layer) = self.create_delta_layer(&frozen_layer, None, ctx).await? else {
                 panic!("delta layer cannot be empty if no filter is applied");
             };
             (
@@ -3953,29 +4042,23 @@ impl Timeline {
             x.unwrap()
         ));
 
-        if let Some(remote_client) = &self.remote_client {
-            for layer in layers_to_upload {
-                remote_client.schedule_layer_file_upload(layer)?;
-            }
-            remote_client.schedule_index_upload_for_metadata_update(&update)?;
+        for layer in layers_to_upload {
+            self.remote_client.schedule_layer_file_upload(layer)?;
         }
+        self.remote_client
+            .schedule_index_upload_for_metadata_update(&update)?;
 
         Ok(())
     }
 
     pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
-        if let Some(remote_client) = &self.remote_client {
-            remote_client
-                .preserve_initdb_archive(
-                    &self.tenant_shard_id.tenant_id,
-                    &self.timeline_id,
-                    &self.cancel,
-                )
-                .await?;
-        } else {
-            bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id);
-        }
-        Ok(())
+        self.remote_client
+            .preserve_initdb_archive(
+                &self.tenant_shard_id.tenant_id,
+                &self.timeline_id,
+                &self.cancel,
+            )
+            .await
     }
 
     // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
@@ -3983,8 +4066,8 @@ impl Timeline {
     async fn create_delta_layer(
         self: &Arc<Self>,
         frozen_layer: &Arc<InMemoryLayer>,
-        ctx: &RequestContext,
         key_range: Option<Range<Key>>,
+        ctx: &RequestContext,
     ) -> anyhow::Result<Option<ResidentLayer>> {
         let self_clone = Arc::clone(self);
         let frozen_layer = Arc::clone(frozen_layer);
@@ -4007,6 +4090,7 @@ impl Timeline {
                 &self_clone
                     .conf
                     .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id),
+                &ctx,
             )
             .await
             .fatal_err("VirtualFile::open for timeline dir fsync");
@@ -4130,6 +4214,176 @@ impl Timeline {
         false
     }
 
+    /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large,
+    /// so that at most one image layer will be produced from this function.
+    async fn create_image_layer_for_rel_blocks(
+        self: &Arc<Self>,
+        partition: &KeySpace,
+        mut image_layer_writer: ImageLayerWriter,
+        lsn: Lsn,
+        ctx: &RequestContext,
+        img_range: Range<Key>,
+        start: Key,
+    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
+        let mut wrote_keys = false;
+
+        let mut key_request_accum = KeySpaceAccum::new();
+        for range in &partition.ranges {
+            let mut key = range.start;
+            while key < range.end {
+                // Decide whether to retain this key: usually we do, but sharded tenants may
+                // need to drop keys that don't belong to them.  If we retain the key, add it
+                // to `key_request_accum` for later issuing a vectored get
+                if self.shard_identity.is_key_disposable(&key) {
+                    debug!(
+                        "Dropping key {} during compaction (it belongs on shard {:?})",
+                        key,
+                        self.shard_identity.get_shard_number(&key)
+                    );
+                } else {
+                    key_request_accum.add_key(key);
+                }
+
+                let last_key_in_range = key.next() == range.end;
+                key = key.next();
+
+                // Maybe flush `key_rest_accum`
+                if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
+                    || (last_key_in_range && key_request_accum.raw_size() > 0)
+                {
+                    let results = self
+                        .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
+                        .await?;
+
+                    for (img_key, img) in results {
+                        let img = match img {
+                            Ok(img) => img,
+                            Err(err) => {
+                                // If we fail to reconstruct a VM or FSM page, we can zero the
+                                // page without losing any actual user data. That seems better
+                                // than failing repeatedly and getting stuck.
+                                //
+                                // We had a bug at one point, where we truncated the FSM and VM
+                                // in the pageserver, but the Postgres didn't know about that
+                                // and continued to generate incremental WAL records for pages
+                                // that didn't exist in the pageserver. Trying to replay those
+                                // WAL records failed to find the previous image of the page.
+                                // This special case allows us to recover from that situation.
+                                // See https://github.com/neondatabase/neon/issues/2601.
+                                //
+                                // Unfortunately we cannot do this for the main fork, or for
+                                // any metadata keys, keys, as that would lead to actual data
+                                // loss.
+                                if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) {
+                                    warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
+                                    ZERO_PAGE.clone()
+                                } else {
+                                    return Err(CreateImageLayersError::PageReconstructError(err));
+                                }
+                            }
+                        };
+
+                        // Write all the keys we just read into our new image layer.
+                        image_layer_writer.put_image(img_key, img, ctx).await?;
+                        wrote_keys = true;
+                    }
+                }
+            }
+        }
+
+        if wrote_keys {
+            // Normal path: we have written some data into the new image layer for this
+            // partition, so flush it to disk.
+            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            Ok(ImageLayerCreationOutcome {
+                image: Some(image_layer),
+                next_start_key: img_range.end,
+            })
+        } else {
+            // Special case: the image layer may be empty if this is a sharded tenant and the
+            // partition does not cover any keys owned by this shard.  In this case, to ensure
+            // we don't leave gaps between image layers, leave `start` where it is, so that the next
+            // layer we write will cover the key range that we just scanned.
+            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+            Ok(ImageLayerCreationOutcome {
+                image: None,
+                next_start_key: start,
+            })
+        }
+    }
+
+    /// Create an image layer for metadata keys. This function produces one image layer for all metadata
+    /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it
+    /// would not be too large to fit in a single image layer.
+    #[allow(clippy::too_many_arguments)]
+    async fn create_image_layer_for_metadata_keys(
+        self: &Arc<Self>,
+        partition: &KeySpace,
+        mut image_layer_writer: ImageLayerWriter,
+        lsn: Lsn,
+        ctx: &RequestContext,
+        img_range: Range<Key>,
+        mode: ImageLayerCreationMode,
+    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
+        assert!(!matches!(mode, ImageLayerCreationMode::Initial));
+
+        // Metadata keys image layer creation.
+        let mut reconstruct_state = ValuesReconstructState::default();
+        let data = self
+            .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
+            .await?;
+        let (data, total_kb_retrieved, total_key_retrieved) = {
+            let mut new_data = BTreeMap::new();
+            let mut total_kb_retrieved = 0;
+            let mut total_key_retrieved = 0;
+            for (k, v) in data {
+                let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
+                total_kb_retrieved += KEY_SIZE + v.len();
+                total_key_retrieved += 1;
+                new_data.insert(k, v);
+            }
+            (new_data, total_kb_retrieved / 1024, total_key_retrieved)
+        };
+        let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
+
+        let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
+        info!(
+            "generate image layers for metadata keys: trigger_generation={trigger_generation}, \
+                delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
+                total_key_retrieved={total_key_retrieved}"
+        );
+        if !trigger_generation && mode == ImageLayerCreationMode::Try {
+            return Ok(ImageLayerCreationOutcome {
+                image: None,
+                next_start_key: img_range.end,
+            });
+        }
+        let has_keys = !data.is_empty();
+        for (k, v) in data {
+            // Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get
+            // considers this situation properly.
+            // if v.is_empty() {
+            //     continue;
+            // }
+
+            // No need to handle sharding b/c metadata keys are always on the 0-th shard.
+
+            // TODO: split image layers to avoid too large layer files. Too large image files are not handled
+            // on the normal data path either.
+            image_layer_writer.put_image(k, v, ctx).await?;
+        }
+        Ok(ImageLayerCreationOutcome {
+            image: if has_keys {
+                let image_layer = image_layer_writer.finish(self, ctx).await?;
+                Some(image_layer)
+            } else {
+                tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+                None
+            },
+            next_start_key: img_range.end,
+        })
+    }
+
     #[tracing::instrument(skip_all, fields(%lsn, %mode))]
     async fn create_image_layers(
         self: &Arc<Timeline>,
@@ -4171,19 +4425,17 @@ impl Timeline {
 
         for partition in partitioning.parts.iter() {
             let img_range = start..partition.ranges.last().unwrap().end;
-
-            if partition.overlaps(&Key::metadata_key_range()) {
-                // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
-                // rather big change. Keep this patch small for now.
-                match mode {
-                    ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
-                        // skip image layer creation anyways for metadata keys.
-                        start = img_range.end;
-                        continue;
-                    }
-                    ImageLayerCreationMode::Initial => {
-                        return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
-                    }
+            let compact_metadata = partition.overlaps(&Key::metadata_key_range());
+            if compact_metadata {
+                for range in &partition.ranges {
+                    assert!(
+                        range.start.field1 >= METADATA_KEY_BEGIN_PREFIX
+                            && range.end.field1 <= METADATA_KEY_END_PREFIX,
+                        "metadata keys must be partitioned separately"
+                    );
+                }
+                if mode == ImageLayerCreationMode::Initial {
+                    return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
                 }
             } else if let ImageLayerCreationMode::Try = mode {
                 // check_for_image_layers = false -> skip
@@ -4194,12 +4446,13 @@ impl Timeline {
                 }
             }
 
-            let mut image_layer_writer = ImageLayerWriter::new(
+            let image_layer_writer = ImageLayerWriter::new(
                 self.conf,
                 self.timeline_id,
                 self.tenant_shard_id,
                 &img_range,
                 lsn,
+                ctx,
             )
             .await?;
 
@@ -4209,87 +4462,39 @@ impl Timeline {
                 )))
             });
 
-            let mut wrote_keys = false;
+            if !compact_metadata {
+                let ImageLayerCreationOutcome {
+                    image,
+                    next_start_key,
+                } = self
+                    .create_image_layer_for_rel_blocks(
+                        partition,
+                        image_layer_writer,
+                        lsn,
+                        ctx,
+                        img_range,
+                        start,
+                    )
+                    .await?;
 
-            let mut key_request_accum = KeySpaceAccum::new();
-            for range in &partition.ranges {
-                let mut key = range.start;
-                while key < range.end {
-                    // Decide whether to retain this key: usually we do, but sharded tenants may
-                    // need to drop keys that don't belong to them.  If we retain the key, add it
-                    // to `key_request_accum` for later issuing a vectored get
-                    if self.shard_identity.is_key_disposable(&key) {
-                        debug!(
-                            "Dropping key {} during compaction (it belongs on shard {:?})",
-                            key,
-                            self.shard_identity.get_shard_number(&key)
-                        );
-                    } else {
-                        key_request_accum.add_key(key);
-                    }
-
-                    let last_key_in_range = key.next() == range.end;
-                    key = key.next();
-
-                    // Maybe flush `key_rest_accum`
-                    if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
-                        || last_key_in_range
-                    {
-                        let results = self
-                            .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
-                            .await?;
-
-                        for (img_key, img) in results {
-                            let img = match img {
-                                Ok(img) => img,
-                                Err(err) => {
-                                    // If we fail to reconstruct a VM or FSM page, we can zero the
-                                    // page without losing any actual user data. That seems better
-                                    // than failing repeatedly and getting stuck.
-                                    //
-                                    // We had a bug at one point, where we truncated the FSM and VM
-                                    // in the pageserver, but the Postgres didn't know about that
-                                    // and continued to generate incremental WAL records for pages
-                                    // that didn't exist in the pageserver. Trying to replay those
-                                    // WAL records failed to find the previous image of the page.
-                                    // This special case allows us to recover from that situation.
-                                    // See https://github.com/neondatabase/neon/issues/2601.
-                                    //
-                                    // Unfortunately we cannot do this for the main fork, or for
-                                    // any metadata keys, keys, as that would lead to actual data
-                                    // loss.
-                                    if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key)
-                                    {
-                                        warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
-                                        ZERO_PAGE.clone()
-                                    } else {
-                                        return Err(CreateImageLayersError::PageReconstructError(
-                                            err,
-                                        ));
-                                    }
-                                }
-                            };
-
-                            // Write all the keys we just read into our new image layer.
-                            image_layer_writer.put_image(img_key, img, ctx).await?;
-                            wrote_keys = true;
-                        }
-                    }
-                }
-            }
-
-            if wrote_keys {
-                // Normal path: we have written some data into the new image layer for this
-                // partition, so flush it to disk.
-                start = img_range.end;
-                let image_layer = image_layer_writer.finish(self, ctx).await?;
-                image_layers.push(image_layer);
+                start = next_start_key;
+                image_layers.extend(image);
             } else {
-                // Special case: the image layer may be empty if this is a sharded tenant and the
-                // partition does not cover any keys owned by this shard.  In this case, to ensure
-                // we don't leave gaps between image layers, leave `start` where it is, so that the next
-                // layer we write will cover the key range that we just scanned.
-                tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+                let ImageLayerCreationOutcome {
+                    image,
+                    next_start_key,
+                } = self
+                    .create_image_layer_for_metadata_keys(
+                        partition,
+                        image_layer_writer,
+                        lsn,
+                        ctx,
+                        img_range,
+                        mode,
+                    )
+                    .await?;
+                start = next_start_key;
+                image_layers.extend(image);
             }
         }
 
@@ -4304,6 +4509,7 @@ impl Timeline {
                 &self
                     .conf
                     .timeline_path(&self.tenant_shard_id, &self.timeline_id),
+                ctx,
             )
             .await
             .fatal_err("VirtualFile::open for timeline dir fsync");
@@ -4328,6 +4534,16 @@ impl Timeline {
     /// this Timeline is shut down.  Calling this function will cause the initial
     /// logical size calculation to skip waiting for the background jobs barrier.
     pub(crate) async fn await_initial_logical_size(self: Arc<Self>) {
+        if !self.shard_identity.is_shard_zero() {
+            // We don't populate logical size on shard >0: skip waiting for it.
+            return;
+        }
+
+        if self.remote_client.is_deleting() {
+            // The timeline was created in a deletion-resume state, we don't expect logical size to be populated
+            return;
+        }
+
         if let Some(await_bg_cancel) = self
             .current_logical_size
             .cancel_wait_for_background_loop_concurrency_limit_semaphore
@@ -4339,9 +4555,10 @@ impl Timeline {
             // the logical size cancellation to skip the concurrency limit semaphore.
             // TODO: this is an unexpected case.  We should restructure so that it
             // can't happen.
-            tracing::info!(
+            tracing::warn!(
                 "await_initial_logical_size: can't get semaphore cancel token, skipping"
             );
+            debug_assert!(false);
         }
 
         tokio::select!(
@@ -4357,7 +4574,6 @@ impl Timeline {
     /// - has an ancestor to detach from
     /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
     /// a technical requirement
-    /// - has prev_lsn in remote storage (temporary restriction)
     ///
     /// After the operation has been started, it cannot be canceled. Upon restart it needs to be
     /// polled again until completion.
@@ -4392,6 +4608,14 @@ impl Timeline {
     ) -> Result<Vec<TimelineId>, anyhow::Error> {
         detach_ancestor::complete(self, tenant, prepared, ctx).await
     }
+
+    /// Switch aux file policy and schedule upload to the index part.
+    pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> {
+        self.last_aux_file_policy.store(Some(policy));
+        self.remote_client
+            .schedule_index_upload_for_aux_file_policy_update(Some(policy))?;
+        Ok(())
+    }
 }
 
 /// Top-level failure to compact.
@@ -4491,9 +4715,8 @@ impl Timeline {
         // deletion will happen later, the layer file manager calls garbage_collect_on_drop
         guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics);
 
-        if let Some(remote_client) = self.remote_client.as_ref() {
-            remote_client.schedule_compaction_update(&remove_layers, new_deltas)?;
-        }
+        self.remote_client
+            .schedule_compaction_update(&remove_layers, new_deltas)?;
 
         drop_wlock(guard);
 
@@ -4511,9 +4734,8 @@ impl Timeline {
 
         let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
 
-        if let Some(remote_client) = self.remote_client.as_ref() {
-            remote_client.schedule_compaction_update(&drop_layers, &upload_layers)?;
-        }
+        self.remote_client
+            .schedule_compaction_update(&drop_layers, &upload_layers)?;
 
         Ok(())
     }
@@ -4523,16 +4745,14 @@ impl Timeline {
         self: &Arc<Self>,
         new_images: impl IntoIterator<Item = ResidentLayer>,
     ) -> anyhow::Result<()> {
-        let Some(remote_client) = &self.remote_client else {
-            return Ok(());
-        };
         for layer in new_images {
-            remote_client.schedule_layer_file_upload(layer)?;
+            self.remote_client.schedule_layer_file_upload(layer)?;
         }
         // should any new image layer been created, not uploading index_part will
         // result in a mismatch between remote_physical_size and layermap calculated
         // size, which will fail some tests, but should not be an issue otherwise.
-        remote_client.schedule_index_upload_for_file_changes()?;
+        self.remote_client
+            .schedule_index_upload_for_file_changes()?;
         Ok(())
     }
 
@@ -4630,11 +4850,9 @@ impl Timeline {
     pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
         // this is most likely the background tasks, but it might be the spawned task from
         // immediate_gc
-        let cancel = crate::task_mgr::shutdown_token();
         let _g = tokio::select! {
             guard = self.gc_lock.lock() => guard,
             _ = self.cancel.cancelled() => return Ok(GcResult::default()),
-            _ = cancel.cancelled() => return Ok(GcResult::default()),
         };
         let timer = self.metrics.garbage_collect_histo.start_timer();
 
@@ -4654,7 +4872,32 @@ impl Timeline {
             (horizon_cutoff, pitr_cutoff, retain_lsns)
         };
 
-        let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
+        let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
+        let standby_horizon = self.standby_horizon.load();
+        // Hold GC for the standby, but as a safety guard do it only within some
+        // reasonable lag.
+        if standby_horizon != Lsn::INVALID {
+            if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) {
+                const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB
+                if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG {
+                    new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff);
+                    trace!("holding off GC for standby apply LSN {}", standby_horizon);
+                } else {
+                    warn!(
+                        "standby is lagging for more than {}MB, not holding gc for it",
+                        MAX_ALLOWED_STANDBY_LAG / 1024 / 1024
+                    )
+                }
+            }
+        }
+
+        // Reset standby horizon to ignore it if it is not updated till next GC.
+        // It is an easy way to unset it when standby disappears without adding
+        // more conf options.
+        self.standby_horizon.store(Lsn::INVALID);
+        self.metrics
+            .standby_horizon_gauge
+            .set(Lsn::INVALID.0 as i64);
 
         let res = self
             .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff)
@@ -4731,7 +4974,7 @@ impl Timeline {
             if l.get_lsn_range().end > horizon_cutoff {
                 debug!(
                     "keeping {} because it's newer than horizon_cutoff {}",
-                    l.filename(),
+                    l.layer_name(),
                     horizon_cutoff,
                 );
                 result.layers_needed_by_cutoff += 1;
@@ -4742,7 +4985,7 @@ impl Timeline {
             if l.get_lsn_range().end > pitr_cutoff {
                 debug!(
                     "keeping {} because it's newer than pitr_cutoff {}",
-                    l.filename(),
+                    l.layer_name(),
                     pitr_cutoff,
                 );
                 result.layers_needed_by_pitr += 1;
@@ -4761,7 +5004,7 @@ impl Timeline {
                 if &l.get_lsn_range().start <= retain_lsn {
                     debug!(
                         "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
-                        l.filename(),
+                        l.layer_name(),
                         retain_lsn,
                         l.is_incremental(),
                     );
@@ -4792,7 +5035,7 @@ impl Timeline {
             if !layers
                 .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
             {
-                debug!("keeping {} because it is the latest layer", l.filename());
+                debug!("keeping {} because it is the latest layer", l.layer_name());
                 result.layers_not_updated += 1;
                 continue 'outer;
             }
@@ -4800,7 +5043,7 @@ impl Timeline {
             // We didn't find any reason to keep this file, so remove it.
             debug!(
                 "garbage collecting {} is_dropped: xx is_incremental: {}",
-                l.filename(),
+                l.layer_name(),
                 l.is_incremental(),
             );
             layers_to_remove.push(l);
@@ -4820,9 +5063,7 @@ impl Timeline {
 
             result.layers_removed = gc_layers.len() as u64;
 
-            if let Some(remote_client) = self.remote_client.as_ref() {
-                remote_client.schedule_gc_update(&gc_layers)?;
-            }
+            self.remote_client.schedule_gc_update(&gc_layers)?;
 
             guard.finish_gc_timeline(&gc_layers);
 
@@ -5206,7 +5447,7 @@ impl<'a> TimelineWriter<'a> {
         let buf_size: u64 = buf.len().try_into().expect("oversized value buf");
 
         let action = self.get_open_layer_action(lsn, buf_size);
-        let layer = self.handle_open_layer_action(lsn, action).await?;
+        let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
         let res = layer.put_value(key, lsn, &buf, ctx).await;
 
         if res.is_ok() {
@@ -5229,14 +5470,15 @@ impl<'a> TimelineWriter<'a> {
         &mut self,
         at: Lsn,
         action: OpenLayerAction,
+        ctx: &RequestContext,
     ) -> anyhow::Result<&Arc<InMemoryLayer>> {
         match action {
             OpenLayerAction::Roll => {
                 let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap();
                 self.roll_layer(freeze_at).await?;
-                self.open_layer(at).await?;
+                self.open_layer(at, ctx).await?;
             }
-            OpenLayerAction::Open => self.open_layer(at).await?,
+            OpenLayerAction::Open => self.open_layer(at, ctx).await?,
             OpenLayerAction::None => {
                 assert!(self.write_guard.is_some());
             }
@@ -5245,8 +5487,8 @@ impl<'a> TimelineWriter<'a> {
         Ok(&self.write_guard.as_ref().unwrap().open_layer)
     }
 
-    async fn open_layer(&mut self, at: Lsn) -> anyhow::Result<()> {
-        let layer = self.tl.get_layer_for_write(at).await?;
+    async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> {
+        let layer = self.tl.get_layer_for_write(at, ctx).await?;
         let initial_size = layer.size().await?;
 
         let last_freeze_at = self.last_freeze_at.load();
@@ -5323,10 +5565,14 @@ impl<'a> TimelineWriter<'a> {
         Ok(())
     }
 
-    pub(crate) async fn delete_batch(&mut self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+    pub(crate) async fn delete_batch(
+        &mut self,
+        batch: &[(Range<Key>, Lsn)],
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
         if let Some((_, lsn)) = batch.first() {
             let action = self.get_open_layer_action(*lsn, 0);
-            let layer = self.handle_open_layer_action(*lsn, action).await?;
+            let layer = self.handle_open_layer_action(*lsn, action, ctx).await?;
             layer.put_tombstones(batch).await?;
         }
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index e83878b8fb..2eff469591 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -116,9 +116,13 @@ impl Timeline {
 
                 // 3. Create new image layers for partitions that have been modified
                 // "enough".
-                let dense_layers = self
+                let mut partitioning = dense_partitioning;
+                partitioning
+                    .parts
+                    .extend(sparse_partitioning.into_dense().parts);
+                let image_layers = self
                     .create_image_layers(
-                        &dense_partitioning,
+                        &partitioning,
                         lsn,
                         if flags.contains(CompactFlags::ForceImageLayerCreation) {
                             ImageLayerCreationMode::Force
@@ -130,24 +134,8 @@ impl Timeline {
                     .await
                     .map_err(anyhow::Error::from)?;
 
-                // For now, nothing will be produced...
-                let sparse_layers = self
-                    .create_image_layers(
-                        &sparse_partitioning.clone().into_dense(),
-                        lsn,
-                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
-                            ImageLayerCreationMode::Force
-                        } else {
-                            ImageLayerCreationMode::Try
-                        },
-                        &image_ctx,
-                    )
-                    .await
-                    .map_err(anyhow::Error::from)?;
-                assert!(sparse_layers.is_empty());
-
-                self.upload_new_image_layers(dense_layers)?;
-                dense_partitioning.parts.len()
+                self.upload_new_image_layers(image_layers)?;
+                partitioning.parts.len()
             }
             Err(err) => {
                 // no partitioning? This is normal, if the timeline was just created
@@ -295,13 +283,11 @@ impl Timeline {
         // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
         self.rewrite_layers(replace_layers, drop_layers).await?;
 
-        if let Some(remote_client) = self.remote_client.as_ref() {
-            // We wait for all uploads to complete before finishing this compaction stage.  This is not
-            // necessary for correctness, but it simplifies testing, and avoids proceeding with another
-            // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
-            // load.
-            remote_client.wait_completion().await?;
-        }
+        // We wait for all uploads to complete before finishing this compaction stage.  This is not
+        // necessary for correctness, but it simplifies testing, and avoids proceeding with another
+        // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
+        // load.
+        self.remote_client.wait_completion().await?;
 
         Ok(())
     }
@@ -501,8 +487,11 @@ impl Timeline {
 
         for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
             if let Some(prev_key) = prev {
-                // just first fast filter
-                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
+                // just first fast filter, do not create hole entries for metadata keys. The last hole in the
+                // compaction is the gap between data key and metadata keys.
+                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
+                    && !Key::is_metadata_key(&prev_key)
+                {
                     let key_range = prev_key..next_key;
                     // Measuring hole by just subtraction of i128 representation of key range boundaries
                     // has not so much sense, because largest holes will corresponds field1/field2 changes.
@@ -700,6 +689,7 @@ impl Timeline {
                                 debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
                                 lsn_range.clone()
                             },
+                            ctx,
                         )
                         .await?,
                     );
@@ -755,6 +745,7 @@ impl Timeline {
                 &self
                     .conf
                     .timeline_path(&self.tenant_shard_id, &self.timeline_id),
+                ctx,
             )
             .await
             .fatal_err("VirtualFile::open for timeline dir fsync");
@@ -1093,6 +1084,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
             self.timeline.tenant_shard_id,
             key_range.start,
             lsn_range.clone(),
+            ctx,
         )
         .await?;
 
@@ -1167,6 +1159,7 @@ impl TimelineAdaptor {
             self.timeline.tenant_shard_id,
             key_range,
             lsn,
+            ctx,
         )
         .await?;
 
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index d8701be170..b5dfc86e77 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -26,19 +26,21 @@ use super::{Timeline, TimelineResources};
 /// during attach or pageserver restart.
 /// See comment in persist_index_part_with_deleted_flag.
 async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
-    if let Some(remote_client) = timeline.remote_client.as_ref() {
-        match remote_client.persist_index_part_with_deleted_flag().await {
-            // If we (now, or already) marked it successfully as deleted, we can proceed
-            Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
-            // Bail out otherwise
-            //
-            // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
-            // two tasks from performing the deletion at the same time. The first task
-            // that starts deletion should run it to completion.
-            Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
-            | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
-                return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
-            }
+    match timeline
+        .remote_client
+        .persist_index_part_with_deleted_flag()
+        .await
+    {
+        // If we (now, or already) marked it successfully as deleted, we can proceed
+        Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
+        // Bail out otherwise
+        //
+        // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
+        // two tasks from performing the deletion at the same time. The first task
+        // that starts deletion should run it to completion.
+        Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
+        | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
+            return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
         }
     }
     Ok(())
@@ -117,11 +119,11 @@ pub(super) async fn delete_local_timeline_directory(
 
 /// Removes remote layers and an index file after them.
 async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
-    if let Some(remote_client) = &timeline.remote_client {
-        remote_client.delete_all().await.context("delete_all")?
-    };
-
-    Ok(())
+    timeline
+        .remote_client
+        .delete_all()
+        .await
+        .context("delete_all")
 }
 
 // This function removs remaining traces of a timeline on disk.
@@ -260,7 +262,7 @@ impl DeleteTimelineFlow {
         tenant: Arc<Tenant>,
         timeline_id: TimelineId,
         local_metadata: &TimelineMetadata,
-        remote_client: Option<RemoteTimelineClient>,
+        remote_client: RemoteTimelineClient,
         deletion_queue_client: DeletionQueueClient,
     ) -> anyhow::Result<()> {
         // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -278,6 +280,8 @@ impl DeleteTimelineFlow {
                 // Important. We dont pass ancestor above because it can be missing.
                 // Thus we need to skip the validation here.
                 CreateTimelineCause::Delete,
+                // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace
+                None,
             )
             .context("create_timeline_struct")?;
 
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 5c2b25da56..e6ddabe5b5 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -12,7 +12,7 @@ use crate::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
-use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
+use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
 
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum Error {
@@ -22,8 +22,6 @@ pub(crate) enum Error {
     TooManyAncestors,
     #[error("shutting down, please retry later")]
     ShuttingDown,
-    #[error("detached timeline must receive writes before the operation")]
-    DetachedTimelineNeedsWrites,
     #[error("flushing failed")]
     FlushAncestor(#[source] anyhow::Error),
     #[error("layer download failed")]
@@ -43,6 +41,27 @@ pub(crate) enum Error {
     Unexpected(#[source] anyhow::Error),
 }
 
+impl From<Error> for ApiError {
+    fn from(value: Error) -> Self {
+        match value {
+            e @ Error::NoAncestor => ApiError::Conflict(e.to_string()),
+            // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
+            e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)),
+            Error::ShuttingDown => ApiError::ShuttingDown,
+            Error::OtherTimelineDetachOngoing(_) => {
+                ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
+            }
+            // All of these contain shutdown errors, in fact, it's the most common
+            e @ Error::FlushAncestor(_)
+            | e @ Error::RewrittenDeltaDownloadFailed(_)
+            | e @ Error::CopyDeltaPrefix(_)
+            | e @ Error::UploadRewritten(_)
+            | e @ Error::CopyFailed(_)
+            | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
+        }
+    }
+}
+
 pub(crate) struct PreparedTimelineDetach {
     layers: Vec<Layer>,
 }
@@ -58,7 +77,7 @@ impl Default for Options {
     fn default() -> Self {
         Self {
             rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
-            copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
+            copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(),
         }
     }
 }
@@ -72,15 +91,16 @@ pub(super) async fn prepare(
 ) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
     use Error::*;
 
-    if detached.remote_client.as_ref().is_none() {
-        unimplemented!("no new code for running without remote storage");
-    }
-
     let Some((ancestor, ancestor_lsn)) = detached
         .ancestor_timeline
         .as_ref()
         .map(|tl| (tl.clone(), detached.ancestor_lsn))
     else {
+        // TODO: check if we have already been detached; for this we need to read the stored data
+        // on remote client, for that we need a follow-up which makes uploads cheaper and maintains
+        // a projection of the commited data.
+        //
+        // the error is wrong per openapi
         return Err(NoAncestor);
     };
 
@@ -90,18 +110,10 @@ pub(super) async fn prepare(
 
     if ancestor.ancestor_timeline.is_some() {
         // non-technical requirement; we could flatten N ancestors just as easily but we chose
-        // not to
+        // not to, at least initially
         return Err(TooManyAncestors);
     }
 
-    if detached.get_prev_record_lsn() == Lsn::INVALID
-        || detached.disk_consistent_lsn.load() == ancestor_lsn
-    {
-        // this is to avoid a problem that after detaching we would be unable to start up the
-        // compute because of "PREV_LSN: invalid".
-        return Err(DetachedTimelineNeedsWrites);
-    }
-
     // before we acquire the gate, we must mark the ancestor as having a detach operation
     // ongoing which will block other concurrent detach operations so we don't get to ackward
     // situations where there would be two branches trying to reparent earlier branches.
@@ -225,6 +237,7 @@ pub(super) async fn prepare(
                 &detached
                     .conf
                     .timeline_path(&detached.tenant_shard_id, &detached.timeline_id),
+                ctx,
             )
             .await
             .fatal_err("VirtualFile::open for timeline dir fsync");
@@ -324,8 +337,6 @@ async fn upload_rewritten_layer(
     // FIXME: better shuttingdown error
     target
         .remote_client
-        .as_ref()
-        .unwrap()
         .upload_layer_file(&copied, cancel)
         .await
         .map_err(UploadRewritten)?;
@@ -349,6 +360,7 @@ async fn copy_lsn_prefix(
         target_timeline.tenant_shard_id,
         layer.layer_desc().key_range.start,
         layer.layer_desc().lsn_range.start..end_lsn,
+        ctx,
     )
     .await
     .map_err(CopyDeltaPrefix)?;
@@ -407,15 +419,13 @@ async fn remote_copy(
     let owned = crate::tenant::storage_layer::Layer::for_evicted(
         adoptee.conf,
         adoptee,
-        adopted.layer_desc().filename(),
+        adopted.layer_desc().layer_name(),
         metadata,
     );
 
     // FIXME: better shuttingdown error
     adoptee
         .remote_client
-        .as_ref()
-        .unwrap()
         .copy_timeline_layer(adopted, &owned, cancel)
         .await
         .map(move |()| owned)
@@ -429,11 +439,6 @@ pub(super) async fn complete(
     prepared: PreparedTimelineDetach,
     _ctx: &RequestContext,
 ) -> Result<Vec<TimelineId>, anyhow::Error> {
-    let rtc = detached
-        .remote_client
-        .as_ref()
-        .expect("has to have a remote timeline client for timeline ancestor detach");
-
     let PreparedTimelineDetach { layers } = prepared;
 
     let ancestor = detached
@@ -450,11 +455,13 @@ pub(super) async fn complete(
     //
     // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
     // which could give us a completely wrong layer combination.
-    rtc.schedule_adding_existing_layers_to_index_detach_and_wait(
-        &layers,
-        (ancestor.timeline_id, ancestor_lsn),
-    )
-    .await?;
+    detached
+        .remote_client
+        .schedule_adding_existing_layers_to_index_detach_and_wait(
+            &layers,
+            (ancestor.timeline_id, ancestor_lsn),
+        )
+        .await?;
 
     let mut tasks = tokio::task::JoinSet::new();
 
@@ -499,8 +506,6 @@ pub(super) async fn complete(
                 async move {
                     let res = timeline
                         .remote_client
-                        .as_ref()
-                        .expect("reparented has to have remote client because detached has one")
                         .schedule_reparenting_and_wait(&new_parent)
                         .await;
 
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 3567761b9a..8a8c38d0ce 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -23,7 +23,7 @@ use std::{
 use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
+use tracing::{debug, info, info_span, instrument, warn, Instrument};
 
 use crate::{
     context::{DownloadBehavior, RequestContext},
@@ -211,11 +211,6 @@ impl Timeline {
 
         // So, we just need to deal with this.
 
-        if self.remote_client.is_none() {
-            error!("no remote storage configured, cannot evict layers");
-            return ControlFlow::Continue(());
-        }
-
         let mut js = tokio::task::JoinSet::new();
         {
             let guard = self.layers.read().await;
diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs
index 9c33981807..feadc79e5e 100644
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -6,10 +6,9 @@ use crate::{
             self,
             index::{IndexPart, LayerFileMetadata},
         },
-        storage_layer::LayerFileName,
+        storage_layer::LayerName,
         Generation,
     },
-    METADATA_FILE_NAME,
 };
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
@@ -20,15 +19,13 @@ use utils::lsn::Lsn;
 /// Identified files in the timeline directory.
 pub(super) enum Discovered {
     /// The only one we care about
-    Layer(LayerFileName, Utf8PathBuf, u64),
+    Layer(LayerName, Utf8PathBuf, u64),
     /// Old ephmeral files from previous launches, should be removed
     Ephemeral(String),
     /// Old temporary timeline files, unsure what these really are, should be removed
     Temporary(String),
     /// Temporary on-demand download files, should be removed
     TemporaryDownload(String),
-    /// "metadata" file we persist locally and include in `index_part.json`
-    Metadata,
     /// Backup file from previously future layers
     IgnoredBackup,
     /// Unrecognized, warn about these
@@ -43,15 +40,13 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
         let direntry = direntry?;
         let file_name = direntry.file_name().to_string();
 
-        let discovered = match LayerFileName::from_str(&file_name) {
+        let discovered = match LayerName::from_str(&file_name) {
             Ok(file_name) => {
                 let file_size = direntry.metadata()?.len();
                 Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
             }
             Err(_) => {
-                if file_name == METADATA_FILE_NAME {
-                    Discovered::Metadata
-                } else if file_name.ends_with(".old") {
+                if file_name.ends_with(".old") {
                     // ignore these
                     Discovered::IgnoredBackup
                 } else if remote_timeline_client::is_temp_download_file(direntry.path()) {
@@ -72,6 +67,28 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
     Ok(ret)
 }
 
+/// Whereas `LayerFileMetadata` describes the metadata we would store in remote storage,
+/// this structure extends it with metadata describing the layer's presence in local storage.
+#[derive(Clone, Debug)]
+pub(super) struct LocalLayerFileMetadata {
+    pub(super) metadata: LayerFileMetadata,
+    pub(super) local_path: Utf8PathBuf,
+}
+
+impl LocalLayerFileMetadata {
+    pub fn new(
+        local_path: Utf8PathBuf,
+        file_size: u64,
+        generation: Generation,
+        shard: ShardIndex,
+    ) -> Self {
+        Self {
+            local_path,
+            metadata: LayerFileMetadata::new(file_size, generation, shard),
+        }
+    }
+}
+
 /// Decision on what to do with a layer file after considering its local and remote metadata.
 #[derive(Clone, Debug)]
 pub(super) enum Decision {
@@ -80,11 +97,11 @@ pub(super) enum Decision {
     /// The layer is present locally, but local metadata does not match remote; we must
     /// delete it and treat it as evicted.
     UseRemote {
-        local: LayerFileMetadata,
+        local: LocalLayerFileMetadata,
         remote: LayerFileMetadata,
     },
     /// The layer is present locally, and metadata matches.
-    UseLocal(LayerFileMetadata),
+    UseLocal(LocalLayerFileMetadata),
 }
 
 /// A layer needs to be left out of the layer map.
@@ -92,39 +109,29 @@ pub(super) enum Decision {
 pub(super) enum DismissedLayer {
     /// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded.
     Future {
-        /// The local metadata. `None` if the layer is only known through [`IndexPart`].
-        local: Option<LayerFileMetadata>,
+        /// `None` if the layer is only known through [`IndexPart`].
+        local: Option<LocalLayerFileMetadata>,
     },
     /// The layer only exists locally.
     ///
     /// In order to make crash safe updates to layer map, we must dismiss layers which are only
     /// found locally or not yet included in the remote `index_part.json`.
-    LocalOnly(LayerFileMetadata),
+    LocalOnly(LocalLayerFileMetadata),
 }
 
 /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
 pub(super) fn reconcile(
-    discovered: Vec<(LayerFileName, Utf8PathBuf, u64)>,
+    discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
     index_part: Option<&IndexPart>,
     disk_consistent_lsn: Lsn,
     generation: Generation,
     shard: ShardIndex,
-) -> Vec<(
-    LayerFileName,
-    Option<Utf8PathBuf>,
-    Result<Decision, DismissedLayer>,
-)> {
+) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
     use Decision::*;
 
-    // name => (local_path, local_metadata, remote_metadata)
-    type Collected = HashMap<
-        LayerFileName,
-        (
-            Option<Utf8PathBuf>,
-            Option<LayerFileMetadata>,
-            Option<LayerFileMetadata>,
-        ),
-    >;
+    // name => (local_metadata, remote_metadata)
+    type Collected =
+        HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;
 
     let mut discovered = discovered
         .into_iter()
@@ -135,8 +142,9 @@ pub(super) fn reconcile(
                 // it is not in IndexPart, in which case using our current generation makes sense
                 // because it will be uploaded in this generation.
                 (
-                    Some(local_path),
-                    Some(LayerFileMetadata::new(file_size, generation, shard)),
+                    Some(LocalLayerFileMetadata::new(
+                        local_path, file_size, generation, shard,
+                    )),
                     None,
                 ),
             )
@@ -152,20 +160,20 @@ pub(super) fn reconcile(
         .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
         .for_each(|(name, metadata)| {
             if let Some(existing) = discovered.get_mut(name) {
-                existing.2 = Some(metadata);
+                existing.1 = Some(metadata);
             } else {
-                discovered.insert(name.to_owned(), (None, None, Some(metadata)));
+                discovered.insert(name.to_owned(), (None, Some(metadata)));
             }
         });
 
     discovered
         .into_iter()
-        .map(|(name, (local_path, local, remote))| {
+        .map(|(name, (local, remote))| {
             let decision = if name.is_in_future(disk_consistent_lsn) {
                 Err(DismissedLayer::Future { local })
             } else {
                 match (local, remote) {
-                    (Some(local), Some(remote)) if local != remote => {
+                    (Some(local), Some(remote)) if local.metadata != remote => {
                         Ok(UseRemote { local, remote })
                     }
                     (Some(x), Some(_)) => Ok(UseLocal(x)),
@@ -177,7 +185,7 @@ pub(super) fn reconcile(
                 }
             };
 
-            (name, local_path, decision)
+            (name, decision)
         })
         .collect::<Vec<_>>()
 }
@@ -189,12 +197,12 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
 }
 
 pub(super) fn cleanup_local_file_for_remote(
-    path: &Utf8Path,
-    local: &LayerFileMetadata,
+    local: &LocalLayerFileMetadata,
     remote: &LayerFileMetadata,
 ) -> anyhow::Result<()> {
-    let local_size = local.file_size();
+    let local_size = local.metadata.file_size();
     let remote_size = remote.file_size();
+    let path = &local.local_path;
 
     let file_name = path.file_name().expect("must be file path");
     tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
@@ -211,7 +219,7 @@ pub(super) fn cleanup_local_file_for_remote(
 
 pub(super) fn cleanup_future_layer(
     path: &Utf8Path,
-    name: &LayerFileName,
+    name: &LayerName,
     disk_consistent_lsn: Lsn,
 ) -> anyhow::Result<()> {
     // future image layers are allowed to be produced always for not yet flushed to disk
@@ -223,12 +231,14 @@ pub(super) fn cleanup_future_layer(
 }
 
 pub(super) fn cleanup_local_only_file(
-    path: &Utf8Path,
-    name: &LayerFileName,
-    local: &LayerFileMetadata,
+    name: &LayerName,
+    local: &LocalLayerFileMetadata,
 ) -> anyhow::Result<()> {
     let kind = name.kind();
-    tracing::info!("found local-only {kind} layer {name}, metadata {local:?}");
-    std::fs::remove_file(path)?;
+    tracing::info!(
+        "found local-only {kind} layer {name}, metadata {:?}",
+        local.metadata
+    );
+    std::fs::remove_file(&local.local_path)?;
     Ok(())
 }
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 8e8d64e0c6..248420e632 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -9,6 +9,7 @@ use utils::{
 
 use crate::{
     config::PageServerConf,
+    context::RequestContext,
     metrics::TimelineMetrics,
     tenant::{
         layer_map::{BatchedUpdates, LayerMap},
@@ -69,6 +70,7 @@ impl LayerManager {
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
+        ctx: &RequestContext,
     ) -> Result<Arc<InMemoryLayer>> {
         ensure!(lsn.is_aligned());
 
@@ -105,7 +107,7 @@ impl LayerManager {
             );
 
             let new_layer =
-                InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?;
+                InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?;
             let layer = Arc::new(new_layer);
 
             self.layer_map.open_layer = Some(layer.clone());
@@ -294,7 +296,7 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
         // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
         self.0
             .get(&desc.key())
-            .with_context(|| format!("get layer from desc: {}", desc.filename()))
+            .with_context(|| format!("get layer from desc: {}", desc.layer_name()))
             .expect("not found")
             .clone()
     }
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index 991e4ac045..1d2ffec08f 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -705,6 +705,7 @@ impl ConnectionManagerState {
                     commit_lsn: info.commit_lsn,
                     safekeeper_connstr: info.safekeeper_connstr,
                     availability_zone: info.availability_zone,
+                    standby_horizon: info.standby_horizon,
                 }
             }
             MessageType::SafekeeperDiscoveryResponse => {
@@ -725,6 +726,21 @@ impl ConnectionManagerState {
 
         WALRECEIVER_BROKER_UPDATES.inc();
 
+        trace!(
+            "safekeeper info update: standby_horizon(cutoff)={}",
+            timeline_update.standby_horizon
+        );
+        if timeline_update.standby_horizon != 0 {
+            // ignore reports from safekeepers not connected to replicas
+            self.timeline
+                .standby_horizon
+                .store(Lsn(timeline_update.standby_horizon));
+            self.timeline
+                .metrics
+                .standby_horizon_gauge
+                .set(timeline_update.standby_horizon as i64);
+        }
+
         let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
         let old_entry = self.wal_stream_candidates.insert(
             new_safekeeper_id,
@@ -1094,6 +1110,7 @@ mod tests {
                 commit_lsn,
                 safekeeper_connstr: safekeeper_connstr.to_owned(),
                 availability_zone: None,
+                standby_horizon: 0,
             },
             latest_update,
         }
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 0bf4d1e599..c0cc8f3124 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,12 +1,14 @@
-use super::storage_layer::LayerFileName;
+use super::storage_layer::LayerName;
 use super::storage_layer::ResidentLayer;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::remote_timeline_client::index::Lineage;
 use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;
 
 use chrono::NaiveDateTime;
+use pageserver_api::models::AuxFilePolicy;
 use std::sync::Arc;
 use tracing::info;
 use utils::lsn::AtomicLsn;
@@ -45,7 +47,7 @@ pub(crate) struct UploadQueueInitialized {
 
     /// All layer files stored in the remote storage, taking into account all
     /// in-progress and queued operations
-    pub(crate) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+    pub(crate) latest_files: HashMap<LayerName, LayerFileMetadata>,
 
     /// How many file uploads or deletions been scheduled, since the
     /// last (scheduling of) metadata index upload?
@@ -56,6 +58,12 @@ pub(crate) struct UploadQueueInitialized {
     /// DANGER: do not return to outside world, e.g., safekeepers.
     pub(crate) latest_metadata: TimelineMetadata,
 
+    /// Part of the flattened "next" `index_part.json`.
+    pub(crate) latest_lineage: Lineage,
+
+    /// The last aux file policy used on this timeline.
+    pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
+
     /// `disk_consistent_lsn` from the last metadata file that was successfully
     /// uploaded. `Lsn(0)` if nothing was uploaded yet.
     /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
@@ -89,7 +97,7 @@ pub(crate) struct UploadQueueInitialized {
     /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a
     /// bug causing leaks, then it's better to not leave this enabled for production builds.
     #[cfg(feature = "testing")]
-    pub(crate) dangling_files: HashMap<LayerFileName, Generation>,
+    pub(crate) dangling_files: HashMap<LayerName, Generation>,
 
     /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
     pub(crate) shutting_down: bool,
@@ -171,6 +179,7 @@ impl UploadQueue {
             latest_files: HashMap::new(),
             latest_files_changes_since_metadata_upload_scheduled: 0,
             latest_metadata: metadata.clone(),
+            latest_lineage: Lineage::default(),
             projected_remote_consistent_lsn: None,
             visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)),
             // what follows are boring default initializations
@@ -184,6 +193,7 @@ impl UploadQueue {
             dangling_files: HashMap::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+            last_aux_file_policy: Default::default(),
         };
 
         *self = UploadQueue::Initialized(state);
@@ -218,6 +228,7 @@ impl UploadQueue {
             latest_files: files,
             latest_files_changes_since_metadata_upload_scheduled: 0,
             latest_metadata: index_part.metadata.clone(),
+            latest_lineage: index_part.lineage.clone(),
             projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()),
             visible_remote_consistent_lsn: Arc::new(
                 index_part.metadata.disk_consistent_lsn().into(),
@@ -233,6 +244,7 @@ impl UploadQueue {
             dangling_files: HashMap::new(),
             shutting_down: false,
             shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
+            last_aux_file_policy: index_part.last_aux_file_policy(),
         };
 
         *self = UploadQueue::Initialized(state);
@@ -281,7 +293,7 @@ pub(crate) struct UploadTask {
 /// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
 #[derive(Debug)]
 pub(crate) struct Delete {
-    pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>,
+    pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>,
 }
 
 #[derive(Debug)]
@@ -290,7 +302,7 @@ pub(crate) enum UploadOp {
     UploadLayer(ResidentLayer, LayerFileMetadata),
 
     /// Upload the metadata file
-    UploadMetadata(IndexPart, Lsn),
+    UploadMetadata(Box<IndexPart>, Lsn),
 
     /// Delete layer files
     Delete(Delete),
diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs
index 91934d5e0e..6e825760e3 100644
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -23,6 +23,7 @@ use pageserver_api::key::Key;
 use utils::lsn::Lsn;
 use utils::vec_map::VecMap;
 
+use crate::context::RequestContext;
 use crate::virtual_file::VirtualFile;
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -285,6 +286,7 @@ impl<'a> VectoredBlobReader<'a> {
         &self,
         read: &VectoredRead,
         buf: BytesMut,
+        ctx: &RequestContext,
     ) -> Result<VectoredBlobsBuf, std::io::Error> {
         assert!(read.size() > 0);
         assert!(
@@ -295,7 +297,7 @@ impl<'a> VectoredBlobReader<'a> {
         );
         let buf = self
             .file
-            .read_exact_at_n(buf, read.start, read.size())
+            .read_exact_at_n(buf, read.start, read.size(), ctx)
             .await?;
 
         let blobs_at = read.blobs_at.as_slice();
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index a17488a286..b68f3a0e89 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -344,16 +344,23 @@ macro_rules! with_file {
 
 impl VirtualFile {
     /// Open a file in read-only mode. Like File::open.
-    pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
-        Self::open_with_options(path, OpenOptions::new().read(true)).await
+    pub async fn open(
+        path: &Utf8Path,
+        ctx: &RequestContext,
+    ) -> Result<VirtualFile, std::io::Error> {
+        Self::open_with_options(path, OpenOptions::new().read(true), ctx).await
     }
 
     /// Create a new file for writing. If the file exists, it will be truncated.
     /// Like File::create.
-    pub async fn create(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
+    pub async fn create(
+        path: &Utf8Path,
+        ctx: &RequestContext,
+    ) -> Result<VirtualFile, std::io::Error> {
         Self::open_with_options(
             path,
             OpenOptions::new().write(true).create(true).truncate(true),
+            ctx,
         )
         .await
     }
@@ -366,6 +373,7 @@ impl VirtualFile {
     pub async fn open_with_options(
         path: &Utf8Path,
         open_options: &OpenOptions,
+        _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
     ) -> Result<VirtualFile, std::io::Error> {
         let path_str = path.to_string();
         let parts = path_str.split('/').collect::<Vec<&str>>();
@@ -576,21 +584,34 @@ impl VirtualFile {
         Ok(self.pos)
     }
 
-    pub async fn read_exact_at<B>(&self, buf: B, offset: u64) -> Result<B, Error>
+    pub async fn read_exact_at<B>(
+        &self,
+        buf: B,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> Result<B, Error>
     where
         B: IoBufMut + Send,
     {
-        let (buf, res) =
-            read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await;
+        let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| {
+            self.read_at(buf, offset, ctx)
+        })
+        .await;
         res.map(|()| buf)
     }
 
-    pub async fn read_exact_at_n<B>(&self, buf: B, offset: u64, count: usize) -> Result<B, Error>
+    pub async fn read_exact_at_n<B>(
+        &self,
+        buf: B,
+        offset: u64,
+        count: usize,
+        ctx: &RequestContext,
+    ) -> Result<B, Error>
     where
         B: IoBufMut + Send,
     {
         let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| {
-            self.read_at(buf, offset)
+            self.read_at(buf, offset, ctx)
         })
         .await;
         res.map(|()| buf)
@@ -601,12 +622,13 @@ impl VirtualFile {
         &self,
         page: PageWriteGuard<'static>,
         offset: u64,
+        ctx: &RequestContext,
     ) -> Result<PageWriteGuard<'static>, Error> {
         let buf = PageWriteGuardBuf {
             page,
             init_up_to: 0,
         };
-        let res = self.read_exact_at(buf, offset).await;
+        let res = self.read_exact_at(buf, offset, ctx).await;
         res.map(|PageWriteGuardBuf { page, .. }| page)
             .map_err(|e| Error::new(ErrorKind::Other, e))
     }
@@ -699,7 +721,12 @@ impl VirtualFile {
         (buf, Ok(n))
     }
 
-    pub(crate) async fn read_at<B>(&self, buf: B, offset: u64) -> (B, Result<usize, Error>)
+    pub(crate) async fn read_at<B>(
+        &self,
+        buf: B,
+        offset: u64,
+        _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */
+    ) -> (B, Result<usize, Error>)
     where
         B: tokio_epoll_uring::BoundedBufMut + Send,
     {
@@ -1020,20 +1047,21 @@ impl VirtualFile {
     pub(crate) async fn read_blk(
         &self,
         blknum: u32,
+        ctx: &RequestContext,
     ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
         use crate::page_cache::PAGE_SZ;
         let buf = vec![0; PAGE_SZ];
         let buf = self
-            .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64))
+            .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx)
             .await?;
         Ok(crate::tenant::block_io::BlockLease::Vec(buf))
     }
 
-    async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
+    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
         let mut tmp = vec![0; 128];
         loop {
             let res;
-            (tmp, res) = self.read_at(tmp, self.pos).await;
+            (tmp, res) = self.read_at(tmp, self.pos, ctx).await;
             match res {
                 Ok(0) => return Ok(()),
                 Ok(n) => {
@@ -1159,7 +1187,6 @@ mod tests {
     use rand::seq::SliceRandom;
     use rand::thread_rng;
     use rand::Rng;
-    use std::future::Future;
     use std::io::Write;
     use std::os::unix::fs::FileExt;
     use std::sync::Arc;
@@ -1176,9 +1203,14 @@ mod tests {
     }
 
     impl MaybeVirtualFile {
-        async fn read_exact_at(&self, mut buf: Vec<u8>, offset: u64) -> Result<Vec<u8>, Error> {
+        async fn read_exact_at(
+            &self,
+            mut buf: Vec<u8>,
+            offset: u64,
+            ctx: &RequestContext,
+        ) -> Result<Vec<u8>, Error> {
             match self {
-                MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
+                MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await,
                 MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
             }
         }
@@ -1230,13 +1262,13 @@ mod tests {
 
         // Helper function to slurp contents of a file, starting at the current position,
         // into a string
-        async fn read_string(&mut self) -> Result<String, Error> {
+        async fn read_string(&mut self, ctx: &RequestContext) -> Result<String, Error> {
             use std::io::Read;
             let mut buf = String::new();
             match self {
                 MaybeVirtualFile::VirtualFile(file) => {
                     let mut buf = Vec::new();
-                    file.read_to_end(&mut buf).await?;
+                    file.read_to_end(&mut buf, ctx).await?;
                     return Ok(String::from_utf8(buf).unwrap());
                 }
                 MaybeVirtualFile::File(file) => {
@@ -1247,9 +1279,14 @@ mod tests {
         }
 
         // Helper function to slurp a portion of a file into a string
-        async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
+        async fn read_string_at(
+            &mut self,
+            pos: u64,
+            len: usize,
+            ctx: &RequestContext,
+        ) -> Result<String, Error> {
             let buf = vec![0; len];
-            let buf = self.read_exact_at(buf, pos).await?;
+            let buf = self.read_exact_at(buf, pos, ctx).await?;
             Ok(String::from_utf8(buf).unwrap())
         }
     }
@@ -1263,73 +1300,101 @@ mod tests {
         // results with VirtualFiles as with native Files. (Except that with
         // native files, you will run out of file descriptors if the ulimit
         // is low enough.)
-        test_files("virtual_files", |path, open_options| async move {
-            let vf = VirtualFile::open_with_options(&path, &open_options).await?;
-            Ok(MaybeVirtualFile::VirtualFile(vf))
-        })
-        .await
+        struct A;
+
+        impl Adapter for A {
+            async fn open(
+                path: Utf8PathBuf,
+                opts: OpenOptions,
+                ctx: &RequestContext,
+            ) -> Result<MaybeVirtualFile, anyhow::Error> {
+                let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?;
+                Ok(MaybeVirtualFile::VirtualFile(vf))
+            }
+        }
+        test_files::<A>("virtual_files").await
     }
 
     #[tokio::test]
     async fn test_physical_files() -> anyhow::Result<()> {
-        test_files("physical_files", |path, open_options| async move {
-            Ok(MaybeVirtualFile::File({
-                let owned_fd = open_options.open(path.as_std_path()).await?;
-                File::from(owned_fd)
-            }))
-        })
-        .await
+        struct B;
+
+        impl Adapter for B {
+            async fn open(
+                path: Utf8PathBuf,
+                opts: OpenOptions,
+                _ctx: &RequestContext,
+            ) -> Result<MaybeVirtualFile, anyhow::Error> {
+                Ok(MaybeVirtualFile::File({
+                    let owned_fd = opts.open(path.as_std_path()).await?;
+                    File::from(owned_fd)
+                }))
+            }
+        }
+
+        test_files::<B>("physical_files").await
     }
 
-    async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> anyhow::Result<()>
+    /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition
+    /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function
+    /// in trait which benefits from the new lifetime capture rules already.
+    trait Adapter {
+        async fn open(
+            path: Utf8PathBuf,
+            opts: OpenOptions,
+            ctx: &RequestContext,
+        ) -> Result<MaybeVirtualFile, anyhow::Error>;
+    }
+
+    async fn test_files<A>(testname: &str) -> anyhow::Result<()>
     where
-        OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
-        FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
+        A: Adapter,
     {
         let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
         let testdir = crate::config::PageServerConf::test_repo_dir(testname);
         std::fs::create_dir_all(&testdir)?;
 
         let path_a = testdir.join("file_a");
-        let mut file_a = openfunc(
+        let mut file_a = A::open(
             path_a.clone(),
             OpenOptions::new()
                 .write(true)
                 .create(true)
                 .truncate(true)
                 .to_owned(),
+            &ctx,
         )
         .await?;
         file_a.write_all(b"foobar".to_vec(), &ctx).await?;
 
         // cannot read from a file opened in write-only mode
-        let _ = file_a.read_string().await.unwrap_err();
+        let _ = file_a.read_string(&ctx).await.unwrap_err();
 
         // Close the file and re-open for reading
-        let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
+        let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?;
 
         // cannot write to a file opened in read-only mode
         let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err();
 
         // Try simple read
-        assert_eq!("foobar", file_a.read_string().await?);
+        assert_eq!("foobar", file_a.read_string(&ctx).await?);
 
         // It's positioned at the EOF now.
-        assert_eq!("", file_a.read_string().await?);
+        assert_eq!("", file_a.read_string(&ctx).await?);
 
         // Test seeks.
         assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
-        assert_eq!("oobar", file_a.read_string().await?);
+        assert_eq!("oobar", file_a.read_string(&ctx).await?);
 
         assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4);
-        assert_eq!("ar", file_a.read_string().await?);
+        assert_eq!("ar", file_a.read_string(&ctx).await?);
 
         assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1);
         assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3);
-        assert_eq!("bar", file_a.read_string().await?);
+        assert_eq!("bar", file_a.read_string(&ctx).await?);
 
         assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1);
-        assert_eq!("oobar", file_a.read_string().await?);
+        assert_eq!("oobar", file_a.read_string(&ctx).await?);
 
         // Test erroneous seeks to before byte 0
         file_a.seek(SeekFrom::End(-7)).await.unwrap_err();
@@ -1337,11 +1402,11 @@ mod tests {
         file_a.seek(SeekFrom::Current(-2)).await.unwrap_err();
 
         // the erroneous seek should have left the position unchanged
-        assert_eq!("oobar", file_a.read_string().await?);
+        assert_eq!("oobar", file_a.read_string(&ctx).await?);
 
         // Create another test file, and try FileExt functions on it.
         let path_b = testdir.join("file_b");
-        let mut file_b = openfunc(
+        let mut file_b = A::open(
             path_b.clone(),
             OpenOptions::new()
                 .read(true)
@@ -1349,12 +1414,13 @@ mod tests {
                 .create(true)
                 .truncate(true)
                 .to_owned(),
+            &ctx,
         )
         .await?;
         file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?;
         file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?;
 
-        assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");
+        assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
 
         // Open a lot of files, enough to cause some evictions. (Or to be precise,
         // open the same file many times. The effect is the same.)
@@ -1364,9 +1430,13 @@ mod tests {
 
         let mut vfiles = Vec::new();
         for _ in 0..100 {
-            let mut vfile =
-                openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?;
-            assert_eq!("FOOBAR", vfile.read_string().await?);
+            let mut vfile = A::open(
+                path_b.clone(),
+                OpenOptions::new().read(true).to_owned(),
+                &ctx,
+            )
+            .await?;
+            assert_eq!("FOOBAR", vfile.read_string(&ctx).await?);
             vfiles.push(vfile);
         }
 
@@ -1375,13 +1445,13 @@ mod tests {
 
         // The underlying file descriptor for 'file_a' should be closed now. Try to read
         // from it again. We left the file positioned at offset 1 above.
-        assert_eq!("oobar", file_a.read_string().await?);
+        assert_eq!("oobar", file_a.read_string(&ctx).await?);
 
         // Check that all the other FDs still work too. Use them in random order for
         // good measure.
         vfiles.as_mut_slice().shuffle(&mut thread_rng());
         for vfile in vfiles.iter_mut() {
-            assert_eq!("OOBAR", vfile.read_string_at(1, 5).await?);
+            assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?);
         }
 
         Ok(())
@@ -1397,6 +1467,7 @@ mod tests {
         const THREADS: usize = 100;
         const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
 
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
         let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
         std::fs::create_dir_all(&testdir)?;
 
@@ -1410,8 +1481,12 @@ mod tests {
         // Open the file many times.
         let mut files = Vec::new();
         for _ in 0..VIRTUAL_FILES {
-            let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))
-                .await?;
+            let f = VirtualFile::open_with_options(
+                &test_file_path,
+                OpenOptions::new().read(true),
+                &ctx,
+            )
+            .await?;
             files.push(f);
         }
         let files = Arc::new(files);
@@ -1425,12 +1500,13 @@ mod tests {
         let mut hdls = Vec::new();
         for _threadno in 0..THREADS {
             let files = files.clone();
+            let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error);
             let hdl = rt.spawn(async move {
                 let mut buf = vec![0u8; SIZE];
                 let mut rng = rand::rngs::OsRng;
                 for _ in 1..1000 {
                     let f = &files[rng.gen_range(0..files.len())];
-                    buf = f.read_exact_at(buf, 0).await.unwrap();
+                    buf = f.read_exact_at(buf, 0, &ctx).await.unwrap();
                     assert!(buf == SAMPLE);
                 }
             });
@@ -1446,6 +1522,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_atomic_overwrite_basic() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
         let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
         std::fs::create_dir_all(&testdir).unwrap();
 
@@ -1455,8 +1532,8 @@ mod tests {
         VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
             .await
             .unwrap();
-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
-        let post = file.read_string().await.unwrap();
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
+        let post = file.read_string(&ctx).await.unwrap();
         assert_eq!(post, "foo");
         assert!(!tmp_path.exists());
         drop(file);
@@ -1464,8 +1541,8 @@ mod tests {
         VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
             .await
             .unwrap();
-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
-        let post = file.read_string().await.unwrap();
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
+        let post = file.read_string(&ctx).await.unwrap();
         assert_eq!(post, "bar");
         assert!(!tmp_path.exists());
         drop(file);
@@ -1473,6 +1550,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_atomic_overwrite_preexisting_tmp() {
+        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
         let testdir =
             crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
         std::fs::create_dir_all(&testdir).unwrap();
@@ -1487,8 +1565,8 @@ mod tests {
             .await
             .unwrap();
 
-        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
-        let post = file.read_string().await.unwrap();
+        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
+        let post = file.read_string(&ctx).await.unwrap();
         assert_eq!(post, "foo");
         assert!(!tmp_path.exists());
         drop(file);
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 9776d4ce88..3decea0c6d 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -153,10 +153,7 @@ impl PostgresRedoManager {
             process: self
                 .redo_process
                 .get()
-                .map(|p| WalRedoManagerProcessStatus {
-                    pid: p.id(),
-                    kind: std::borrow::Cow::Borrowed(p.kind().into()),
-                }),
+                .map(|p| WalRedoManagerProcessStatus { pid: p.id() }),
         }
     }
 }
diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs
index ad6b4e5fe9..02c9c04bf1 100644
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -1,7 +1,10 @@
+/// Layer of indirection previously used to support multiple implementations.
+/// Subject to removal: <https://github.com/neondatabase/neon/issues/7753>
 use std::time::Duration;
 
 use bytes::Bytes;
 use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use tracing::warn;
 use utils::lsn::Lsn;
 
 use crate::{config::PageServerConf, walrecord::NeonWalRecord};
@@ -12,7 +15,6 @@ mod protocol;
 
 mod process_impl {
     pub(super) mod process_async;
-    pub(super) mod process_std;
 }
 
 #[derive(
@@ -34,10 +36,7 @@ pub enum Kind {
     Async,
 }
 
-pub(crate) enum Process {
-    Sync(process_impl::process_std::WalRedoProcess),
-    Async(process_impl::process_async::WalRedoProcess),
-}
+pub(crate) struct Process(process_impl::process_async::WalRedoProcess);
 
 impl Process {
     #[inline(always)]
@@ -46,18 +45,17 @@ impl Process {
         tenant_shard_id: TenantShardId,
         pg_version: u32,
     ) -> anyhow::Result<Self> {
-        Ok(match conf.walredo_process_kind {
-            Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
-                conf,
-                tenant_shard_id,
-                pg_version,
-            )?),
-            Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch(
-                conf,
-                tenant_shard_id,
-                pg_version,
-            )?),
-        })
+        if conf.walredo_process_kind != Kind::Async {
+            warn!(
+                configured = %conf.walredo_process_kind,
+                "the walredo_process_kind setting has been turned into a no-op, using async implementation"
+            );
+        }
+        Ok(Self(process_impl::process_async::WalRedoProcess::launch(
+            conf,
+            tenant_shard_id,
+            pg_version,
+        )?))
     }
 
     #[inline(always)]
@@ -69,29 +67,12 @@ impl Process {
         records: &[(Lsn, NeonWalRecord)],
         wal_redo_timeout: Duration,
     ) -> anyhow::Result<Bytes> {
-        match self {
-            Process::Sync(p) => {
-                p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
-                    .await
-            }
-            Process::Async(p) => {
-                p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
-                    .await
-            }
-        }
+        self.0
+            .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+            .await
     }
 
     pub(crate) fn id(&self) -> u32 {
-        match self {
-            Process::Sync(p) => p.id(),
-            Process::Async(p) => p.id(),
-        }
-    }
-
-    pub(crate) fn kind(&self) -> Kind {
-        match self {
-            Process::Sync(_) => Kind::Sync,
-            Process::Async(_) => Kind::Async,
-        }
+        self.0.id()
     }
 }
diff --git a/pageserver/src/walredo/process/process_impl/process_std.rs b/pageserver/src/walredo/process/process_impl/process_std.rs
deleted file mode 100644
index e7a6c263c9..0000000000
--- a/pageserver/src/walredo/process/process_impl/process_std.rs
+++ /dev/null
@@ -1,405 +0,0 @@
-use self::no_leak_child::NoLeakChild;
-use crate::{
-    config::PageServerConf,
-    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
-    walrecord::NeonWalRecord,
-    walredo::process::{no_leak_child, protocol},
-};
-use anyhow::Context;
-use bytes::Bytes;
-use nix::poll::{PollFd, PollFlags};
-use pageserver_api::{reltag::RelTag, shard::TenantShardId};
-use postgres_ffi::BLCKSZ;
-use std::os::fd::AsRawFd;
-#[cfg(feature = "testing")]
-use std::sync::atomic::AtomicUsize;
-use std::{
-    collections::VecDeque,
-    io::{Read, Write},
-    process::{ChildStdin, ChildStdout, Command, Stdio},
-    sync::{Mutex, MutexGuard},
-    time::Duration,
-};
-use tracing::{debug, error, instrument, Instrument};
-use utils::{lsn::Lsn, nonblock::set_nonblock};
-
-pub struct WalRedoProcess {
-    #[allow(dead_code)]
-    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
-    // Some() on construction, only becomes None on Drop.
-    child: Option<NoLeakChild>,
-    stdout: Mutex<ProcessOutput>,
-    stdin: Mutex<ProcessInput>,
-    /// Counter to separate same sized walredo inputs failing at the same millisecond.
-    #[cfg(feature = "testing")]
-    dump_sequence: AtomicUsize,
-}
-
-struct ProcessInput {
-    stdin: ChildStdin,
-    n_requests: usize,
-}
-
-struct ProcessOutput {
-    stdout: ChildStdout,
-    pending_responses: VecDeque<Option<Bytes>>,
-    n_processed_responses: usize,
-}
-
-impl WalRedoProcess {
-    //
-    // Start postgres binary in special WAL redo mode.
-    //
-    #[instrument(skip_all,fields(pg_version=pg_version))]
-    pub(crate) fn launch(
-        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
-        pg_version: u32,
-    ) -> anyhow::Result<Self> {
-        crate::span::debug_assert_current_span_has_tenant_id();
-
-        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
-        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
-
-        use no_leak_child::NoLeakChildCommandExt;
-        // Start postgres itself
-        let child = Command::new(pg_bin_dir_path.join("postgres"))
-            // the first arg must be --wal-redo so the child process enters into walredo mode
-            .arg("--wal-redo")
-            // the child doesn't process this arg, but, having it in the argv helps indentify the
-            // walredo process for a particular tenant when debugging a pagserver
-            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
-            .stdin(Stdio::piped())
-            .stderr(Stdio::piped())
-            .stdout(Stdio::piped())
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            // NB: The redo process is not trusted after we sent it the first
-            // walredo work. Before that, it is trusted. Specifically, we trust
-            // it to
-            // 1. close all file descriptors except stdin, stdout, stderr because
-            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
-            //    the files it opens, and
-            // 2. to use seccomp to sandbox itself before processing the first
-            //    walredo request.
-            .spawn_no_leak_child(tenant_shard_id)
-            .context("spawn process")?;
-        WAL_REDO_PROCESS_COUNTERS.started.inc();
-        let mut child = scopeguard::guard(child, |child| {
-            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait(WalRedoKillCause::Startup);
-        });
-
-        let stdin = child.stdin.take().unwrap();
-        let stdout = child.stdout.take().unwrap();
-        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
-        macro_rules! set_nonblock_or_log_err {
-        ($file:ident) => {{
-            let res = set_nonblock($file.as_raw_fd());
-            if let Err(e) = &res {
-                error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
-            }
-            res
-        }};
-    }
-        set_nonblock_or_log_err!(stdin)?;
-        set_nonblock_or_log_err!(stdout)?;
-
-        // all fallible operations post-spawn are complete, so get rid of the guard
-        let child = scopeguard::ScopeGuard::into_inner(child);
-
-        tokio::spawn(
-            async move {
-                scopeguard::defer! {
-                    debug!("wal-redo-postgres stderr_logger_task finished");
-                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
-                }
-                debug!("wal-redo-postgres stderr_logger_task started");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-                use tokio::io::AsyncBufReadExt;
-                let mut stderr_lines = tokio::io::BufReader::new(stderr);
-                let mut buf = Vec::new();
-                let res = loop {
-                    buf.clear();
-                    // TODO we don't trust the process to cap its stderr length.
-                    // Currently it can do unbounded Vec allocation.
-                    match stderr_lines.read_until(b'\n', &mut buf).await {
-                        Ok(0) => break Ok(()), // eof
-                        Ok(num_bytes) => {
-                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                            error!(%output, "received output");
-                        }
-                        Err(e) => {
-                            break Err(e);
-                        }
-                    }
-                };
-                match res {
-                    Ok(()) => (),
-                    Err(e) => {
-                        error!(error=?e, "failed to read from walredo stderr");
-                    }
-                }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
-        );
-
-        Ok(Self {
-            conf,
-            tenant_shard_id,
-            child: Some(child),
-            stdin: Mutex::new(ProcessInput {
-                stdin,
-                n_requests: 0,
-            }),
-            stdout: Mutex::new(ProcessOutput {
-                stdout,
-                pending_responses: VecDeque::new(),
-                n_processed_responses: 0,
-            }),
-            #[cfg(feature = "testing")]
-            dump_sequence: AtomicUsize::default(),
-        })
-    }
-
-    pub(crate) fn id(&self) -> u32 {
-        self.child
-            .as_ref()
-            .expect("must not call this during Drop")
-            .id()
-    }
-
-    // Apply given WAL records ('records') over an old page image. Returns
-    // new page image.
-    //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
-    pub(crate) async fn apply_wal_records(
-        &self,
-        rel: RelTag,
-        blknum: u32,
-        base_img: &Option<Bytes>,
-        records: &[(Lsn, NeonWalRecord)],
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let tag = protocol::BufferTag { rel, blknum };
-        let input = self.stdin.lock().unwrap();
-
-        // Serialize all the messages to send the WAL redo process first.
-        //
-        // This could be problematic if there are millions of records to replay,
-        // but in practice the number of records is usually so small that it doesn't
-        // matter, and it's better to keep this code simple.
-        //
-        // Most requests start with a before-image with BLCKSZ bytes, followed by
-        // by some other WAL records. Start with a buffer that can hold that
-        // comfortably.
-        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
-        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
-        if let Some(img) = base_img {
-            protocol::build_push_page_msg(tag, img, &mut writebuf);
-        }
-        for (lsn, rec) in records.iter() {
-            if let NeonWalRecord::Postgres {
-                will_init: _,
-                rec: postgres_rec,
-            } = rec
-            {
-                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
-            } else {
-                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
-            }
-        }
-        protocol::build_get_page_msg(tag, &mut writebuf);
-        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
-
-        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
-
-        if res.is_err() {
-            // not all of these can be caused by this particular input, however these are so rare
-            // in tests so capture all.
-            self.record_and_log(&writebuf);
-        }
-
-        res
-    }
-
-    fn apply_wal_records0(
-        &self,
-        writebuf: &[u8],
-        input: MutexGuard<ProcessInput>,
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
-        let mut nwrite = 0usize;
-
-        while nwrite < writebuf.len() {
-            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
-            let n = loop {
-                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
-                    Err(nix::errno::Errno::EINTR) => continue,
-                    res => break res,
-                }
-            }?;
-
-            if n == 0 {
-                anyhow::bail!("WAL redo timed out");
-            }
-
-            // If 'stdin' is writeable, do write.
-            let in_revents = stdin_pollfds[0].revents().unwrap();
-            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
-                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
-            }
-            if in_revents.contains(PollFlags::POLLHUP) {
-                // We still have more data to write, but the process closed the pipe.
-                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
-            }
-        }
-        let request_no = proc.n_requests;
-        proc.n_requests += 1;
-        drop(proc);
-
-        // To improve walredo performance we separate sending requests and receiving
-        // responses. Them are protected by different mutexes (output and input).
-        // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
-        // then there is not warranty that T1 will first granted output mutex lock.
-        // To address this issue we maintain number of sent requests, number of processed
-        // responses and ring buffer with pending responses. After sending response
-        // (under input mutex), threads remembers request number. Then it releases
-        // input mutex, locks output mutex and fetch in ring buffer all responses until
-        // its stored request number. The it takes correspondent element from
-        // pending responses ring buffer and truncate all empty elements from the front,
-        // advancing processed responses number.
-
-        let mut output = self.stdout.lock().unwrap();
-        let n_processed_responses = output.n_processed_responses;
-        while n_processed_responses + output.pending_responses.len() <= request_no {
-            // We expect the WAL redo process to respond with an 8k page image. We read it
-            // into this buffer.
-            let mut resultbuf = vec![0; BLCKSZ.into()];
-            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
-            while nresult < BLCKSZ.into() {
-                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
-                // We do two things simultaneously: reading response from stdout
-                // and forward any logging information that the child writes to its stderr to the page server's log.
-                let n = loop {
-                    match nix::poll::poll(
-                        &mut stdout_pollfds[..],
-                        wal_redo_timeout.as_millis() as i32,
-                    ) {
-                        Err(nix::errno::Errno::EINTR) => continue,
-                        res => break res,
-                    }
-                }?;
-
-                if n == 0 {
-                    anyhow::bail!("WAL redo timed out");
-                }
-
-                // If we have some data in stdout, read it to the result buffer.
-                let out_revents = stdout_pollfds[0].revents().unwrap();
-                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
-                }
-                if out_revents.contains(PollFlags::POLLHUP) {
-                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
-                }
-            }
-            output
-                .pending_responses
-                .push_back(Some(Bytes::from(resultbuf)));
-        }
-        // Replace our request's response with None in `pending_responses`.
-        // Then make space in the ring buffer by clearing out any seqence of contiguous
-        // `None`'s from the front of `pending_responses`.
-        // NB: We can't pop_front() because other requests' responses because another
-        // requester might have grabbed the output mutex before us:
-        // T1: grab input mutex
-        // T1: send request_no 23
-        // T1: release input mutex
-        // T2: grab input mutex
-        // T2: send request_no 24
-        // T2: release input mutex
-        // T2: grab output mutex
-        // T2: n_processed_responses + output.pending_responses.len() <= request_no
-        //            23                                0                   24
-        // T2: enters poll loop that reads stdout
-        // T2: put response for 23 into pending_responses
-        // T2: put response for 24 into pending_resposnes
-        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
-        // T2: takes its response_24
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: releases output mutex
-        // T1: grabs output mutex
-        // T1: n_processed_responses + output.pending_responses.len() > request_no
-        //            23                                2                   23
-        // T1: skips poll loop that reads stdout
-        // T1: takes its response_23
-        // pending_responses now looks like this: Front None None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Back
-        // n_processed_responses now has value 25
-        let res = output.pending_responses[request_no - n_processed_responses]
-            .take()
-            .expect("we own this request_no, nobody else is supposed to take it");
-        while let Some(front) = output.pending_responses.front() {
-            if front.is_none() {
-                output.pending_responses.pop_front();
-                output.n_processed_responses += 1;
-            } else {
-                break;
-            }
-        }
-        Ok(res)
-    }
-
-    #[cfg(feature = "testing")]
-    fn record_and_log(&self, writebuf: &[u8]) {
-        use std::sync::atomic::Ordering;
-
-        let millis = std::time::SystemTime::now()
-            .duration_since(std::time::SystemTime::UNIX_EPOCH)
-            .unwrap()
-            .as_millis();
-
-        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
-
-        // these files will be collected to an allure report
-        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
-
-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
-
-        let res = std::fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .read(true)
-            .open(path)
-            .and_then(|mut f| f.write_all(writebuf));
-
-        // trip up allowed_errors
-        if let Err(e) = res {
-            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
-        } else {
-            tracing::error!(filename, "erroring walredo input saved");
-        }
-    }
-
-    #[cfg(not(feature = "testing"))]
-    fn record_and_log(&self, _: &[u8]) {}
-}
-
-impl Drop for WalRedoProcess {
-    fn drop(&mut self) {
-        self.child
-            .take()
-            .expect("we only do this once")
-            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
-        // no way to wait for stderr_logger_task from Drop because that is async only
-    }
-}
diff --git a/patches/pgvector.patch b/patches/pgvector.patch
new file mode 100644
index 0000000000..84ac6644c5
--- /dev/null
+++ b/patches/pgvector.patch
@@ -0,0 +1,78 @@
+From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
+From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+Date: Fri, 2 Feb 2024 22:26:45 +0200
+Subject: [PATCH 1/1] Make v0.6.0 work with Neon
+
+Now that the WAL-logging happens as a separate step at the end of the
+build, we need a few neon-specific hints to make it work.
+---
+ src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
+ 1 file changed, 36 insertions(+)
+
+diff --git a/src/hnswbuild.c b/src/hnswbuild.c
+index 680789b..ec54dea 100644
+--- a/src/hnswbuild.c
++++ b/src/hnswbuild.c
+@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
+ 
+ 	hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
+ 
++#ifdef NEON_SMGR
++	smgr_start_unlogged_build(RelationGetSmgr(indexRel));
++#endif
++
+ 	/* Perform inserts */
+ 	HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
+ 
++#ifdef NEON_SMGR
++	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
++#endif
++
+ 	/* Close relations within worker */
+ 	index_close(indexRel, indexLockmode);
+ 	table_close(heapRel, heapLockmode);
+@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+ 	SeedRandom(42);
+ #endif
+ 
++#ifdef NEON_SMGR
++	smgr_start_unlogged_build(RelationGetSmgr(index));
++#endif
++
+ 	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
+ 
+ 	BuildGraph(buildstate, forkNum);
+ 
++#ifdef NEON_SMGR
++	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
++#endif
++
+ 	if (RelationNeedsWAL(index))
++	{
+ 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+ 
++#ifdef NEON_SMGR
++		{
++#if PG_VERSION_NUM >= 160000
++			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
++#else
++			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
++#endif
++
++			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
++										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
++			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
++		}
++#endif
++	}
++
++#ifdef NEON_SMGR
++	smgr_end_unlogged_build(RelationGetSmgr(index));
++#endif
++
+ 	FreeBuildState(buildstate);
+ }
+ 
+-- 
+2.39.2
+
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 7709ab9d42..8951e6607b 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -237,18 +237,50 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum,
 extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum);
 
+/*
+ * LSN values associated with each request to the pageserver
+ */
+typedef struct
+{
+	/*
+	 * 'request_lsn' is the main value that determines which page version to
+	 * fetch.
+	 */
+	XLogRecPtr request_lsn;
+
+	/*
+	 * A hint to the pageserver that the requested page hasn't been modified
+	 * between this LSN and 'request_lsn'. That allows the pageserver to
+	 * return the page faster, without waiting for 'request_lsn' to arrive in
+	 * the pageserver, as long as 'not_modified_since' has arrived.
+	 */
+	XLogRecPtr not_modified_since;
+
+	/*
+	 * 'effective_request_lsn' is not included in the request that's sent to
+	 * the pageserver, but is used to keep track of the latest LSN of when the
+	 * request was made. In a standby server, this is always the same as the
+	 * 'request_lsn', but in the primary we use UINT64_MAX as the
+	 * 'request_lsn' to request the latest page version, so we need this
+	 * separate field to remember that latest LSN was when the request was
+	 * made. It's needed to manage prefetch request, to verify if the response
+	 * to a prefetched request is still valid.
+	 */
+	XLogRecPtr effective_request_lsn;
+} neon_request_lsns;
+
 #if PG_MAJORVERSION_NUM < 16
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  char *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
+										 neon_request_lsns request_lsns, char *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, char *buffer, bool skipFsync);
 #else
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  void *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
+										 neon_request_lsns request_lsns, void *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 #endif
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 44ecdbd9aa..41546eae85 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -45,6 +45,7 @@
  */
 #include "postgres.h"
 
+#include "access/parallel.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xlogdefs.h"
@@ -168,8 +169,7 @@ typedef enum PrefetchStatus
 typedef struct PrefetchRequest
 {
 	BufferTag	buftag;			/* must be first entry in the struct */
-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
+	neon_request_lsns request_lsns;
 	NeonResponse *response;		/* may be null */
 	PrefetchStatus status;
 	shardno_t   shard_no;
@@ -271,16 +271,15 @@ static PrefetchState *MyPState;
 
 static bool compact_prefetch_buffers(void);
 static void consume_prefetch_responses(void);
-static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
+static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns);
 static bool prefetch_read(PrefetchRequest *slot);
-static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
+static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
 static bool prefetch_wait_for(uint64 ring_index);
 static void prefetch_cleanup_trailing_unused(void);
 static inline void prefetch_set_unused(uint64 ring_index);
 
-static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
-								 XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since);
-static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
+static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno);
+static bool neon_prefetch_response_usable(neon_request_lsns request_lsns,
 										  PrefetchRequest *slot);
 
 static bool
@@ -338,8 +337,7 @@ compact_prefetch_buffers(void)
 		target_slot->shard_no = source_slot->shard_no;
 		target_slot->status = source_slot->status;
 		target_slot->response = source_slot->response;
-		target_slot->request_lsn = source_slot->request_lsn;
-		target_slot->not_modified_since = source_slot->not_modified_since;
+		target_slot->request_lsns = source_slot->request_lsns;
 		target_slot->my_ring_index = empty_ring_index;
 
 		prfh_delete(MyPState->prf_hash, source_slot);
@@ -358,8 +356,9 @@ compact_prefetch_buffers(void)
 		};
 		source_slot->response = NULL;
 		source_slot->my_ring_index = 0;
-		source_slot->request_lsn = InvalidXLogRecPtr;
-		source_slot->not_modified_since = InvalidXLogRecPtr;
+		source_slot->request_lsns = (neon_request_lsns) {
+			InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr
+		};
 
 		/* update bookkeeping */
 		n_moved++;
@@ -689,7 +688,7 @@ prefetch_set_unused(uint64 ring_index)
  * prefetch_wait_for().
  */
 static void
-prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since)
+prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
 {
 	bool		found;
 	NeonGetPageRequest request = {
@@ -700,23 +699,14 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
 		.blkno = slot->buftag.blockNum,
 	};
 
-	Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
-
-	if (force_request_lsn)
-	{
-		request.req.lsn = *force_request_lsn;
-		request.req.not_modified_since = *force_not_modified_since;
-	}
+	if (force_request_lsns)
+		slot->request_lsns = *force_request_lsns;
 	else
-	{
-		neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag),
-							 slot->buftag.forkNum,
-							 slot->buftag.blockNum,
-							 &request.req.lsn,
-							 &request.req.not_modified_since);
-	}
-	slot->request_lsn = request.req.lsn;
-	slot->not_modified_since = request.req.not_modified_since;
+		slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
+												   slot->buftag.forkNum,
+												   slot->buftag.blockNum);
+	request.req.lsn = slot->request_lsns.request_lsn;
+	request.req.not_modified_since = slot->request_lsns.not_modified_since;
 
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);
@@ -742,25 +732,22 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
  *
  * Register that we may want the contents of BufferTag in the near future.
  *
- * If force_request_lsn and force_not_modified_since are not NULL, those
- * values are sent to the pageserver. If they are NULL, we utilize the
- * lastWrittenLsn -infrastructure to fill them in.
+ * If force_request_lsns is not NULL, those values are sent to the
+ * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
+ * to calculate the LSNs to send.
  *
  * NOTE: this function may indirectly update MyPState->pfs_hash; which
  * invalidates any active pointers into the hash table.
  */
 
 static uint64
-prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn,
-						 XLogRecPtr *force_not_modified_since)
+prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
 {
 	uint64		ring_index;
 	PrefetchRequest req;
 	PrefetchRequest *slot;
 	PrfHashEntry *entry;
 
-	Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
-
 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
 Retry:
@@ -781,10 +768,9 @@ Retry:
 		 * If the caller specified a request LSN to use, only accept prefetch
 		 * responses that satisfy that request.
 		 */
-		if (force_request_lsn)
+		if (force_request_lsns)
 		{
-			if (!neon_prefetch_response_usable(*force_request_lsn,
-											   *force_not_modified_since, slot))
+			if (!neon_prefetch_response_usable(*force_request_lsns, slot))
 			{
 				/* Wait for the old request to finish and discard it */
 				if (!prefetch_wait_for(ring_index))
@@ -886,7 +872,7 @@ Retry:
 	slot->shard_no = get_shard_number(&tag);
 	slot->my_ring_index = ring_index;
 
-	prefetch_do_request(slot, force_request_lsn, force_not_modified_since);
+	prefetch_do_request(slot, force_request_lsns);
 	Assert(slot->status == PRFS_REQUESTED);
 	Assert(MyPState->ring_last <= ring_index &&
 		   ring_index < MyPState->ring_unused);
@@ -1363,6 +1349,10 @@ PageIsEmptyHeapPage(char *buffer)
 	return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
 }
 
+/*
+ * A page is being evicted from the shared buffer cache. Update the
+ * last-written LSN of the page, and WAL-log it if needed.
+ */
 static void
 #if PG_MAJORVERSION_NUM < 16
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
@@ -1371,12 +1361,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 #endif
 {
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
-
-	if (ShutdownRequestPending)
-		return;
-	/* Don't log any pages if we're not allowed to do so. */
-	if (!XLogInsertAllowed())
-		return;
+	bool		log_page;
 
 	/*
 	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
@@ -1385,9 +1370,21 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 	 * correctness, the non-logged updates are not critical. But we want to
 	 * have a reasonably up-to-date VM and FSM in the page server.
 	 */
-	if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
+	log_page = false;
+	if (force)
+	{
+		Assert(XLogInsertAllowed());
+		log_page = true;
+	}
+	else if (XLogInsertAllowed() &&
+			 !ShutdownRequestPending &&
+			 (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
+	{
+		log_page = true;
+	}
+
+	if (log_page)
 	{
-		/* FSM is never WAL-logged and we don't care. */
 		XLogRecPtr	recptr;
 
 		recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
@@ -1400,7 +1397,8 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
 	}
-	else if (lsn == InvalidXLogRecPtr)
+
+	if (lsn == InvalidXLogRecPtr)
 	{
 		/*
 		 * When PostgreSQL extends a relation, it calls smgrextend() with an
@@ -1436,19 +1434,31 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
 		}
-		else
+		else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
 		{
-			ereport(PANIC,
+			/*
+			 * Its a bad sign if there is a page with zero LSN in the buffer
+			 * cache in a standby, too. However, PANICing seems like a cure
+			 * worse than the disease, as the damage has likely already been
+			 * done in the primary. So in a standby, make this an assertion,
+			 * and in a release build just LOG the error and soldier on. We
+			 * update the last-written LSN of the page with a conservative
+			 * value in that case, which is the last replayed LSN.
+			 */
+			ereport(RecoveryInProgress() ? LOG : PANIC,
 					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
 							blocknum,
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
+			Assert(false);
+
+			lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
 		}
 	}
 	else
 	{
 		ereport(SmgrTrace,
-				(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
+				(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
 						blocknum,
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1529,11 +1539,11 @@ nm_adjust_lsn(XLogRecPtr lsn)
 /*
  * Return LSN for requesting pages and number of blocks from page server
  */
-static void
-neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
-					 XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since)
+static neon_request_lsns
+neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 {
 	XLogRecPtr	last_written_lsn;
+	neon_request_lsns result;
 
 	last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
 	last_written_lsn = nm_adjust_lsn(last_written_lsn);
@@ -1541,13 +1551,98 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 
 	if (RecoveryInProgress())
 	{
-		/* Request the page at the last replayed LSN. */
-		*request_lsn = GetXLogReplayRecPtr(NULL);
-		*not_modified_since = last_written_lsn;
-		Assert(last_written_lsn <= *request_lsn);
+		/*---
+		 * In broad strokes, a replica always requests the page at the current
+		 * replay LSN. But looking closer, what exactly is the replay LSN? Is
+		 * it the last replayed record, or the record being replayed? And does
+		 * the startup process performing the replay need to do something
+		 * differently than backends running queries? Let's take a closer look
+		 * at the different scenarios:
+		 *
+		 * 1. Startup process reads a page, last_written_lsn is old.
+		 *
+		 * Read the old version of the page. We will apply the WAL record on
+		 * it to bring it up-to-date.
+		 *
+		 * We could read the new version, with the changes from this WAL
+		 * record already applied, to offload the work of replaying the record
+		 * to the pageserver. The pageserver might not have received the WAL
+		 * record yet, though, so a read of the old page version and applying
+		 * the record ourselves is likely faster. Also, the redo function
+		 * might be surprised if the changes have already applied. That's
+		 * normal during crash recovery, but not in hot standby.
+		 *
+		 * 2. Startup process reads a page, last_written_lsn == record we're
+		 *    replaying.
+		 *
+		 * Can this happen? There are a few theoretical cases when it might:
+		 *
+		 * A) The redo function reads the same page twice. We had already read
+		 *    and applied the changes once, and now we're reading it for the
+		 *    second time.  That would be a rather silly thing for a redo
+		 *    function to do, and I'm not aware of any that would do it.
+		 *
+		 * B) The redo function modifies multiple pages, and it already
+		 *    applied the changes to one of the pages, released the lock on
+		 *    it, and is now reading a second page.  Furthermore, the first
+		 *    page was already evicted from the buffer cache, and also from
+		 *    the last-written LSN cache, so that the per-relation or global
+		 *    last-written LSN was already updated. All the WAL redo functions
+		 *    hold the locks on pages that they modify, until all the changes
+		 *    have been modified (?), which would make that impossible.
+		 *    However, we skip the locking, if the page isn't currently in the
+		 *    page cache (see neon_redo_read_buffer_filter below).
+		 *
+		 * Even if the one of the above cases were possible in theory, they
+		 * would also require the pages being modified by the redo function to
+		 * be immediately evicted from the page cache.
+		 *
+		 * So this probably does not happen in practice. But if it does, we
+		 * request the new version, including the changes from the record
+		 * being replayed. That seems like the correct behavior in any case.
+		 *
+		 * 3. Backend process reads a page with old last-written LSN
+		 *
+		 * Nothing special here. Read the old version.
+		 *
+		 * 4. Backend process reads a page with last_written_lsn == record being replayed
+		 *
+		 * This can happen, if the redo function has started to run, and saw
+		 * that the page isn't present in the page cache (see
+		 * neon_redo_read_buffer_filter below).  Normally, in a normal
+		 * Postgres server, the redo function would hold a lock on the page,
+		 * so we would get blocked waiting the redo function to release the
+		 * lock. To emulate that, wait for the WAL replay of the record to
+		 * finish.
+		 */
+		/* Request the page at the end of the last fully replayed LSN. */
+		XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
 
-		neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X",
-				 LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since));
+		if (last_written_lsn > replay_lsn)
+		{
+			/* GetCurrentReplayRecPtr was introduced in v15 */
+#if PG_VERSION_NUM >= 150000
+			Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
+#endif
+
+			/*
+			 * Cases 2 and 4. If this is a backend (case 4), the
+			 * neon_read_at_lsn() call later will wait for the WAL record to be
+			 * fully replayed.
+			 */
+			result.request_lsn = last_written_lsn;
+		}
+		else
+		{
+			/* cases 1 and 3 */
+			result.request_lsn = replay_lsn;
+		}
+		result.not_modified_since = last_written_lsn;
+		result.effective_request_lsn = result.request_lsn;
+		Assert(last_written_lsn <= result.request_lsn);
+
+		neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
+				 LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since));
 	}
 	else
 	{
@@ -1559,7 +1654,7 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		 * must still in the buffer cache, so our request cannot concern
 		 * those.
 		 */
-		neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
+		neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
 				 LSN_FORMAT_ARGS(last_written_lsn));
 
 		/*
@@ -1585,16 +1680,33 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		}
 
 		/*
-		 * Request the latest version of the page. The most up-to-date request
-		 * LSN we could use would be the current insert LSN, but to avoid the
-		 * overhead of looking it up, use 'flushlsn' instead. This relies on
-		 * the assumption that if the page was modified since the last WAL
-		 * flush, it should still be in the buffer cache, and we wouldn't be
-		 * requesting it.
+		 * Request the very latest version of the page. In principle we
+		 * want to read the page at the current insert LSN, and we could
+		 * use that value in the request. However, there's a corner case
+		 * with pageserver's garbage collection. If the GC horizon is
+		 * set to a very small value, it's possible that by the time
+		 * that the pageserver processes our request, the GC horizon has
+		 * already moved past the LSN we calculate here. Standby servers
+		 * always have that problem as the can always lag behind the
+		 * primary, but for the primary we can avoid it by always
+		 * requesting the latest page, by setting request LSN to
+		 * UINT64_MAX.
+		 *
+		 * Remember the current LSN, however, so that we can later
+		 * correctly determine if the response to the request is still
+		 * valid. The most up-to-date LSN we could use for that purpose
+		 * would be the current insert LSN, but to avoid the overhead of
+		 * looking it up, use 'flushlsn' instead. This relies on the
+		 * assumption that if the page was modified since the last WAL
+		 * flush, it should still be in the buffer cache, and we
+		 * wouldn't be requesting it.
 		 */
-		*request_lsn = flushlsn;
-		*not_modified_since = last_written_lsn;
+		result.request_lsn = UINT64_MAX;
+		result.not_modified_since = last_written_lsn;
+		result.effective_request_lsn = flushlsn;
 	}
+
+	return result;
 }
 
 /*
@@ -1604,12 +1716,16 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
  * satisfy a page read now.
  */
 static bool
-neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
+neon_prefetch_response_usable(neon_request_lsns request_lsns,
 							  PrefetchRequest *slot)
 {
 	/* sanity check the LSN's on the old and the new request */
-	Assert(request_lsn >= not_modified_since);
-	Assert(slot->request_lsn >= slot->not_modified_since);
+	Assert(request_lsns.request_lsn >= request_lsns.not_modified_since);
+	Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since);
+	Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn);
+	Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn);
 	Assert(slot->status != PRFS_UNUSED);
 
 	/*
@@ -1627,26 +1743,40 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si
 	 * calculate LSNs "out of order" with each other, but the prefetch queue
 	 * is backend-private at the moment.)
 	 */
-	if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since)
+	if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn ||
+		request_lsns.not_modified_since < slot->request_lsns.not_modified_since)
 	{
 		ereport(LOG,
 				(errcode(ERRCODE_IO_ERROR),
 				 errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
 				 errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
-						   LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since),
-						   LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since))));
+						   LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
+						   LSN_FORMAT_ARGS(request_lsns.not_modified_since),
+						   LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn),
+						   LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
 		return false;
 	}
 
 	/*---
-	 * Each request to the pageserver carries two LSN values:
-	 * `not_modified_since` and `request_lsn`. The (not_modified_since,
-	 * request_lsn] range of each request is effectively a claim that the page
-	 * has not been modified between those LSNs.  If the range of the old
-	 * request in the queue overlaps with the new request, we know that the
-	 * page hasn't been modified in the union of the ranges. We can use the
-	 * response to old request to satisfy the new request in that case. For
-	 * example:
+	 * Each request to the pageserver has three LSN values associated with it:
+	 * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'.
+	 * `not_modified_since` and `request_lsn` are sent to the pageserver, but
+	 * in the primary node, we always use UINT64_MAX as the `request_lsn`, so
+	 * we remember `effective_request_lsn` separately. In a primary,
+	 * `effective_request_lsn` is the last flush WAL position when the request
+	 * was sent to the pageserver. That's logically the LSN that we are
+	 * requesting the page at, but we send UINT64_MAX to the pageserver so
+	 * that if the GC horizon advances past that position, we still get a
+	 * valid response instead of an error.
+	 *
+	 * To determine whether a response to a GetPage request issued earlier is
+	 * still valid to satisfy a new page read, we look at the
+	 * (not_modified_since, effective_request_lsn] range of the request. It is
+	 * effectively a claim that the page has not been modified between those
+	 * LSNs.  If the range of the old request in the queue overlaps with the
+	 * new request, we know that the page hasn't been modified in the union of
+	 * the ranges. We can use the response to old request to satisfy the new
+	 * request in that case. For example:
 	 *
 	 *              100      500
 	 * Old request:  +--------+
@@ -1675,9 +1805,9 @@ neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_si
 	 */
 
 	/* this follows from the checks above */
-	Assert(request_lsn >= slot->not_modified_since);
+	Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
 
-	return not_modified_since <= slot->request_lsn;
+	return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn;
 }
 
 /*
@@ -1689,8 +1819,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 	bool		exists;
 	NeonResponse *resp;
 	BlockNumber n_blocks;
-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
+	neon_request_lsns request_lsns;
 
 	switch (reln->smgr_relpersistence)
 	{
@@ -1745,15 +1874,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 		return false;
 	}
 
-	neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO,
-						 &request_lsn, &not_modified_since);
+	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonExistsRequest request = {
 			.req.tag = T_NeonExistsRequest,
-			.req.lsn = request_lsn,
-			.req.not_modified_since = not_modified_since,
+			.req.lsn = request_lsns.request_lsn,
+			.req.not_modified_since = request_lsns.not_modified_since,
 			.rinfo = InfoFromSMgrRel(reln),
-		.forknum = forkNum};
+			.forknum = forkNum
+		};
 
 		resp = page_server_request(&request);
 	}
@@ -1770,7 +1899,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 					 errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forkNum,
-							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+							LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2135,7 +2264,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
 	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
 
-	ring_index = prefetch_register_buffer(tag, NULL, NULL);
+	ring_index = prefetch_register_buffer(tag, NULL);
 
 	Assert(ring_index < MyPState->ring_unused &&
 		   MyPState->ring_last <= ring_index);
@@ -2188,10 +2317,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 void
 #if PG_MAJORVERSION_NUM < 16
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer)
+				 neon_request_lsns request_lsns, char *buffer)
 #else
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer)
+				 neon_request_lsns request_lsns, void *buffer)
 #endif
 {
 	NeonResponse *resp;
@@ -2223,7 +2352,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	 * value of the LwLsn cache when the entry is not found.
 	 */
 	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
-		XLogWaitForReplayOf(request_lsn);
+		XLogWaitForReplayOf(request_lsns.request_lsn);
 
 	/*
 	 * Try to find prefetched page in the list of received pages.
@@ -2234,7 +2363,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	if (entry != NULL)
 	{
 		slot = entry->slot;
-		if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot))
+		if (neon_prefetch_response_usable(request_lsns, slot))
 		{
 			ring_index = slot->my_ring_index;
 			pgBufferUsage.prefetch.hits += 1;
@@ -2268,8 +2397,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		{
 			pgBufferUsage.prefetch.misses += 1;
 
-			ring_index = prefetch_register_buffer(buftag, &request_lsn,
-												  &not_modified_since);
+			ring_index = prefetch_register_buffer(buftag, &request_lsns);
 			slot = GetPrfSlot(ring_index);
 		}
 		else
@@ -2310,7 +2438,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 							slot->shard_no, blkno,
 							RelFileInfoFmt(rinfo),
 							forkNum,
-							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+							LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2333,8 +2461,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer
 neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
 #endif
 {
-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
+	neon_request_lsns request_lsns;
 
 	switch (reln->smgr_relpersistence)
 	{
@@ -2359,9 +2486,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 		return;
 	}
 
-	neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno,
-						 &request_lsn, &not_modified_since);
-	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer);
+	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno);
+	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -2530,8 +2656,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 {
 	NeonResponse *resp;
 	BlockNumber n_blocks;
-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
+	neon_request_lsns request_lsns;
 
 	switch (reln->smgr_relpersistence)
 	{
@@ -2558,13 +2683,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 		return n_blocks;
 	}
 
-	neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO,
-						 &request_lsn, &not_modified_since);
+	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonNblocksRequest request = {
 			.req.tag = T_NeonNblocksRequest,
-			.req.lsn = request_lsn,
-			.req.not_modified_since = not_modified_since,
+			.req.lsn = request_lsns.request_lsn,
+			.req.not_modified_since = request_lsns.not_modified_since,
 			.rinfo = InfoFromSMgrRel(reln),
 			.forknum = forknum,
 		};
@@ -2584,7 +2708,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 					 errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum,
-							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+							LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2595,10 +2719,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
 
 	neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
-		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-		 forknum,
-		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-		 n_blocks);
+			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+			 forknum,
+			 LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
+			 n_blocks);
 
 	pfree(resp);
 	return n_blocks;
@@ -2612,17 +2736,15 @@ neon_dbsize(Oid dbNode)
 {
 	NeonResponse *resp;
 	int64		db_size;
-	XLogRecPtr	request_lsn,
-				not_modified_since;
+	neon_request_lsns request_lsns;
 	NRelFileInfo dummy_node = {0};
 
-	neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO,
-						 &request_lsn, &not_modified_since);
+	request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO);
 	{
 		NeonDbSizeRequest request = {
 			.req.tag = T_NeonDbSizeRequest,
-			.req.lsn = request_lsn,
-			.req.not_modified_since = not_modified_since,
+			.req.lsn = request_lsns.request_lsn,
+			.req.not_modified_since = request_lsns.not_modified_since,
 			.dbNode = dbNode,
 		};
 
@@ -2639,8 +2761,7 @@ neon_dbsize(Oid dbNode)
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
 					 errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
-							dbNode,
-							(uint32) (request_lsn >> 32), (uint32) request_lsn),
+							dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
@@ -2650,9 +2771,7 @@ neon_dbsize(Oid dbNode)
 	}
 
 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
-		 dbNode,
-		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-		 db_size);
+			 dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
 
 	pfree(resp);
 	return db_size;
@@ -2812,10 +2931,14 @@ neon_start_unlogged_build(SMgrRelation reln)
 	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
 
 	/*
+	 * Create the local file. In a parallel build, the leader is expected to
+	 * call this first and do it.
+	 *
 	 * FIXME: should we pass isRedo true to create the tablespace dir if it
 	 * doesn't exist? Is it needed?
 	 */
-	mdcreate(reln, MAIN_FORKNUM, false);
+	if (!IsParallelWorker())
+		mdcreate(reln, MAIN_FORKNUM, false);
 }
 
 /*
@@ -2839,7 +2962,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
 	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
 
-	unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
+	/*
+	 * In a parallel build, (only) the leader process performs the 2nd
+	 * phase.
+	 */
+	if (IsParallelWorker())
+	{
+		unlogged_build_rel = NULL;
+		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+	}
+	else
+		unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
 }
 
 /*
@@ -2897,6 +3030,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	XLogRecPtr request_lsn,
 		not_modified_since;
 
+	/*
+	 * Compute a request LSN to use, similar to neon_get_request_lsns() but the
+	 * logic is a bit simpler.
+	 */
 	if (RecoveryInProgress())
 	{
 		request_lsn = GetXLogReplayRecPtr(NULL);
@@ -2908,10 +3045,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 			 */
 			request_lsn = GetRedoStartLsn();
 		}
+		request_lsn = nm_adjust_lsn(request_lsn);
 	}
 	else
-		request_lsn = GetXLogInsertRecPtr();
-	request_lsn = nm_adjust_lsn(request_lsn);
+		request_lsn = UINT64_MAX;
 
 	/*
 	 * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
@@ -3187,7 +3324,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	BufferTag	tag;
 	uint32		hash;
 	LWLock	   *partitionLock;
-	Buffer		buffer;
+	int			buf_id;
 	bool		no_redo_needed;
 
 	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
@@ -3225,20 +3362,20 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	else
 	{
 		/* Try to find the relevant buffer */
-		buffer = BufTableLookup(&tag, hash);
+		buf_id = BufTableLookup(&tag, hash);
 
-		no_redo_needed = buffer < 0;
+		no_redo_needed = buf_id < 0;
 	}
-	/* In both cases st lwlsn past this WAL record */
-	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
 
 	/*
 	 * we don't have the buffer in memory, update lwLsn past this record, also
 	 * evict page from file cache
 	 */
 	if (no_redo_needed)
+	{
+		SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
 		lfc_evict(rinfo, forknum, blkno);
-
+	}
 
 	LWLockRelease(partitionLock);
 
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index e5ef93b456..316e23a72e 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1852,34 +1852,30 @@ static void
 CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
 {
 	hs->ts = 0;
-	hs->xmin.value = ~0;		/* largest unsigned value */
-	hs->catalog_xmin.value = ~0;	/* largest unsigned value */
+	hs->xmin = InvalidFullTransactionId;
+	hs->catalog_xmin = InvalidFullTransactionId;
 
 	for (int i = 0; i < wp->n_safekeepers; i++)
 	{
-		if (wp->safekeeper[i].appendResponse.hs.ts != 0)
+
+		if (wp->safekeeper[i].state == SS_ACTIVE)
 		{
 			HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs;
 
 			if (FullTransactionIdIsNormal(skhs->xmin)
-				&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
+				&& (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin)))
 			{
 				hs->xmin = skhs->xmin;
 				hs->ts = skhs->ts;
 			}
 			if (FullTransactionIdIsNormal(skhs->catalog_xmin)
-				&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin))
+				&& (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)))
 			{
 				hs->catalog_xmin = skhs->catalog_xmin;
 				hs->ts = skhs->ts;
 			}
 		}
 	}
-
-	if (hs->xmin.value == ~0)
-		hs->xmin = InvalidFullTransactionId;
-	if (hs->catalog_xmin.value == ~0)
-		hs->catalog_xmin = InvalidFullTransactionId;
 }
 
 /*
@@ -1946,14 +1942,28 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
 	}
 
 	CombineHotStanbyFeedbacks(&hsFeedback, wp);
-	if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
+	if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
 	{
+		FullTransactionId xmin = hsFeedback.xmin;
+		FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
+		FullTransactionId next_xid = ReadNextFullTransactionId();
+		/*
+		 * Page server is updating nextXid in checkpoint each 1024 transactions,
+		 * so feedback xmin can be actually larger then nextXid and
+		 * function TransactionIdInRecentPast return false in this case,
+		 * preventing update of slot's xmin.
+		 */
+		if (FullTransactionIdPrecedes(next_xid, xmin))
+			xmin = next_xid;
+		if (FullTransactionIdPrecedes(next_xid, catalog_xmin))
+			catalog_xmin = next_xid;
 		agg_hs_feedback = hsFeedback;
+		elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin));
 		ProcessStandbyHSFeedback(hsFeedback.ts,
-								 XidFromFullTransactionId(hsFeedback.xmin),
-								 EpochFromFullTransactionId(hsFeedback.xmin),
-								 XidFromFullTransactionId(hsFeedback.catalog_xmin),
-								 EpochFromFullTransactionId(hsFeedback.catalog_xmin));
+								 XidFromFullTransactionId(xmin),
+								 EpochFromFullTransactionId(xmin),
+								 XidFromFullTransactionId(catalog_xmin),
+								 EpochFromFullTransactionId(catalog_xmin));
 	}
 
 	CheckGracefulShutdown(wp);
diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c
index 677006923d..47f245fbf1 100644
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush);
  */
 #if PG_MAJORVERSION_NUM < 16
 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-									   XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
+									   neon_request_lsns request_lsns, char *buffer);
 #else
 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-									   XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
+									   neon_request_lsns request_lsns, void *buffer);
 #endif
 
 static neon_read_at_lsn_type neon_read_at_lsn_ptr;
@@ -298,9 +298,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	text	   *relname;
 	text	   *forkname;
 	uint32		blkno;
-
-	XLogRecPtr	request_lsn;
-	XLogRecPtr	not_modified_since;
+	neon_request_lsns	request_lsns;
 
 	if (PG_NARGS() != 5)
 		elog(ERROR, "unexpected number of arguments in SQL function signature");
@@ -312,8 +310,15 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	forkname = PG_GETARG_TEXT_PP(1);
 	blkno = PG_GETARG_UINT32(2);
 
-	request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
-	not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4);
+	request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
+	request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4);
+	/*
+	 * For the time being, use the same LSN for request and
+	 * effective request LSN. If any test needed to use UINT64_MAX
+	 * as the request LSN, we'd need to add effective_request_lsn
+	 * as a new argument.
+	 */
+	request_lsns.effective_request_lsn = request_lsns.request_lsn;
 
 	if (!superuser())
 		ereport(ERROR,
@@ -367,7 +372,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 	raw_page_data = VARDATA(raw_page);
 
-	neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data);
+	neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsns,
+					 raw_page_data);
 
 	relation_close(rel, AccessShareLock);
 
@@ -413,19 +419,25 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
 
 		ForkNumber	forknum = PG_GETARG_UINT32(3);
 		uint32		blkno = PG_GETARG_UINT32(4);
-		XLogRecPtr	request_lsn;
-		XLogRecPtr	not_modified_since;
+		neon_request_lsns	request_lsns;
 
 		/* Initialize buffer to copy to */
 		bytea	   *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
 
-		request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
-		not_modified_since = PG_ARGISNULL(6) ? request_lsn : PG_GETARG_LSN(6);
+		request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
+		request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6);
+		/*
+		 * For the time being, use the same LSN for request
+		 * and effective request LSN. If any test needed to
+		 * use UINT64_MAX as the request LSN, we'd need to add
+		 * effective_request_lsn as a new argument.
+		 */
+		request_lsns.effective_request_lsn = request_lsns.request_lsn;
 
 		SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 		raw_page_data = VARDATA(raw_page);
 
-		neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data);
+		neon_read_at_lsn(rinfo, forknum, blkno, request_lsns, raw_page_data);
 		PG_RETURN_BYTEA_P(raw_page);
 	}
 }
diff --git a/poetry.lock b/poetry.lock
index e437f5de74..25c0c7398d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -158,6 +158,28 @@ files = [
 attrs = ">=16.0.0"
 pluggy = ">=0.4.0"
 
+[[package]]
+name = "annotated-types"
+version = "0.6.0"
+description = "Reusable constraint types to use with typing.Annotated"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"},
+    {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"},
+]
+
+[[package]]
+name = "antlr4-python3-runtime"
+version = "4.13.1"
+description = "ANTLR 4.13.1 runtime for Python 3"
+optional = false
+python-versions = "*"
+files = [
+    {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"},
+    {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"},
+]
+
 [[package]]
 name = "anyio"
 version = "4.3.0"
@@ -267,22 +289,23 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy"
 
 [[package]]
 name = "aws-sam-translator"
-version = "1.48.0"
+version = "1.88.0"
 description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates"
 optional = false
-python-versions = ">=3.7, <=4.0, !=4.0"
+python-versions = "!=4.0,<=4.0,>=3.8"
 files = [
-    {file = "aws-sam-translator-1.48.0.tar.gz", hash = "sha256:7171037323dfa30f8f73e9bccb9210e4c384a585e087219a9518a5204f0a2c44"},
-    {file = "aws_sam_translator-1.48.0-py2-none-any.whl", hash = "sha256:be18dfa3dfe7ab291d281667c5f73ac62dbe6bfe86df7d122e4258b906b736f0"},
-    {file = "aws_sam_translator-1.48.0-py3-none-any.whl", hash = "sha256:ca4f8f9910d7713aeaba59346775bfb3198f6acb47c6704572f9bd3fc0fb5bf0"},
+    {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"},
+    {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"},
 ]
 
 [package.dependencies]
 boto3 = ">=1.19.5,<2.dev0"
-jsonschema = ">=3.2,<4.0"
+jsonschema = ">=3.2,<5"
+pydantic = ">=1.8,<3"
+typing-extensions = ">=4.4"
 
 [package.extras]
-dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"]
+dev = ["black (==24.3.0)", "boto3 (>=1.23,<2)", "boto3-stubs[appconfig,serverlessrepo] (>=1.19.5,<2.dev0)", "coverage (>=5.3,<8)", "dateparser (>=1.1,<2.0)", "mypy (>=1.3.0,<1.4.0)", "parameterized (>=0.7,<1.0)", "pytest (>=6.2,<8)", "pytest-cov (>=2.10,<5)", "pytest-env (>=0.6,<1)", "pytest-rerunfailures (>=9.1,<12)", "pytest-xdist (>=2.5,<4)", "pyyaml (>=6.0,<7.0)", "requests (>=2.28,<3.0)", "ruamel.yaml (==0.17.21)", "ruff (>=0.1.0,<0.2.0)", "tenacity (>=8.0,<9.0)", "types-PyYAML (>=6.0,<7.0)", "types-jsonschema (>=3.2,<4.0)"]
 
 [[package]]
 name = "aws-xray-sdk"
@@ -798,24 +821,26 @@ pycparser = "*"
 
 [[package]]
 name = "cfn-lint"
-version = "0.61.3"
+version = "0.87.1"
 description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved"
 optional = false
-python-versions = ">=3.6, <=4.0, !=4.0"
+python-versions = "!=4.0,<=4.0,>=3.8"
 files = [
-    {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"},
-    {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"},
+    {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"},
+    {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"},
 ]
 
 [package.dependencies]
-aws-sam-translator = ">=1.47.0"
+aws-sam-translator = ">=1.87.0"
 jschema-to-python = ">=1.2.3,<1.3.0"
 jsonpatch = "*"
-jsonschema = ">=3.0,<4.0"
+jsonschema = ">=3.0,<5"
 junit-xml = ">=1.9,<2.0"
-networkx = ">=2.4,<3.0"
+networkx = ">=2.4,<4"
 pyyaml = ">5.4"
+regex = ">=2021.7.1"
 sarif-om = ">=1.0.4,<1.1.0"
+sympy = ">=1.0.0"
 
 [[package]]
 name = "charset-normalizer"
@@ -931,24 +956,6 @@ websocket-client = ">=0.32.0"
 ssh = ["paramiko (>=2.4.2)"]
 tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"]
 
-[[package]]
-name = "ecdsa"
-version = "0.18.0"
-description = "ECDSA cryptographic signature library (pure python)"
-optional = false
-python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
-files = [
-    {file = "ecdsa-0.18.0-py2.py3-none-any.whl", hash = "sha256:80600258e7ed2f16b9aa1d7c295bd70194109ad5a30fdee0eaeefef1d4c559dd"},
-    {file = "ecdsa-0.18.0.tar.gz", hash = "sha256:190348041559e21b22a1d65cee485282ca11a6f81d503fddb84d5017e9ed1e49"},
-]
-
-[package.dependencies]
-six = ">=1.9.0"
-
-[package.extras]
-gmpy = ["gmpy"]
-gmpy2 = ["gmpy2"]
-
 [[package]]
 name = "exceptiongroup"
 version = "1.1.1"
@@ -1268,6 +1275,23 @@ files = [
     {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"},
 ]
 
+[[package]]
+name = "joserfc"
+version = "0.9.0"
+description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"},
+    {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"},
+]
+
+[package.dependencies]
+cryptography = "*"
+
+[package.extras]
+drafts = ["pycryptodome"]
+
 [[package]]
 name = "jschema-to-python"
 version = "1.2.3"
@@ -1309,6 +1333,20 @@ files = [
 [package.dependencies]
 jsonpointer = ">=1.9"
 
+[[package]]
+name = "jsonpath-ng"
+version = "1.6.1"
+description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming."
+optional = false
+python-versions = "*"
+files = [
+    {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"},
+    {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"},
+]
+
+[package.dependencies]
+ply = "*"
+
 [[package]]
 name = "jsonpickle"
 version = "2.2.0"
@@ -1338,24 +1376,39 @@ files = [
 
 [[package]]
 name = "jsonschema"
-version = "3.2.0"
+version = "4.17.3"
 description = "An implementation of JSON Schema validation for Python"
 optional = false
-python-versions = "*"
+python-versions = ">=3.7"
 files = [
-    {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"},
-    {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"},
+    {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"},
+    {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"},
 ]
 
 [package.dependencies]
 attrs = ">=17.4.0"
-pyrsistent = ">=0.14.0"
-setuptools = "*"
-six = ">=1.11.0"
+pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2"
 
 [package.extras]
-format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"]
-format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"]
+format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"]
+format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"]
+
+[[package]]
+name = "jsonschema-spec"
+version = "0.1.6"
+description = "JSONSchema Spec with object-oriented paths"
+optional = false
+python-versions = ">=3.7.0,<4.0.0"
+files = [
+    {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"},
+    {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"},
+]
+
+[package.dependencies]
+jsonschema = ">=4.0.0,<4.18.0"
+pathable = ">=0.4.1,<0.5.0"
+PyYAML = ">=5.1"
+requests = ">=2.31.0,<3.0.0"
 
 [[package]]
 name = "junit-xml"
@@ -1371,6 +1424,52 @@ files = [
 [package.dependencies]
 six = "*"
 
+[[package]]
+name = "lazy-object-proxy"
+version = "1.10.0"
+description = "A fast and thorough lazy object proxy."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab7004cf2e59f7c2e4345604a3e6ea0d92ac44e1c2375527d56492014e690c3"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc0d2fc424e54c70c4bc06787e4072c4f3b1aa2f897dfdc34ce1013cf3ceef05"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e2adb09778797da09d2b5ebdbceebf7dd32e2c96f79da9052b2e87b6ea495895"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1f711e2c6dcd4edd372cf5dec5c5a30d23bba06ee012093267b3376c079ec83"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-win32.whl", hash = "sha256:76a095cfe6045c7d0ca77db9934e8f7b71b14645f0094ffcd842349ada5c5fb9"},
+    {file = "lazy_object_proxy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:b4f87d4ed9064b2628da63830986c3d2dca7501e6018347798313fcf028e2fd4"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fec03caabbc6b59ea4a638bee5fce7117be8e99a4103d9d5ad77f15d6f81020c"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02c83f957782cbbe8136bee26416686a6ae998c7b6191711a04da776dc9e47d4"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:009e6bb1f1935a62889ddc8541514b6a9e1fcf302667dcb049a0be5c8f613e56"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75fc59fc450050b1b3c203c35020bc41bd2695ed692a392924c6ce180c6f1dc9"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:782e2c9b2aab1708ffb07d4bf377d12901d7a1d99e5e410d648d892f8967ab1f"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-win32.whl", hash = "sha256:edb45bb8278574710e68a6b021599a10ce730d156e5b254941754a9cc0b17d03"},
+    {file = "lazy_object_proxy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:e271058822765ad5e3bca7f05f2ace0de58a3f4e62045a8c90a0dfd2f8ad8cc6"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e98c8af98d5707dcdecc9ab0863c0ea6e88545d42ca7c3feffb6b4d1e370c7ba"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:952c81d415b9b80ea261d2372d2a4a2332a3890c2b83e0535f263ddfe43f0d43"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80b39d3a151309efc8cc48675918891b865bdf742a8616a337cb0090791a0de9"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e221060b701e2aa2ea991542900dd13907a5c90fa80e199dbf5a03359019e7a3"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:92f09ff65ecff3108e56526f9e2481b8116c0b9e1425325e13245abfd79bdb1b"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-win32.whl", hash = "sha256:3ad54b9ddbe20ae9f7c1b29e52f123120772b06dbb18ec6be9101369d63a4074"},
+    {file = "lazy_object_proxy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:127a789c75151db6af398b8972178afe6bda7d6f68730c057fbbc2e96b08d282"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4ed0518a14dd26092614412936920ad081a424bdcb54cc13349a8e2c6d106a"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ad9e6ed739285919aa9661a5bbed0aaf410aa60231373c5579c6b4801bd883c"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc0a92c02fa1ca1e84fc60fa258458e5bf89d90a1ddaeb8ed9cc3147f417255"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0aefc7591920bbd360d57ea03c995cebc204b424524a5bd78406f6e1b8b2a5d8"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5faf03a7d8942bb4476e3b62fd0f4cf94eaf4618e304a19865abf89a35c0bbee"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-win32.whl", hash = "sha256:e333e2324307a7b5d86adfa835bb500ee70bfcd1447384a822e96495796b0ca4"},
+    {file = "lazy_object_proxy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:cb73507defd385b7705c599a94474b1d5222a508e502553ef94114a143ec6696"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366c32fe5355ef5fc8a232c5436f4cc66e9d3e8967c01fb2e6302fd6627e3d94"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2297f08f08a2bb0d32a4265e98a006643cd7233fb7983032bd61ac7a02956b3b"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18dd842b49456aaa9a7cf535b04ca4571a302ff72ed8740d06b5adcd41fe0757"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:217138197c170a2a74ca0e05bddcd5f1796c735c37d0eee33e43259b192aa424"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a3a87cf1e133e5b1994144c12ca4aa3d9698517fe1e2ca82977781b16955658"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-win32.whl", hash = "sha256:30b339b2a743c5288405aa79a69e706a06e02958eab31859f7f3c04980853b70"},
+    {file = "lazy_object_proxy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:a899b10e17743683b293a729d3a11f2f399e8a90c73b089e29f5d0fe3509f0dd"},
+    {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
+]
+
 [[package]]
 name = "markupsafe"
 version = "2.1.1"
@@ -1422,64 +1521,80 @@ files = [
 
 [[package]]
 name = "moto"
-version = "4.1.2"
+version = "5.0.6"
 description = ""
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"},
-    {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"},
+    {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"},
+    {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"},
 ]
 
 [package.dependencies]
+antlr4-python3-runtime = {version = "*", optional = true, markers = "extra == \"server\""}
 aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""}
 boto3 = ">=1.9.201"
-botocore = ">=1.12.201"
+botocore = ">=1.14.0"
 cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""}
 cryptography = ">=3.3.1"
-docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""}
-ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""}
+docker = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""}
 flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""}
 flask-cors = {version = "*", optional = true, markers = "extra == \"server\""}
 graphql-core = {version = "*", optional = true, markers = "extra == \"server\""}
 Jinja2 = ">=2.10.1"
+joserfc = {version = ">=0.9.0", optional = true, markers = "extra == \"server\""}
 jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""}
-openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""}
+jsonpath-ng = {version = "*", optional = true, markers = "extra == \"server\""}
+openapi-spec-validator = {version = ">=0.5.0", optional = true, markers = "extra == \"server\""}
+py-partiql-parser = {version = "0.5.4", optional = true, markers = "extra == \"server\""}
 pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""}
 python-dateutil = ">=2.1,<3.0.0"
-python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""}
 PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""}
 requests = ">=2.5"
-responses = ">=0.13.0"
+responses = ">=0.15.0"
 setuptools = {version = "*", optional = true, markers = "extra == \"server\""}
-sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""}
 werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1"
 xmltodict = "*"
 
 [package.extras]
-all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
-apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
-apigatewayv2 = ["PyYAML (>=5.1)"]
+all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"]
+apigateway = ["PyYAML (>=5.1)", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)"]
+apigatewayv2 = ["PyYAML (>=5.1)", "openapi-spec-validator (>=0.5.0)"]
 appsync = ["graphql-core"]
-awslambda = ["docker (>=2.5.1)"]
-batch = ["docker (>=2.5.1)"]
-cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
-cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
-ds = ["sshpubkeys (>=3.1.0)"]
-dynamodb = ["docker (>=2.5.1)"]
-dynamodbstreams = ["docker (>=2.5.1)"]
-ebs = ["sshpubkeys (>=3.1.0)"]
-ec2 = ["sshpubkeys (>=3.1.0)"]
-efs = ["sshpubkeys (>=3.1.0)"]
-eks = ["sshpubkeys (>=3.1.0)"]
+awslambda = ["docker (>=3.0.0)"]
+batch = ["docker (>=3.0.0)"]
+cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"]
+cognitoidp = ["joserfc (>=0.9.0)"]
+dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"]
+dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"]
 glue = ["pyparsing (>=3.0.7)"]
 iotdata = ["jsondiff (>=1.1.2)"]
-route53resolver = ["sshpubkeys (>=3.1.0)"]
-s3 = ["PyYAML (>=5.1)"]
-server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
+proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"]
+resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)"]
+s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.4)"]
+s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.4)"]
+server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"]
 ssm = ["PyYAML (>=5.1)"]
+stepfunctions = ["antlr4-python3-runtime", "jsonpath-ng"]
 xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"]
 
+[[package]]
+name = "mpmath"
+version = "1.3.0"
+description = "Python library for arbitrary-precision floating-point arithmetic"
+optional = false
+python-versions = "*"
+files = [
+    {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"},
+    {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"},
+]
+
+[package.extras]
+develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"]
+docs = ["sphinx"]
+gmpy = ["gmpy2 (>=2.1.0a4)"]
+tests = ["pytest (>=4.6)"]
+
 [[package]]
 name = "multidict"
 version = "6.0.4"
@@ -1654,42 +1769,38 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"]
 
 [[package]]
 name = "openapi-schema-validator"
-version = "0.2.3"
+version = "0.4.4"
 description = "OpenAPI schema validation for Python"
 optional = false
 python-versions = ">=3.7.0,<4.0.0"
 files = [
-    {file = "openapi-schema-validator-0.2.3.tar.gz", hash = "sha256:2c64907728c3ef78e23711c8840a423f0b241588c9ed929855e4b2d1bb0cf5f2"},
-    {file = "openapi_schema_validator-0.2.3-py3-none-any.whl", hash = "sha256:9bae709212a19222892cabcc60cafd903cbf4b220223f48583afa3c0e3cc6fc4"},
+    {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"},
+    {file = "openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"},
 ]
 
 [package.dependencies]
-jsonschema = ">=3.0.0,<5.0.0"
+jsonschema = ">=4.0.0,<4.18.0"
+rfc3339-validator = "*"
 
 [package.extras]
-isodate = ["isodate"]
-rfc3339-validator = ["rfc3339-validator"]
-strict-rfc3339 = ["strict-rfc3339"]
+docs = ["sphinx (>=5.3.0,<6.0.0)", "sphinx-immaterial (>=0.11.0,<0.12.0)"]
 
 [[package]]
 name = "openapi-spec-validator"
-version = "0.4.0"
-description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator"
+version = "0.5.7"
+description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator"
 optional = false
 python-versions = ">=3.7.0,<4.0.0"
 files = [
-    {file = "openapi-spec-validator-0.4.0.tar.gz", hash = "sha256:97f258850afc97b048f7c2653855e0f88fa66ac103c2be5077c7960aca2ad49a"},
-    {file = "openapi_spec_validator-0.4.0-py3-none-any.whl", hash = "sha256:06900ac4d546a1df3642a779da0055be58869c598e3042a2fef067cfd99d04d0"},
+    {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"},
+    {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"},
 ]
 
 [package.dependencies]
-jsonschema = ">=3.2.0,<5.0.0"
-openapi-schema-validator = ">=0.2.0,<0.3.0"
-PyYAML = ">=5.1"
-setuptools = "*"
-
-[package.extras]
-requests = ["requests"]
+jsonschema = ">=4.0.0,<4.18.0"
+jsonschema-spec = ">=0.1.1,<0.2.0"
+lazy-object-proxy = ">=1.7.1,<2.0.0"
+openapi-schema-validator = ">=0.4.2,<0.5.0"
 
 [[package]]
 name = "packaging"
@@ -1702,6 +1813,17 @@ files = [
     {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"},
 ]
 
+[[package]]
+name = "pathable"
+version = "0.4.3"
+description = "Object-oriented paths"
+optional = false
+python-versions = ">=3.7.0,<4.0.0"
+files = [
+    {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"},
+    {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"},
+]
+
 [[package]]
 name = "pbr"
 version = "5.9.0"
@@ -1728,6 +1850,17 @@ files = [
 dev = ["pre-commit", "tox"]
 testing = ["pytest", "pytest-benchmark"]
 
+[[package]]
+name = "ply"
+version = "3.11"
+description = "Python Lex & Yacc"
+optional = false
+python-versions = "*"
+files = [
+    {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"},
+    {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"},
+]
+
 [[package]]
 name = "prometheus-client"
 version = "0.14.1"
@@ -1840,16 +1973,19 @@ files = [
 ]
 
 [[package]]
-name = "pyasn1"
-version = "0.4.8"
-description = "ASN.1 types and codecs"
+name = "py-partiql-parser"
+version = "0.5.4"
+description = "Pure Python PartiQL Parser"
 optional = false
 python-versions = "*"
 files = [
-    {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
-    {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
+    {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"},
+    {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"},
 ]
 
+[package.extras]
+dev = ["black (==22.6.0)", "flake8", "mypy", "pytest"]
+
 [[package]]
 name = "pycparser"
 version = "2.21"
@@ -1861,6 +1997,116 @@ files = [
     {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
 ]
 
+[[package]]
+name = "pydantic"
+version = "2.7.1"
+description = "Data validation using Python type hints"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"},
+    {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"},
+]
+
+[package.dependencies]
+annotated-types = ">=0.4.0"
+pydantic-core = "2.18.2"
+typing-extensions = ">=4.6.1"
+
+[package.extras]
+email = ["email-validator (>=2.0.0)"]
+
+[[package]]
+name = "pydantic-core"
+version = "2.18.2"
+description = "Core functionality for Pydantic validation and serialization"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"},
+    {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"},
+    {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"},
+    {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"},
+    {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"},
+    {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"},
+    {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"},
+    {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"},
+    {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"},
+    {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"},
+    {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"},
+    {file = "pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = "sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"},
+    {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"},
+    {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"},
+    {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"},
+    {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"},
+    {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"},
+    {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"},
+    {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"},
+    {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"},
+    {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"},
+]
+
+[package.dependencies]
+typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
+
 [[package]]
 name = "pyjwt"
 version = "2.4.0"
@@ -2115,28 +2361,6 @@ files = [
 [package.dependencies]
 six = ">=1.5"
 
-[[package]]
-name = "python-jose"
-version = "3.3.0"
-description = "JOSE implementation in Python"
-optional = false
-python-versions = "*"
-files = [
-    {file = "python-jose-3.3.0.tar.gz", hash = "sha256:55779b5e6ad599c6336191246e95eb2293a9ddebd555f796a65f838f07e5d78a"},
-    {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"},
-]
-
-[package.dependencies]
-cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"cryptography\""}
-ecdsa = "!=0.15"
-pyasn1 = "*"
-rsa = "*"
-
-[package.extras]
-cryptography = ["cryptography (>=3.4.0)"]
-pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"]
-pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"]
-
 [[package]]
 name = "pywin32"
 version = "301"
@@ -2216,15 +2440,103 @@ files = [
     {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
 ]
 
+[[package]]
+name = "regex"
+version = "2024.4.28"
+description = "Alternative regular expression module, to replace re."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"},
+    {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"},
+    {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"},
+    {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"},
+    {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"},
+    {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"},
+    {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"},
+    {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"},
+    {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"},
+    {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"},
+    {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"},
+    {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"},
+    {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"},
+    {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"},
+    {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"},
+    {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"},
+    {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"},
+    {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"},
+    {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"},
+    {file = "regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"},
+    {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"},
+    {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"},
+    {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"},
+    {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"},
+    {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"},
+    {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"},
+    {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"},
+    {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"},
+    {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"},
+    {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"},
+    {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"},
+    {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"},
+    {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"},
+    {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"},
+    {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"},
+    {file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"},
+]
+
 [[package]]
 name = "requests"
-version = "2.31.0"
+version = "2.32.0"
 description = "Python HTTP for Humans."
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
-    {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
+    {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"},
+    {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"},
 ]
 
 [package.dependencies]
@@ -2256,18 +2568,18 @@ urllib3 = ">=1.25.10"
 tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"]
 
 [[package]]
-name = "rsa"
-version = "4.9"
-description = "Pure-Python RSA implementation"
+name = "rfc3339-validator"
+version = "0.1.4"
+description = "A pure python RFC3339 validator"
 optional = false
-python-versions = ">=3.6,<4"
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 files = [
-    {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"},
-    {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"},
+    {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"},
+    {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"},
 ]
 
 [package.dependencies]
-pyasn1 = ">=0.1.3"
+six = "*"
 
 [[package]]
 name = "ruff"
@@ -2366,22 +2678,18 @@ files = [
 ]
 
 [[package]]
-name = "sshpubkeys"
-version = "3.3.1"
-description = "SSH public key parser"
+name = "sympy"
+version = "1.12"
+description = "Computer algebra system (CAS) in Python"
 optional = false
-python-versions = ">=3"
+python-versions = ">=3.8"
 files = [
-    {file = "sshpubkeys-3.3.1-py2.py3-none-any.whl", hash = "sha256:946f76b8fe86704b0e7c56a00d80294e39bc2305999844f079a217885060b1ac"},
-    {file = "sshpubkeys-3.3.1.tar.gz", hash = "sha256:3020ed4f8c846849299370fbe98ff4157b0ccc1accec105e07cfa9ae4bb55064"},
+    {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"},
+    {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"},
 ]
 
 [package.dependencies]
-cryptography = ">=2.1.4"
-ecdsa = ">=0.13"
-
-[package.extras]
-dev = ["twine", "wheel", "yapf"]
+mpmath = ">=0.19"
 
 [[package]]
 name = "toml"
@@ -2899,4 +3207,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "496d6d9f722983bda4d1265370bc8ba75560da74ab5d6b68c94a03290815e1eb"
+content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 3002006aed..7da0763bc1 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -9,6 +9,7 @@ default = []
 testing = []
 
 [dependencies]
+ahash.workspace = true
 anyhow.workspace = true
 async-compression.workspace = true
 async-trait.workspace = true
@@ -24,8 +25,10 @@ camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
+crossbeam-deque.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
+framed-websockets.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hashbrown.workspace = true
@@ -35,7 +38,6 @@ hmac.workspace = true
 hostname.workspace = true
 http.workspace = true
 humantime.workspace = true
-hyper-tungstenite.workspace = true
 hyper.workspace = true
 hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
 hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
@@ -52,7 +54,6 @@ opentelemetry.workspace = true
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
-pbkdf2 = { workspace = true, features = ["simple", "std"] }
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
 pq_proto.workspace = true
@@ -76,7 +77,6 @@ smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
 subtle.workspace = true
-sync_wrapper.workspace = true
 task-local-extensions.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
@@ -106,6 +106,8 @@ workspace_hack.workspace = true
 [dev-dependencies]
 camino-tempfile.workspace = true
 fallible-iterator.workspace = true
+tokio-tungstenite.workspace = true
+pbkdf2 = { workspace = true, features = ["simple", "std"] }
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 3795e3b608..3555eba543 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -13,7 +13,7 @@ use tokio_postgres::config::AuthKeys;
 use tracing::{info, warn};
 
 use crate::auth::credentials::check_peer_addr_is_in_list;
-use crate::auth::validate_password_and_exchange;
+use crate::auth::{validate_password_and_exchange, AuthError};
 use crate::cache::Cached;
 use crate::console::errors::GetAuthInfoError;
 use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
@@ -23,7 +23,7 @@ use crate::intern::EndpointIdInt;
 use crate::metrics::Metrics;
 use crate::proxy::connect_compute::ComputeConnectBackend;
 use crate::proxy::NeonOptions;
-use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo};
+use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo};
 use crate::stream::Stream;
 use crate::{
     auth::{self, ComputeUserInfoMaybeEndpoint},
@@ -280,6 +280,7 @@ async fn auth_quirks(
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     allow_cleartext: bool,
     config: &'static AuthenticationConfig,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> auth::Result<ComputeCredentials> {
     // If there's no project so far, that entails that client doesn't
     // support SNI or other means of passing the endpoint (project) name.
@@ -305,6 +306,10 @@ async fn auth_quirks(
     if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
         return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
     }
+
+    if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) {
+        return Err(AuthError::too_many_connections());
+    }
     let cached_secret = match maybe_secret {
         Some(secret) => secret,
         None => api.get_role_secret(ctx, &info).await?,
@@ -360,7 +365,10 @@ async fn authenticate_with_secret(
     config: &'static AuthenticationConfig,
 ) -> auth::Result<ComputeCredentials> {
     if let Some(password) = unauthenticated_password {
-        let auth_outcome = validate_password_and_exchange(&password, secret).await?;
+        let ep = EndpointIdInt::from(&info.endpoint);
+
+        let auth_outcome =
+            validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
         let keys = match auth_outcome {
             crate::sasl::Outcome::Success(key) => key,
             crate::sasl::Outcome::Failure(reason) => {
@@ -381,7 +389,7 @@ async fn authenticate_with_secret(
     // Currently, we use it for websocket connections (latency).
     if allow_cleartext {
         ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
-        return hacks::authenticate_cleartext(ctx, info, client, secret).await;
+        return hacks::authenticate_cleartext(ctx, info, client, secret, config).await;
     }
 
     // Finally, proceed with the main auth flow (SCRAM-based).
@@ -417,6 +425,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
         client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
         allow_cleartext: bool,
         config: &'static AuthenticationConfig,
+        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     ) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
         use BackendType::*;
 
@@ -428,8 +437,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
                     "performing authentication using the console"
                 );
 
-                let credentials =
-                    auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
+                let credentials = auth_quirks(
+                    ctx,
+                    &*api,
+                    user_info,
+                    client,
+                    allow_cleartext,
+                    config,
+                    endpoint_rate_limiter,
+                )
+                .await?;
                 BackendType::Console(api, credentials)
             }
             // NOTE: this auth backend doesn't use client credentials.
@@ -539,8 +556,8 @@ mod tests {
         },
         context::RequestMonitoring,
         proxy::NeonOptions,
-        rate_limiter::RateBucketInfo,
-        scram::ServerSecret,
+        rate_limiter::{EndpointRateLimiter, RateBucketInfo},
+        scram::{threadpool::ThreadPool, ServerSecret},
         stream::{PqStream, Stream},
     };
 
@@ -582,6 +599,7 @@ mod tests {
     }
 
     static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
+        thread_pool: ThreadPool::new(1),
         scram_protocol_timeout: std::time::Duration::from_secs(5),
         rate_limiter_enabled: true,
         rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
@@ -699,10 +717,20 @@ mod tests {
                 _ => panic!("wrong message"),
             }
         });
+        let endpoint_rate_limiter =
+            Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
 
-        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, false, &CONFIG)
-            .await
-            .unwrap();
+        let _creds = auth_quirks(
+            &mut ctx,
+            &api,
+            user_info,
+            &mut stream,
+            false,
+            &CONFIG,
+            endpoint_rate_limiter,
+        )
+        .await
+        .unwrap();
 
         handle.await.unwrap();
     }
@@ -739,10 +767,20 @@ mod tests {
             frontend::password_message(b"my-secret-password", &mut write).unwrap();
             client.write_all(&write).await.unwrap();
         });
+        let endpoint_rate_limiter =
+            Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
 
-        let _creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
-            .await
-            .unwrap();
+        let _creds = auth_quirks(
+            &mut ctx,
+            &api,
+            user_info,
+            &mut stream,
+            true,
+            &CONFIG,
+            endpoint_rate_limiter,
+        )
+        .await
+        .unwrap();
 
         handle.await.unwrap();
     }
@@ -780,9 +818,20 @@ mod tests {
             client.write_all(&write).await.unwrap();
         });
 
-        let creds = auth_quirks(&mut ctx, &api, user_info, &mut stream, true, &CONFIG)
-            .await
-            .unwrap();
+        let endpoint_rate_limiter =
+            Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET));
+
+        let creds = auth_quirks(
+            &mut ctx,
+            &api,
+            user_info,
+            &mut stream,
+            true,
+            &CONFIG,
+            endpoint_rate_limiter,
+        )
+        .await
+        .unwrap();
 
         assert_eq!(creds.info.endpoint, "my-endpoint");
 
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index f7241be4a9..6b0f5e1726 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -3,8 +3,10 @@ use super::{
 };
 use crate::{
     auth::{self, AuthFlow},
+    config::AuthenticationConfig,
     console::AuthSecret,
     context::RequestMonitoring,
+    intern::EndpointIdInt,
     sasl,
     stream::{self, Stream},
 };
@@ -20,6 +22,7 @@ pub async fn authenticate_cleartext(
     info: ComputeUserInfo,
     client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
     secret: AuthSecret,
+    config: &'static AuthenticationConfig,
 ) -> auth::Result<ComputeCredentials> {
     warn!("cleartext auth flow override is enabled, proceeding");
     ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -27,8 +30,14 @@ pub async fn authenticate_cleartext(
     // pause the timer while we communicate with the client
     let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
 
+    let ep = EndpointIdInt::from(&info.endpoint);
+
     let auth_flow = AuthFlow::new(client)
-        .begin(auth::CleartextPassword(secret))
+        .begin(auth::CleartextPassword {
+            secret,
+            endpoint: ep,
+            pool: config.thread_pool.clone(),
+        })
         .await?;
     drop(paused);
     // cleartext auth is only allowed to the ws/http protocol.
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 45bbad8cb2..59d1ac17f4 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -5,12 +5,14 @@ use crate::{
     config::TlsServerEndPoint,
     console::AuthSecret,
     context::RequestMonitoring,
-    sasl, scram,
+    intern::EndpointIdInt,
+    sasl,
+    scram::{self, threadpool::ThreadPool},
     stream::{PqStream, Stream},
 };
 use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
 use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
-use std::io;
+use std::{io, sync::Arc};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;
 
@@ -53,7 +55,11 @@ impl AuthMethod for PasswordHack {
 
 /// Use clear-text password auth called `password` in docs
 /// <https://www.postgresql.org/docs/current/auth-password.html>
-pub struct CleartextPassword(pub AuthSecret);
+pub struct CleartextPassword {
+    pub pool: Arc<ThreadPool>,
+    pub endpoint: EndpointIdInt,
+    pub secret: AuthSecret,
+}
 
 impl AuthMethod for CleartextPassword {
     #[inline(always)]
@@ -126,7 +132,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
             .strip_suffix(&[0])
             .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
 
-        let outcome = validate_password_and_exchange(password, self.state.0).await?;
+        let outcome = validate_password_and_exchange(
+            &self.state.pool,
+            self.state.endpoint,
+            password,
+            self.state.secret,
+        )
+        .await?;
 
         if let sasl::Outcome::Success(_) = &outcome {
             self.stream.write_message_noflush(&Be::AuthenticationOk)?;
@@ -181,6 +193,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
 }
 
 pub(crate) async fn validate_password_and_exchange(
+    pool: &ThreadPool,
+    endpoint: EndpointIdInt,
     password: &[u8],
     secret: AuthSecret,
 ) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
@@ -194,7 +208,7 @@ pub(crate) async fn validate_password_and_exchange(
         }
         // perform scram authentication as both client and server to validate the keys
         AuthSecret::Scram(scram_secret) => {
-            let outcome = crate::scram::exchange(&scram_secret, password).await?;
+            let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
 
             let client_key = match outcome {
                 sasl::Outcome::Success(client_key) => client_key,
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index d345781814..5be0653a09 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -27,6 +27,7 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient;
 use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use proxy::redis::elasticache;
 use proxy::redis::notifications;
+use proxy::scram::threadpool::ThreadPool;
 use proxy::serverless::cancel_set::CancelSet;
 use proxy::serverless::GlobalConnPoolOptions;
 use proxy::usage_metrics;
@@ -132,6 +133,9 @@ struct ProxyCliArgs {
     /// timeout for scram authentication protocol
     #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
     scram_protocol_timeout: tokio::time::Duration,
+    /// size of the threadpool for password hashing
+    #[clap(long, default_value_t = 4)]
+    scram_thread_pool_size: u8,
     /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     require_client_ip: bool,
@@ -144,6 +148,9 @@ struct ProxyCliArgs {
     /// Can be given multiple times for different bucket sizes.
     #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
     endpoint_rps_limit: Vec<RateBucketInfo>,
+    /// Wake compute rate limiter max number of requests per second.
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
+    wake_compute_limit: Vec<RateBucketInfo>,
     /// Whether the auth rate limiter actually takes effect (for testing)
     #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
     auth_rate_limit_enabled: bool,
@@ -154,7 +161,7 @@ struct ProxyCliArgs {
     #[clap(long, default_value_t = 64)]
     auth_rate_limit_ip_subnet: u8,
     /// Redis rate limiter max number of requests per second.
-    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
+    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
     redis_rps_limit: Vec<RateBucketInfo>,
     /// cache for `allowed_ips` (use `size=0` to disable)
     #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
@@ -365,6 +372,10 @@ async fn main() -> anyhow::Result<()> {
         proxy::metrics::CancellationSource::FromClient,
     ));
 
+    let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
+    RateBucketInfo::validate(&mut endpoint_rps_limit)?;
+    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
+
     // client facing tasks. these will exit on error or on cancellation
     // cancellation returns Ok(())
     let mut client_tasks = JoinSet::new();
@@ -373,6 +384,7 @@ async fn main() -> anyhow::Result<()> {
         proxy_listener,
         cancellation_token.clone(),
         cancellation_handler.clone(),
+        endpoint_rate_limiter.clone(),
     ));
 
     // TODO: rename the argument to something like serverless.
@@ -387,6 +399,7 @@ async fn main() -> anyhow::Result<()> {
             serverless_listener,
             cancellation_token.clone(),
             cancellation_handler.clone(),
+            endpoint_rate_limiter.clone(),
         ));
     }
 
@@ -480,6 +493,9 @@ async fn main() -> anyhow::Result<()> {
 
 /// ProxyConfig is created at proxy startup, and lives forever.
 fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
+    let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
+    Metrics::install(thread_pool.metrics.clone());
+
     let tls_config = match (&args.tls_key, &args.tls_cert) {
         (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
             key_path,
@@ -559,11 +575,16 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
             let url = args.auth_endpoint.parse()?;
             let endpoint = http::Endpoint::new(url, http::new_client());
 
-            let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
-            RateBucketInfo::validate(&mut endpoint_rps_limit)?;
-            let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
-            let api =
-                console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter);
+            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
+            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
+            let wake_compute_endpoint_rate_limiter =
+                Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit));
+            let api = console::provider::neon::Api::new(
+                endpoint,
+                caches,
+                locks,
+                wake_compute_endpoint_rate_limiter,
+            );
             let api = console::provider::ConsoleBackend::Console(api);
             auth::BackendType::Console(MaybeOwned::Owned(api), ())
         }
@@ -610,6 +631,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
         client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
     };
     let authentication_config = AuthenticationConfig {
+        thread_pool,
         scram_protocol_timeout: args.scram_protocol_timeout,
         rate_limiter_enabled: args.auth_rate_limit_enabled,
         rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index b7ab2c00f9..5a0c251ce2 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -2,6 +2,7 @@ use crate::{
     auth::{self, backend::AuthRateLimiter},
     console::locks::ApiLocks,
     rate_limiter::RateBucketInfo,
+    scram::threadpool::ThreadPool,
     serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
     Host,
 };
@@ -61,6 +62,7 @@ pub struct HttpConfig {
 }
 
 pub struct AuthenticationConfig {
+    pub thread_pool: Arc<ThreadPool>,
     pub scram_protocol_timeout: tokio::time::Duration,
     pub rate_limiter_enabled: bool,
     pub rate_limiter: AuthRateLimiter,
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index ec66641d01..7728d2cafa 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -26,7 +26,7 @@ pub struct Api {
     endpoint: http::Endpoint,
     pub caches: &'static ApiCaches,
     pub locks: &'static ApiLocks<EndpointCacheKey>,
-    pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+    pub wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     jwt: String,
 }
 
@@ -36,7 +36,7 @@ impl Api {
         endpoint: http::Endpoint,
         caches: &'static ApiCaches,
         locks: &'static ApiLocks<EndpointCacheKey>,
-        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+        wake_compute_endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     ) -> Self {
         let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
             Ok(v) => v,
@@ -46,7 +46,7 @@ impl Api {
             endpoint,
             caches,
             locks,
-            endpoint_rate_limiter,
+            wake_compute_endpoint_rate_limiter,
             jwt,
         }
     }
@@ -283,7 +283,7 @@ impl super::Api for Api {
 
         // check rate limit
         if !self
-            .endpoint_rate_limiter
+            .wake_compute_endpoint_rate_limiter
             .check(user_info.endpoint.normalize().into(), 1)
         {
             return Err(WakeComputeError::TooManyConnections);
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index 8104fe6087..a213a32ca4 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -307,7 +307,7 @@ where
 }
 
 async fn upload_parquet(
-    w: SerializedFileWriter<Writer<BytesMut>>,
+    mut w: SerializedFileWriter<Writer<BytesMut>>,
     len: i64,
     storage: &GenericRemoteStorage,
 ) -> anyhow::Result<Writer<BytesMut>> {
@@ -319,11 +319,15 @@ async fn upload_parquet(
 
     // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry.
     // finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253
-    let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish())
+    let (mut buffer, metadata) =
+        tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> {
+            let metadata = w.finish()?;
+            let buffer = std::mem::take(w.inner_mut().get_mut());
+            Ok((buffer, metadata))
+        })
         .await
         .unwrap()?;
 
-    let mut buffer = writer.into_inner();
     let data = buffer.split().freeze();
 
     let compression = len as f64 / len_uncompressed as f64;
@@ -351,7 +355,7 @@ async fn upload_parquet(
         "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
     ))?;
     let cancel = CancellationToken::new();
-    backoff::retry(
+    let maybe_err = backoff::retry(
         || async {
             let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
             storage
@@ -368,7 +372,12 @@ async fn upload_parquet(
     .await
     .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
     .and_then(|x| x)
-    .context("request_data_upload")?;
+    .context("request_data_upload")
+    .err();
+
+    if let Some(err) = maybe_err {
+        tracing::warn!(%id, %err, "failed to upload request data");
+    }
 
     Ok(buffer.writer())
 }
@@ -474,10 +483,11 @@ mod tests {
         RequestData {
             session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(),
             peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(),
-            timestamp: chrono::NaiveDateTime::from_timestamp_millis(
+            timestamp: chrono::DateTime::from_timestamp_millis(
                 rng.gen_range(1703862754..1803862754),
             )
-            .unwrap(),
+            .unwrap()
+            .naive_utc(),
             application_name: Some("test".to_owned()),
             username: Some(hex::encode(rng.gen::<[u8; 4]>())),
             endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())),
@@ -560,15 +570,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1315008, 3, 6000),
-                (1315001, 3, 6000),
-                (1315061, 3, 6000),
-                (1315018, 3, 6000),
-                (1315148, 3, 6000),
-                (1314990, 3, 6000),
-                (1314782, 3, 6000),
-                (1315018, 3, 6000),
-                (438575, 1, 2000)
+                (1315314, 3, 6000),
+                (1315307, 3, 6000),
+                (1315367, 3, 6000),
+                (1315324, 3, 6000),
+                (1315454, 3, 6000),
+                (1315296, 3, 6000),
+                (1315088, 3, 6000),
+                (1315324, 3, 6000),
+                (438713, 1, 2000)
             ]
         );
 
@@ -598,11 +608,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1221738, 5, 10000),
-                (1227888, 5, 10000),
-                (1229682, 5, 10000),
-                (1229044, 5, 10000),
-                (1220322, 5, 10000)
+                (1222212, 5, 10000),
+                (1228362, 5, 10000),
+                (1230156, 5, 10000),
+                (1229518, 5, 10000),
+                (1220796, 5, 10000)
             ]
         );
 
@@ -634,11 +644,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1207385, 5, 10000),
-                (1207116, 5, 10000),
-                (1207409, 5, 10000),
-                (1207397, 5, 10000),
-                (1207652, 5, 10000)
+                (1207859, 5, 10000),
+                (1207590, 5, 10000),
+                (1207883, 5, 10000),
+                (1207871, 5, 10000),
+                (1208126, 5, 10000)
             ]
         );
 
@@ -663,15 +673,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1315008, 3, 6000),
-                (1315001, 3, 6000),
-                (1315061, 3, 6000),
-                (1315018, 3, 6000),
-                (1315148, 3, 6000),
-                (1314990, 3, 6000),
-                (1314782, 3, 6000),
-                (1315018, 3, 6000),
-                (438575, 1, 2000)
+                (1315314, 3, 6000),
+                (1315307, 3, 6000),
+                (1315367, 3, 6000),
+                (1315324, 3, 6000),
+                (1315454, 3, 6000),
+                (1315296, 3, 6000),
+                (1315088, 3, 6000),
+                (1315324, 3, 6000),
+                (438713, 1, 2000)
             ]
         );
 
@@ -708,7 +718,7 @@ mod tests {
         // files are smaller than the size threshold, but they took too long to fill so were flushed early
         assert_eq!(
             file_stats,
-            [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)]
+            [(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)]
         );
 
         tmpdir.close().unwrap();
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 1590316925..e2a75a8720 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,11 +1,11 @@
-use std::sync::OnceLock;
+use std::sync::{Arc, OnceLock};
 
 use lasso::ThreadedRodeo;
 use measured::{
-    label::StaticLabelSet,
+    label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet},
     metric::{histogram::Thresholds, name::MetricName},
-    Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
-    MetricGroup,
+    Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
+    LabelGroup, MetricGroup,
 };
 use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
 
@@ -14,26 +14,36 @@ use tokio::time::{self, Instant};
 use crate::console::messages::ColdStartInfo;
 
 #[derive(MetricGroup)]
+#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
 pub struct Metrics {
     #[metric(namespace = "proxy")]
+    #[metric(init = ProxyMetrics::new(thread_pool))]
     pub proxy: ProxyMetrics,
 
     #[metric(namespace = "wake_compute_lock")]
     pub wake_compute_lock: ApiLockMetrics,
 }
 
+static SELF: OnceLock<Metrics> = OnceLock::new();
 impl Metrics {
+    pub fn install(thread_pool: Arc<ThreadPoolMetrics>) {
+        SELF.set(Metrics::new(thread_pool))
+            .ok()
+            .expect("proxy metrics must not be installed more than once");
+    }
+
     pub fn get() -> &'static Self {
-        static SELF: OnceLock<Metrics> = OnceLock::new();
-        SELF.get_or_init(|| Metrics {
-            proxy: ProxyMetrics::default(),
-            wake_compute_lock: ApiLockMetrics::new(),
-        })
+        #[cfg(test)]
+        return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0))));
+
+        #[cfg(not(test))]
+        SELF.get()
+            .expect("proxy metrics must be installed by the main() function")
     }
 }
 
 #[derive(MetricGroup)]
-#[metric(new())]
+#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
 pub struct ProxyMetrics {
     #[metric(flatten)]
     pub db_connections: CounterPairVec<NumDbConnectionsGauge>,
@@ -129,6 +139,10 @@ pub struct ProxyMetrics {
 
     #[metric(namespace = "connect_compute_lock")]
     pub connect_compute_lock: ApiLockMetrics,
+
+    #[metric(namespace = "scram_pool")]
+    #[metric(init = thread_pool)]
+    pub scram_pool: Arc<ThreadPoolMetrics>,
 }
 
 #[derive(MetricGroup)]
@@ -146,12 +160,6 @@ pub struct ApiLockMetrics {
     pub semaphore_acquire_seconds: Histogram<16>,
 }
 
-impl Default for ProxyMetrics {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 impl Default for ApiLockMetrics {
     fn default() -> Self {
         Self::new()
@@ -553,3 +561,52 @@ pub enum RedisEventsCount {
     PasswordUpdate,
     AllowedIpsUpdate,
 }
+
+pub struct ThreadPoolWorkers(usize);
+pub struct ThreadPoolWorkerId(pub usize);
+
+impl LabelValue for ThreadPoolWorkerId {
+    fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
+        v.write_int(self.0 as i64)
+    }
+}
+
+impl LabelGroup for ThreadPoolWorkerId {
+    fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
+        v.write_value(LabelName::from_str("worker"), self);
+    }
+}
+
+impl LabelSet for ThreadPoolWorkers {
+    type Value<'a> = ThreadPoolWorkerId;
+
+    fn dynamic_cardinality(&self) -> Option<usize> {
+        Some(self.0)
+    }
+
+    fn encode(&self, value: Self::Value<'_>) -> Option<usize> {
+        (value.0 < self.0).then_some(value.0)
+    }
+
+    fn decode(&self, value: usize) -> Self::Value<'_> {
+        ThreadPoolWorkerId(value)
+    }
+}
+
+impl FixedCardinalitySet for ThreadPoolWorkers {
+    fn cardinality(&self) -> usize {
+        self.0
+    }
+}
+
+#[derive(MetricGroup)]
+#[metric(new(workers: usize))]
+pub struct ThreadPoolMetrics {
+    pub injector_queue_depth: Gauge,
+    #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))]
+    pub worker_queue_depth: GaugeVec<ThreadPoolWorkers>,
+    #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
+    pub worker_task_turns_total: CounterVec<ThreadPoolWorkers>,
+    #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
+    pub worker_task_skips_total: CounterVec<ThreadPoolWorkers>,
+}
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index e4e095d77d..5824b70df9 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -19,6 +19,7 @@ use crate::{
     metrics::{Metrics, NumClientConnectionsGuard},
     protocol2::read_proxy_protocol,
     proxy::handshake::{handshake, HandshakeData},
+    rate_limiter::EndpointRateLimiter,
     stream::{PqStream, Stream},
     EndpointCacheKey,
 };
@@ -61,6 +62,7 @@ pub async fn task_main(
     listener: tokio::net::TcpListener,
     cancellation_token: CancellationToken,
     cancellation_handler: Arc<CancellationHandlerMain>,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("proxy has shut down");
@@ -86,6 +88,7 @@ pub async fn task_main(
         let cancellation_handler = Arc::clone(&cancellation_handler);
 
         tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
+        let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();
 
         connections.spawn(async move {
             let (socket, peer_addr) = match read_proxy_protocol(socket).await{
@@ -123,6 +126,7 @@ pub async fn task_main(
                 cancellation_handler,
                 socket,
                 ClientMode::Tcp,
+                endpoint_rate_limiter2,
                 conn_gauge,
             )
             .instrument(span.clone())
@@ -234,6 +238,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     cancellation_handler: Arc<CancellationHandlerMain>,
     stream: S,
     mode: ClientMode,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     conn_gauge: NumClientConnectionsGuard<'static>,
 ) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
     info!(
@@ -243,7 +248,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
 
     let metrics = &Metrics::get().proxy;
     let proto = ctx.protocol;
-    // let _client_gauge = metrics.client_connections.guard(proto);
     let _request_gauge = metrics.connection_requests.guard(proto);
 
     let tls = config.tls_config.as_ref();
@@ -286,6 +290,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
             &mut stream,
             mode.allow_cleartext(),
             &config.authentication_config,
+            endpoint_rate_limiter,
         )
         .await
     {
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index 5ba2c36436..b8c9490696 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -128,12 +128,18 @@ impl std::str::FromStr for RateBucketInfo {
 }
 
 impl RateBucketInfo {
-    pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
+    pub const DEFAULT_SET: [Self; 3] = [
         Self::new(300, Duration::from_secs(1)),
         Self::new(200, Duration::from_secs(60)),
         Self::new(100, Duration::from_secs(600)),
     ];
 
+    pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [
+        Self::new(500, Duration::from_secs(1)),
+        Self::new(300, Duration::from_secs(60)),
+        Self::new(200, Duration::from_secs(600)),
+    ];
+
     pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
         info.sort_unstable_by_key(|info| info.interval);
         let invalid = info
@@ -266,7 +272,7 @@ mod tests {
 
     #[test]
     fn default_rate_buckets() {
-        let mut defaults = RateBucketInfo::DEFAULT_ENDPOINT_SET;
+        let mut defaults = RateBucketInfo::DEFAULT_SET;
         RateBucketInfo::validate(&mut defaults[..]).unwrap();
     }
 
@@ -333,11 +339,8 @@ mod tests {
         let rand = rand::rngs::StdRng::from_seed([1; 32]);
         let hasher = BuildHasherDefault::<FxHasher>::default();
 
-        let limiter = BucketRateLimiter::new_with_rand_and_hasher(
-            &RateBucketInfo::DEFAULT_ENDPOINT_SET,
-            rand,
-            hasher,
-        );
+        let limiter =
+            BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher);
         for i in 0..1_000_000 {
             limiter.check(i, 1);
         }
diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs
index ed80675f8a..862facb4e5 100644
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -6,11 +6,14 @@
 //! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/backend/libpq/auth-scram.c>
 //! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/interfaces/libpq/fe-auth-scram.c>
 
+mod countmin;
 mod exchange;
 mod key;
 mod messages;
+mod pbkdf2;
 mod secret;
 mod signature;
+pub mod threadpool;
 
 pub use exchange::{exchange, Exchange};
 pub use key::ScramKey;
@@ -56,9 +59,13 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
 
 #[cfg(test)]
 mod tests {
-    use crate::sasl::{Mechanism, Step};
+    use crate::{
+        intern::EndpointIdInt,
+        sasl::{Mechanism, Step},
+        EndpointId,
+    };
 
-    use super::{Exchange, ServerSecret};
+    use super::{threadpool::ThreadPool, Exchange, ServerSecret};
 
     #[test]
     fn snapshot() {
@@ -112,8 +119,13 @@ mod tests {
     }
 
     async fn run_round_trip_test(server_password: &str, client_password: &str) {
+        let pool = ThreadPool::new(1);
+
+        let ep = EndpointId::from("foo");
+        let ep = EndpointIdInt::from(ep);
+
         let scram_secret = ServerSecret::build(server_password).await.unwrap();
-        let outcome = super::exchange(&scram_secret, client_password.as_bytes())
+        let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes())
             .await
             .unwrap();
 
diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs
new file mode 100644
index 0000000000..f2b794e5fe
--- /dev/null
+++ b/proxy/src/scram/countmin.rs
@@ -0,0 +1,173 @@
+use std::hash::Hash;
+
+/// estimator of hash jobs per second.
+/// <https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch>
+pub struct CountMinSketch {
+    // one for each depth
+    hashers: Vec<ahash::RandomState>,
+    width: usize,
+    depth: usize,
+    // buckets, width*depth
+    buckets: Vec<u32>,
+}
+
+impl CountMinSketch {
+    /// Given parameters (ε, δ),
+    ///   set width = ceil(e/ε)
+    ///   set depth = ceil(ln(1/δ))
+    ///
+    /// guarantees:
+    /// actual <= estimate
+    /// estimate <= actual + ε * N with probability 1 - δ
+    /// where N is the cardinality of the stream
+    pub fn with_params(epsilon: f64, delta: f64) -> Self {
+        CountMinSketch::new(
+            (std::f64::consts::E / epsilon).ceil() as usize,
+            (1.0_f64 / delta).ln().ceil() as usize,
+        )
+    }
+
+    fn new(width: usize, depth: usize) -> Self {
+        Self {
+            #[cfg(test)]
+            hashers: (0..depth)
+                .map(|i| {
+                    // digits of pi for good randomness
+                    ahash::RandomState::with_seeds(
+                        314159265358979323,
+                        84626433832795028,
+                        84197169399375105,
+                        82097494459230781 + i as u64,
+                    )
+                })
+                .collect(),
+            #[cfg(not(test))]
+            hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(),
+            width,
+            depth,
+            buckets: vec![0; width * depth],
+        }
+    }
+
+    pub fn inc_and_return<T: Hash>(&mut self, t: &T, x: u32) -> u32 {
+        let mut min = u32::MAX;
+        for row in 0..self.depth {
+            let col = (self.hashers[row].hash_one(t) as usize) % self.width;
+
+            let row = &mut self.buckets[row * self.width..][..self.width];
+            row[col] = row[col].saturating_add(x);
+            min = std::cmp::min(min, row[col]);
+        }
+        min
+    }
+
+    pub fn reset(&mut self) {
+        self.buckets.clear();
+        self.buckets.resize(self.width * self.depth, 0);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};
+
+    use super::CountMinSketch;
+
+    fn eval_precision(n: usize, p: f64, q: f64) -> usize {
+        // fixed value of phi for consistent test
+        let mut rng = StdRng::seed_from_u64(16180339887498948482);
+
+        #[allow(non_snake_case)]
+        let mut N = 0;
+
+        let mut ids = vec![];
+
+        for _ in 0..n {
+            // number of insert operations
+            let n = rng.gen_range(1..100);
+            // number to insert at once
+            let m = rng.gen_range(1..4096);
+
+            let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid();
+            ids.push((id, n, m));
+
+            // N = sum(actual)
+            N += n * m;
+        }
+
+        // q% of counts will be within p of the actual value
+        let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
+
+        dbg!(sketch.buckets.len());
+
+        // insert a bunch of entries in a random order
+        let mut ids2 = ids.clone();
+        while !ids2.is_empty() {
+            ids2.shuffle(&mut rng);
+
+            let mut i = 0;
+            while i < ids2.len() {
+                sketch.inc_and_return(&ids2[i].0, ids2[i].1);
+                ids2[i].2 -= 1;
+                if ids2[i].2 == 0 {
+                    ids2.remove(i);
+                } else {
+                    i += 1;
+                }
+            }
+        }
+
+        let mut within_p = 0;
+        for (id, n, m) in ids {
+            let actual = n * m;
+            let estimate = sketch.inc_and_return(&id, 0);
+
+            // This estimate has the guarantee that actual <= estimate
+            assert!(actual <= estimate);
+
+            // This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ.
+            // ε = p / N, δ = 1 - q;
+            // therefore, estimate <= actual + p with probability q.
+            if estimate as f64 <= actual as f64 + p {
+                within_p += 1;
+            }
+        }
+        within_p
+    }
+
+    #[test]
+    fn precision() {
+        assert_eq!(eval_precision(100, 100.0, 0.99), 100);
+        assert_eq!(eval_precision(1000, 100.0, 0.99), 1000);
+        assert_eq!(eval_precision(100, 4096.0, 0.99), 100);
+        assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000);
+
+        // seems to be more precise than the literature indicates?
+        // probably numbers are too small to truly represent the probabilities.
+        assert_eq!(eval_precision(100, 4096.0, 0.90), 100);
+        assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000);
+        assert_eq!(eval_precision(100, 4096.0, 0.1), 98);
+        assert_eq!(eval_precision(1000, 4096.0, 0.1), 991);
+    }
+
+    // returns memory usage in bytes, and the time complexity per insert.
+    fn eval_cost(p: f64, q: f64) -> (usize, usize) {
+        #[allow(non_snake_case)]
+        // N = sum(actual)
+        // Let's assume 1021 samples, all of 4096
+        let N = 1021 * 4096;
+        let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
+
+        let memory = std::mem::size_of::<u32>() * sketch.buckets.len();
+        let time = sketch.depth;
+        (memory, time)
+    }
+
+    #[test]
+    fn memory_usage() {
+        assert_eq!(eval_cost(100.0, 0.99), (2273580, 5));
+        assert_eq!(eval_cost(4096.0, 0.99), (55520, 5));
+        assert_eq!(eval_cost(4096.0, 0.90), (33312, 3));
+        assert_eq!(eval_cost(4096.0, 0.1), (11104, 1));
+    }
+}
diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs
index 89dd33e59f..d0adbc780e 100644
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -4,15 +4,17 @@ use std::convert::Infallible;
 
 use hmac::{Hmac, Mac};
 use sha2::Sha256;
-use tokio::task::yield_now;
 
 use super::messages::{
     ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
 };
+use super::pbkdf2::Pbkdf2;
 use super::secret::ServerSecret;
 use super::signature::SignatureBuilder;
+use super::threadpool::ThreadPool;
 use super::ScramKey;
 use crate::config;
+use crate::intern::EndpointIdInt;
 use crate::sasl::{self, ChannelBinding, Error as SaslError};
 
 /// The only channel binding mode we currently support.
@@ -74,37 +76,18 @@ impl<'a> Exchange<'a> {
     }
 }
 
-// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
-async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
-    let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
-    let mut prev = hmac
-        .clone()
-        .chain_update(salt)
-        .chain_update(1u32.to_be_bytes())
-        .finalize()
-        .into_bytes();
-
-    let mut hi = prev;
-
-    for i in 1..iterations {
-        prev = hmac.clone().chain_update(prev).finalize().into_bytes();
-
-        for (hi, prev) in hi.iter_mut().zip(prev) {
-            *hi ^= prev;
-        }
-        // yield every ~250us
-        // hopefully reduces tail latencies
-        if i % 1024 == 0 {
-            yield_now().await
-        }
-    }
-
-    hi.into()
-}
-
 // copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
-async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
-    let salted_password = pbkdf2(password, salt, iterations).await;
+async fn derive_client_key(
+    pool: &ThreadPool,
+    endpoint: EndpointIdInt,
+    password: &[u8],
+    salt: &[u8],
+    iterations: u32,
+) -> ScramKey {
+    let salted_password = pool
+        .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations))
+        .await
+        .expect("job should not be cancelled");
 
     let make_key = |name| {
         let key = Hmac::<Sha256>::new_from_slice(&salted_password)
@@ -119,11 +102,13 @@ async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> Scr
 }
 
 pub async fn exchange(
+    pool: &ThreadPool,
+    endpoint: EndpointIdInt,
     secret: &ServerSecret,
     password: &[u8],
 ) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
     let salt = base64::decode(&secret.salt_base64)?;
-    let client_key = derive_client_key(password, &salt, secret.iterations).await;
+    let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;
 
     if secret.is_password_invalid(&client_key).into() {
         Ok(sasl::Outcome::Failure("password doesn't match"))
diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs
new file mode 100644
index 0000000000..a803ba7e1b
--- /dev/null
+++ b/proxy/src/scram/pbkdf2.rs
@@ -0,0 +1,89 @@
+use hmac::{
+    digest::{consts::U32, generic_array::GenericArray},
+    Hmac, Mac,
+};
+use sha2::Sha256;
+
+pub struct Pbkdf2 {
+    hmac: Hmac<Sha256>,
+    prev: GenericArray<u8, U32>,
+    hi: GenericArray<u8, U32>,
+    iterations: u32,
+}
+
+// inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
+impl Pbkdf2 {
+    pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
+        let hmac =
+            Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
+
+        let prev = hmac
+            .clone()
+            .chain_update(salt)
+            .chain_update(1u32.to_be_bytes())
+            .finalize()
+            .into_bytes();
+
+        Self {
+            hmac,
+            // one consumed for the hash above
+            iterations: iterations - 1,
+            hi: prev,
+            prev,
+        }
+    }
+
+    pub fn cost(&self) -> u32 {
+        (self.iterations).clamp(0, 4096)
+    }
+
+    pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> {
+        let Self {
+            hmac,
+            prev,
+            hi,
+            iterations,
+        } = self;
+
+        // only do 4096 iterations per turn before sharing the thread for fairness
+        let n = (*iterations).clamp(0, 4096);
+        for _ in 0..n {
+            *prev = hmac.clone().chain_update(*prev).finalize().into_bytes();
+
+            for (hi, prev) in hi.iter_mut().zip(*prev) {
+                *hi ^= prev;
+            }
+        }
+
+        *iterations -= n;
+        if *iterations == 0 {
+            std::task::Poll::Ready((*hi).into())
+        } else {
+            std::task::Poll::Pending
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Pbkdf2;
+    use pbkdf2::pbkdf2_hmac_array;
+    use sha2::Sha256;
+
+    #[test]
+    fn works() {
+        let salt = b"sodium chloride";
+        let pass = b"Ne0n_!5_50_C007";
+
+        let mut job = Pbkdf2::start(pass, salt, 600000);
+        let hash = loop {
+            let std::task::Poll::Ready(hash) = job.turn() else {
+                continue;
+            };
+            break hash;
+        };
+
+        let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 600000);
+        assert_eq!(hash, expected)
+    }
+}
diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs
new file mode 100644
index 0000000000..7701b869a3
--- /dev/null
+++ b/proxy/src/scram/threadpool.rs
@@ -0,0 +1,321 @@
+//! Custom threadpool implementation for password hashing.
+//!
+//! Requirements:
+//! 1. Fairness per endpoint.
+//! 2. Yield support for high iteration counts.
+
+use std::sync::{
+    atomic::{AtomicU64, Ordering},
+    Arc,
+};
+
+use crossbeam_deque::{Injector, Stealer, Worker};
+use itertools::Itertools;
+use parking_lot::{Condvar, Mutex};
+use rand::Rng;
+use rand::{rngs::SmallRng, SeedableRng};
+use tokio::sync::oneshot;
+
+use crate::{
+    intern::EndpointIdInt,
+    metrics::{ThreadPoolMetrics, ThreadPoolWorkerId},
+    scram::countmin::CountMinSketch,
+};
+
+use super::pbkdf2::Pbkdf2;
+
+pub struct ThreadPool {
+    queue: Injector<JobSpec>,
+    stealers: Vec<Stealer<JobSpec>>,
+    parkers: Vec<(Condvar, Mutex<ThreadState>)>,
+    /// bitpacked representation.
+    /// lower 8 bits = number of sleeping threads
+    /// next 8 bits = number of idle threads (searching for work)
+    counters: AtomicU64,
+
+    pub metrics: Arc<ThreadPoolMetrics>,
+}
+
+#[derive(PartialEq)]
+enum ThreadState {
+    Parked,
+    Active,
+}
+
+impl ThreadPool {
+    pub fn new(n_workers: u8) -> Arc<Self> {
+        let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec();
+        let stealers = workers.iter().map(|w| w.stealer()).collect_vec();
+
+        let parkers = (0..n_workers)
+            .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active)))
+            .collect_vec();
+
+        let pool = Arc::new(Self {
+            queue: Injector::new(),
+            stealers,
+            parkers,
+            // threads start searching for work
+            counters: AtomicU64::new((n_workers as u64) << 8),
+            metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
+        });
+
+        for (i, worker) in workers.into_iter().enumerate() {
+            let pool = Arc::clone(&pool);
+            std::thread::spawn(move || thread_rt(pool, worker, i));
+        }
+
+        pool
+    }
+
+    pub fn spawn_job(
+        &self,
+        endpoint: EndpointIdInt,
+        pbkdf2: Pbkdf2,
+    ) -> oneshot::Receiver<[u8; 32]> {
+        let (tx, rx) = oneshot::channel();
+
+        let queue_was_empty = self.queue.is_empty();
+
+        self.metrics.injector_queue_depth.inc();
+        self.queue.push(JobSpec {
+            response: tx,
+            pbkdf2,
+            endpoint,
+        });
+
+        // inspired from <https://github.com/rayon-rs/rayon/blob/3e3962cb8f7b50773bcc360b48a7a674a53a2c77/rayon-core/src/sleep/mod.rs#L242>
+        let counts = self.counters.load(Ordering::SeqCst);
+        let num_awake_but_idle = (counts >> 8) & 0xff;
+        let num_sleepers = counts & 0xff;
+
+        // If the queue is non-empty, then we always wake up a worker
+        // -- clearly the existing idle jobs aren't enough. Otherwise,
+        // check to see if we have enough idle workers.
+        if !queue_was_empty || num_awake_but_idle == 0 {
+            let num_to_wake = Ord::min(1, num_sleepers);
+            self.wake_any_threads(num_to_wake);
+        }
+
+        rx
+    }
+
+    #[cold]
+    fn wake_any_threads(&self, mut num_to_wake: u64) {
+        if num_to_wake > 0 {
+            for i in 0..self.parkers.len() {
+                if self.wake_specific_thread(i) {
+                    num_to_wake -= 1;
+                    if num_to_wake == 0 {
+                        return;
+                    }
+                }
+            }
+        }
+    }
+
+    fn wake_specific_thread(&self, index: usize) -> bool {
+        let (condvar, lock) = &self.parkers[index];
+
+        let mut state = lock.lock();
+        if *state == ThreadState::Parked {
+            condvar.notify_one();
+
+            // When the thread went to sleep, it will have incremented
+            // this value. When we wake it, its our job to decrement
+            // it. We could have the thread do it, but that would
+            // introduce a delay between when the thread was
+            // *notified* and when this counter was decremented. That
+            // might mislead people with new work into thinking that
+            // there are sleeping threads that they should try to
+            // wake, when in fact there is nothing left for them to
+            // do.
+            self.counters.fetch_sub(1, Ordering::SeqCst);
+            *state = ThreadState::Active;
+
+            true
+        } else {
+            false
+        }
+    }
+
+    fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker<JobSpec>) -> Option<JobSpec> {
+        // announce thread as idle
+        self.counters.fetch_add(256, Ordering::SeqCst);
+
+        // try steal from the global queue
+        loop {
+            match self.queue.steal_batch_and_pop(worker) {
+                crossbeam_deque::Steal::Success(job) => {
+                    self.metrics
+                        .injector_queue_depth
+                        .set(self.queue.len() as i64);
+                    // no longer idle
+                    self.counters.fetch_sub(256, Ordering::SeqCst);
+                    return Some(job);
+                }
+                crossbeam_deque::Steal::Retry => continue,
+                crossbeam_deque::Steal::Empty => break,
+            }
+        }
+
+        // try steal from our neighbours
+        loop {
+            let mut retry = false;
+            let start = rng.gen_range(0..self.stealers.len());
+            let job = (start..self.stealers.len())
+                .chain(0..start)
+                .filter(|i| *i != skip)
+                .find_map(
+                    |victim| match self.stealers[victim].steal_batch_and_pop(worker) {
+                        crossbeam_deque::Steal::Success(job) => Some(job),
+                        crossbeam_deque::Steal::Empty => None,
+                        crossbeam_deque::Steal::Retry => {
+                            retry = true;
+                            None
+                        }
+                    },
+                );
+            if job.is_some() {
+                // no longer idle
+                self.counters.fetch_sub(256, Ordering::SeqCst);
+                return job;
+            }
+            if !retry {
+                return None;
+            }
+        }
+    }
+}
+
+fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
+    /// interval when we should steal from the global queue
+    /// so that tail latencies are managed appropriately
+    const STEAL_INTERVAL: usize = 61;
+
+    /// How often to reset the sketch values
+    const SKETCH_RESET_INTERVAL: usize = 1021;
+
+    let mut rng = SmallRng::from_entropy();
+
+    // used to determine whether we should temporarily skip tasks for fairness.
+    // 99% of estimates will overcount by no more than 4096 samples
+    let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01);
+
+    let (condvar, lock) = &pool.parkers[index];
+
+    'wait: loop {
+        // wait for notification of work
+        {
+            let mut lock = lock.lock();
+
+            // queue is empty
+            pool.metrics
+                .worker_queue_depth
+                .set(ThreadPoolWorkerId(index), 0);
+
+            // subtract 1 from idle count, add 1 to sleeping count.
+            pool.counters.fetch_sub(255, Ordering::SeqCst);
+
+            *lock = ThreadState::Parked;
+            condvar.wait(&mut lock);
+        }
+
+        for i in 0.. {
+            let mut job = match worker
+                .pop()
+                .or_else(|| pool.steal(&mut rng, index, &worker))
+            {
+                Some(job) => job,
+                None => continue 'wait,
+            };
+
+            pool.metrics
+                .worker_queue_depth
+                .set(ThreadPoolWorkerId(index), worker.len() as i64);
+
+            // receiver is closed, cancel the task
+            if !job.response.is_closed() {
+                let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost());
+
+                const P: f64 = 2000.0;
+                // probability decreases as rate increases.
+                // lower probability, higher chance of being skipped
+                //
+                // estimates (rate in terms of 4096 rounds):
+                // rate = 0    => probability = 100%
+                // rate = 10   => probability = 71.3%
+                // rate = 50   => probability = 62.1%
+                // rate = 500  => probability = 52.3%
+                // rate = 1021 => probability = 49.8%
+                //
+                // My expectation is that the pool queue will only begin backing up at ~1000rps
+                // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above
+                // are in requests per second.
+                let probability = P.ln() / (P + rate as f64).ln();
+                if pool.queue.len() > 32 || rng.gen_bool(probability) {
+                    pool.metrics
+                        .worker_task_turns_total
+                        .inc(ThreadPoolWorkerId(index));
+
+                    match job.pbkdf2.turn() {
+                        std::task::Poll::Ready(result) => {
+                            let _ = job.response.send(result);
+                        }
+                        std::task::Poll::Pending => worker.push(job),
+                    }
+                } else {
+                    pool.metrics
+                        .worker_task_skips_total
+                        .inc(ThreadPoolWorkerId(index));
+
+                    // skip for now
+                    worker.push(job)
+                }
+            }
+
+            // if we get stuck with a few long lived jobs in the queue
+            // it's better to try and steal from the queue too for fairness
+            if i % STEAL_INTERVAL == 0 {
+                let _ = pool.queue.steal_batch(&worker);
+            }
+
+            if i % SKETCH_RESET_INTERVAL == 0 {
+                sketch.reset();
+            }
+        }
+    }
+}
+
+struct JobSpec {
+    response: oneshot::Sender<[u8; 32]>,
+    pbkdf2: Pbkdf2,
+    endpoint: EndpointIdInt,
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::EndpointId;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn hash_is_correct() {
+        let pool = ThreadPool::new(1);
+
+        let ep = EndpointId::from("foo");
+        let ep = EndpointIdInt::from(ep);
+
+        let salt = [0x55; 32];
+        let actual = pool
+            .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096))
+            .await
+            .unwrap();
+
+        let expected = [
+            10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,
+            178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140,
+        ];
+        assert_eq!(actual, expected)
+    }
+}
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index cbff51f207..24ee749e6e 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -36,6 +36,7 @@ use crate::context::RequestMonitoring;
 use crate::metrics::Metrics;
 use crate::protocol2::read_proxy_protocol;
 use crate::proxy::run_until_cancelled;
+use crate::rate_limiter::EndpointRateLimiter;
 use crate::serverless::backend::PoolingBackend;
 use crate::serverless::http_util::{api_error_into_response, json_response};
 
@@ -54,6 +55,7 @@ pub async fn task_main(
     ws_listener: TcpListener,
     cancellation_token: CancellationToken,
     cancellation_handler: Arc<CancellationHandlerMain>,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("websocket server has shut down");
@@ -82,6 +84,7 @@ pub async fn task_main(
     let backend = Arc::new(PoolingBackend {
         pool: Arc::clone(&conn_pool),
         config,
+        endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
     });
 
     let tls_config = match config.tls_config.as_ref() {
@@ -99,7 +102,7 @@ pub async fn task_main(
     let connections = tokio_util::task::task_tracker::TaskTracker::new();
     connections.close(); // allows `connections.wait to complete`
 
-    let server = Builder::new(hyper_util::rt::TokioExecutor::new());
+    let server = Builder::new(TokioExecutor::new());
 
     while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await {
         let (conn, peer_addr) = res.context("could not accept TCP stream")?;
@@ -129,6 +132,7 @@ pub async fn task_main(
             backend.clone(),
             connections.clone(),
             cancellation_handler.clone(),
+            endpoint_rate_limiter.clone(),
             conn_token.clone(),
             server.clone(),
             tls_acceptor.clone(),
@@ -162,6 +166,7 @@ async fn connection_handler(
     backend: Arc<PoolingBackend>,
     connections: TaskTracker,
     cancellation_handler: Arc<CancellationHandlerMain>,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     cancellation_token: CancellationToken,
     server: Builder<TokioExecutor>,
     tls_acceptor: TlsAcceptor,
@@ -245,11 +250,11 @@ async fn connection_handler(
                     session_id,
                     peer_addr,
                     http_request_token,
+                    endpoint_rate_limiter.clone(),
                 )
                 .in_current_span()
                 .map_ok_or_else(api_error_into_response, |r| r),
             );
-
             async move {
                 let res = handler.await;
                 cancel_request.disarm();
@@ -285,6 +290,7 @@ async fn request_handler(
     peer_addr: IpAddr,
     // used to cancel in-flight HTTP requests. not used to cancel websockets
     http_cancellation_token: CancellationToken,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 ) -> Result<Response<Full<Bytes>>, ApiError> {
     let host = request
         .headers()
@@ -294,7 +300,7 @@ async fn request_handler(
         .map(|s| s.to_string());
 
     // Check if the request is a websocket upgrade request.
-    if hyper_tungstenite::is_upgrade_request(&request) {
+    if framed_websockets::upgrade::is_upgrade_request(&request) {
         let ctx = RequestMonitoring::new(
             session_id,
             peer_addr,
@@ -305,14 +311,20 @@ async fn request_handler(
         let span = ctx.span.clone();
         info!(parent: &span, "performing websocket upgrade");
 
-        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
+        let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request)
             .map_err(|e| ApiError::BadRequest(e.into()))?;
 
         ws_connections.spawn(
             async move {
-                if let Err(e) =
-                    websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host)
-                        .await
+                if let Err(e) = websocket::serve_websocket(
+                    config,
+                    ctx,
+                    websocket,
+                    cancellation_handler,
+                    endpoint_rate_limiter,
+                    host,
+                )
+                .await
                 {
                     error!("error in websocket connection: {e:#}");
                 }
@@ -321,7 +333,7 @@ async fn request_handler(
         );
 
         // Return the response so the spawned future can continue.
-        Ok(response)
+        Ok(response.map(|_: http_body_util::Empty<Bytes>| Full::new(Bytes::new())))
     } else if request.uri().path() == "/sql" && *request.method() == Method::POST {
         let ctx = RequestMonitoring::new(
             session_id,
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index ce58f575e2..52fc7b556a 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -15,7 +15,9 @@ use crate::{
     },
     context::RequestMonitoring,
     error::{ErrorKind, ReportableError, UserFacingError},
+    intern::EndpointIdInt,
     proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
+    rate_limiter::EndpointRateLimiter,
     Host,
 };
 
@@ -24,6 +26,7 @@ use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool};
 pub struct PoolingBackend {
     pub pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
     pub config: &'static ProxyConfig,
+    pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 }
 
 impl PoolingBackend {
@@ -39,6 +42,12 @@ impl PoolingBackend {
         if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
             return Err(AuthError::ip_address_not_allowed(ctx.peer_addr));
         }
+        if !self
+            .endpoint_rate_limiter
+            .check(conn_info.user_info.endpoint.clone().into(), 1)
+        {
+            return Err(AuthError::too_many_connections());
+        }
         let cached_secret = match maybe_secret {
             Some(secret) => secret,
             None => backend.get_role_secret(ctx).await?,
@@ -58,8 +67,14 @@ impl PoolingBackend {
                 return Err(AuthError::auth_failed(&*user_info.user));
             }
         };
-        let auth_outcome =
-            crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
+        let ep = EndpointIdInt::from(&conn_info.user_info.endpoint);
+        let auth_outcome = crate::auth::validate_password_and_exchange(
+            &config.thread_pool,
+            ep,
+            &conn_info.password,
+            secret,
+        )
+        .await?;
         let res = match auth_outcome {
             crate::sasl::Outcome::Success(key) => {
                 info!("user successfully authenticated");
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index b6cd85af73..61d6d60dbe 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -5,11 +5,13 @@ use crate::{
     error::{io_error, ReportableError},
     metrics::Metrics,
     proxy::{handle_client, ClientMode},
+    rate_limiter::EndpointRateLimiter,
 };
-use bytes::{Buf, Bytes};
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use framed_websockets::{Frame, OpCode, WebSocketServer};
 use futures::{Sink, Stream};
-use hyper::upgrade::Upgraded;
-use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
+use hyper1::upgrade::OnUpgrade;
+use hyper_util::rt::TokioIo;
 use pin_project_lite::pin_project;
 
 use std::{
@@ -20,25 +22,23 @@ use std::{
 use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
 use tracing::warn;
 
-// TODO: use `std::sync::Exclusive` once it's stabilized.
-// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
-use sync_wrapper::SyncWrapper;
-
 pin_project! {
     /// This is a wrapper around a [`WebSocketStream`] that
     /// implements [`AsyncRead`] and [`AsyncWrite`].
-    pub struct WebSocketRw<S = Upgraded> {
+    pub struct WebSocketRw<S> {
         #[pin]
-        stream: SyncWrapper<WebSocketStream<S>>,
-        bytes: Bytes,
+        stream: WebSocketServer<S>,
+        recv: Bytes,
+        send: BytesMut,
     }
 }
 
 impl<S> WebSocketRw<S> {
-    pub fn new(stream: WebSocketStream<S>) -> Self {
+    pub fn new(stream: WebSocketServer<S>) -> Self {
         Self {
-            stream: stream.into(),
-            bytes: Bytes::new(),
+            stream,
+            recv: Bytes::new(),
+            send: BytesMut::new(),
         }
     }
 }
@@ -49,22 +49,24 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
         cx: &mut Context<'_>,
         buf: &[u8],
     ) -> Poll<io::Result<usize>> {
-        let mut stream = self.project().stream.get_pin_mut();
+        let this = self.project();
+        let mut stream = this.stream;
+        this.send.put(buf);
 
         ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
-        match stream.as_mut().start_send(Message::Binary(buf.into())) {
+        match stream.as_mut().start_send(Frame::binary(this.send.split())) {
             Ok(()) => Poll::Ready(Ok(buf.len())),
             Err(e) => Poll::Ready(Err(io_error(e))),
         }
     }
 
     fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
-        let stream = self.project().stream.get_pin_mut();
+        let stream = self.project().stream;
         stream.poll_flush(cx).map_err(io_error)
     }
 
     fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
-        let stream = self.project().stream.get_pin_mut();
+        let stream = self.project().stream;
         stream.poll_close(cx).map_err(io_error)
     }
 }
@@ -75,13 +77,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncRead for WebSocketRw<S> {
         cx: &mut Context<'_>,
         buf: &mut ReadBuf<'_>,
     ) -> Poll<io::Result<()>> {
-        if buf.remaining() > 0 {
-            let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
-            let len = std::cmp::min(bytes.len(), buf.remaining());
-            buf.put_slice(&bytes[..len]);
-            self.consume(len);
-        }
-
+        let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
+        let len = std::cmp::min(bytes.len(), buf.remaining());
+        buf.put_slice(&bytes[..len]);
+        self.consume(len);
         Poll::Ready(Ok(()))
     }
 }
@@ -93,31 +92,27 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
 
         let mut this = self.project();
         loop {
-            if !this.bytes.chunk().is_empty() {
-                let chunk = (*this.bytes).chunk();
+            if !this.recv.chunk().is_empty() {
+                let chunk = (*this.recv).chunk();
                 return Poll::Ready(Ok(chunk));
             }
 
-            let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
+            let res = ready!(this.stream.as_mut().poll_next(cx));
             match res.transpose().map_err(io_error)? {
-                Some(message) => match message {
-                    Message::Ping(_) => {}
-                    Message::Pong(_) => {}
-                    Message::Text(text) => {
+                Some(message) => match message.opcode {
+                    OpCode::Ping => {}
+                    OpCode::Pong => {}
+                    OpCode::Text => {
                         // We expect to see only binary messages.
                         let error = "unexpected text message in the websocket";
-                        warn!(length = text.len(), error);
+                        warn!(length = message.payload.len(), error);
                         return Poll::Ready(Err(io_error(error)));
                     }
-                    Message::Frame(_) => {
-                        // This case is impossible according to Frame's doc.
-                        panic!("unexpected raw frame in the websocket");
+                    OpCode::Binary | OpCode::Continuation => {
+                        debug_assert!(this.recv.is_empty());
+                        *this.recv = message.payload.freeze();
                     }
-                    Message::Binary(chunk) => {
-                        assert!(this.bytes.is_empty());
-                        *this.bytes = Bytes::from(chunk);
-                    }
-                    Message::Close(_) => return EOF,
+                    OpCode::Close => return EOF,
                 },
                 None => return EOF,
             }
@@ -125,18 +120,21 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
     }
 
     fn consume(self: Pin<&mut Self>, amount: usize) {
-        self.project().bytes.advance(amount);
+        self.project().recv.advance(amount);
     }
 }
 
 pub async fn serve_websocket(
     config: &'static ProxyConfig,
     mut ctx: RequestMonitoring,
-    websocket: HyperWebsocket,
+    websocket: OnUpgrade,
     cancellation_handler: Arc<CancellationHandlerMain>,
+    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     hostname: Option<String>,
 ) -> anyhow::Result<()> {
     let websocket = websocket.await?;
+    let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket));
+
     let conn_gauge = Metrics::get()
         .proxy
         .client_connections
@@ -148,6 +146,7 @@ pub async fn serve_websocket(
         cancellation_handler,
         WebSocketRw::new(websocket),
         ClientMode::Websockets { hostname },
+        endpoint_rate_limiter,
         conn_gauge,
     )
     .await;
@@ -174,15 +173,16 @@ pub async fn serve_websocket(
 mod tests {
     use std::pin::pin;
 
+    use framed_websockets::WebSocketServer;
     use futures::{SinkExt, StreamExt};
-    use hyper_tungstenite::{
-        tungstenite::{protocol::Role, Message},
-        WebSocketStream,
-    };
     use tokio::{
         io::{duplex, AsyncReadExt, AsyncWriteExt},
         task::JoinSet,
     };
+    use tokio_tungstenite::{
+        tungstenite::{protocol::Role, Message},
+        WebSocketStream,
+    };
 
     use super::WebSocketRw;
 
@@ -207,9 +207,7 @@ mod tests {
         });
 
         js.spawn(async move {
-            let mut rw = pin!(WebSocketRw::new(
-                WebSocketStream::from_raw_socket(stream2, Role::Server, None).await
-            ));
+            let mut rw = pin!(WebSocketRw::new(WebSocketServer::after_handshake(stream2)));
 
             let mut buf = vec![0; 1024];
             let n = rw.read(&mut buf).await.unwrap();
diff --git a/pyproject.toml b/pyproject.toml
index 4ec8efc2ff..131d1121f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ pytest = "^7.4.4"
 psycopg2-binary = "^2.9.6"
 typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
-requests = "^2.31.0"
+requests = "^2.32.0"
 pytest-xdist = "^3.3.1"
 asyncpg = "^0.29.0"
 aiopg = "^1.4.0"
@@ -19,7 +19,7 @@ types-requests = "^2.31.0.0"
 types-psycopg2 = "^2.9.21.10"
 boto3 = "^1.34.11"
 boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
-moto = {extras = ["server"], version = "^4.1.2"}
+moto = {extras = ["server"], version = "^5.0.6"}
 backoff = "^2.2.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
diff --git a/s3_scrubber/README.md b/s3_scrubber/README.md
index c1deab8852..8a96542ada 100644
--- a/s3_scrubber/README.md
+++ b/s3_scrubber/README.md
@@ -9,11 +9,13 @@ and `safekeeper`, and does housekeeping such as cleaning up objects for tenants
 
 #### S3
 
-Do `aws sso login --profile dev` to get the SSO access to the bucket to clean, get the SSO_ACCOUNT_ID for your profile (`cat ~/.aws/config` may help).
+Do `aws sso login --profile dev` to get the SSO access to the bucket to clean.
+Also, set the following environment variables:
 
-- `SSO_ACCOUNT_ID`: Credentials id to use for accessing S3 buckets
+- `AWS_PROFILE`: Profile name to use for accessing S3 buckets (e.g. `dev`)
 - `REGION`: A region where the bucket is located at.
 - `BUCKET`: Bucket name
+- `BUCKET_PREFIX` (optional): Prefix inside the bucket
 
 #### Console API
 
@@ -43,7 +45,7 @@ processing by the `purge-garbage` subcommand.
 
 Example:
 
-`env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
+`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json`
 
 #### `purge-garbage`
 
@@ -59,7 +61,7 @@ to pass them on the command line
 
 Example:
 
-`env SSO_ACCOUNT_ID=123456 cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`
+`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json`
 
 Add the `--delete` argument before `purge-garbage` to enable deletion.  This is intentionally
 not provided inline in the example above to avoid accidents.  Without the `--delete` flag
@@ -72,7 +74,7 @@ Errors are logged to stderr and summary to stdout.
 
 For pageserver:
 ```
-env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
+env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver
 
 Timelines: 31106
 With errors: 3
diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs
index 7c0f699958..dd64a0a98f 100644
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -13,7 +13,7 @@ use crate::metadata_stream::stream_listing;
 use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
 use futures_util::StreamExt;
 use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
-use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use remote_storage::RemotePath;
 
@@ -110,7 +110,7 @@ pub(crate) fn branch_cleanup_and_check_errors(
                     for (layer, metadata) in index_part.layer_metadata {
                         if metadata.file_size == 0 {
                             result.errors.push(format!(
-                                "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(),
+                                "index_part.json contains a layer {} that has 0 size in its layer metadata", layer,
                             ))
                         }
 
@@ -121,7 +121,7 @@ pub(crate) fn branch_cleanup_and_check_errors(
                             // layer we think is missing.
                             result.errors.push(format!(
                                 "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage",
-                                layer.file_name(),
+                                layer,
                                 metadata.generation.get_suffix(),
                                 metadata.shard
                             ))
@@ -170,8 +170,7 @@ pub(crate) struct LayerRef {
 /// the tenant to query whether an object exists.
 #[derive(Default)]
 pub(crate) struct TenantObjectListing {
-    shard_timelines:
-        HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>,
+    shard_timelines: HashMap<(ShardIndex, TimelineId), HashMap<(LayerName, Generation), LayerRef>>,
 }
 
 impl TenantObjectListing {
@@ -180,7 +179,7 @@ impl TenantObjectListing {
     pub(crate) fn push(
         &mut self,
         ttid: TenantShardTimelineId,
-        layers: HashSet<(LayerFileName, Generation)>,
+        layers: HashSet<(LayerName, Generation)>,
     ) {
         let shard_index = ShardIndex::new(
             ttid.tenant_shard_id.shard_number,
@@ -208,7 +207,7 @@ impl TenantObjectListing {
     pub(crate) fn check_ref(
         &mut self,
         timeline_id: TimelineId,
-        layer_file: &LayerFileName,
+        layer_file: &LayerName,
         metadata: &IndexLayerMetadata,
     ) -> bool {
         let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
@@ -224,7 +223,7 @@ impl TenantObjectListing {
         true
     }
 
-    pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> {
+    pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerName, Generation)> {
         let mut result = Vec::new();
         for ((shard_index, timeline_id), layers) in &self.shard_timelines {
             for ((layer_file, generation), layer_ref) in layers {
@@ -247,25 +246,25 @@ pub(crate) struct S3TimelineBlobData {
 #[derive(Debug)]
 pub(crate) enum BlobDataParseResult {
     Parsed {
-        index_part: IndexPart,
+        index_part: Box<IndexPart>,
         index_part_generation: Generation,
-        s3_layers: HashSet<(LayerFileName, Generation)>,
+        s3_layers: HashSet<(LayerName, Generation)>,
     },
     /// The remains of a deleted Timeline (i.e. an initdb archive only)
     Relic,
     Incorrect(Vec<String>),
 }
 
-fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> {
+fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> {
     match name.rsplit_once('-') {
         // FIXME: this is gross, just use a regex?
         Some((layer_filename, gen)) if gen.len() == 8 => {
-            let layer = layer_filename.parse::<LayerFileName>()?;
+            let layer = layer_filename.parse::<LayerName>()?;
             let gen =
                 Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?;
             Ok((layer, gen))
         }
-        _ => Ok((name.parse::<LayerFileName>()?, Generation::none())),
+        _ => Ok((name.parse::<LayerName>()?, Generation::none())),
     }
 }
 
@@ -369,7 +368,7 @@ pub(crate) async fn list_timeline_blobs(
             Ok(index_part) => {
                 return Ok(S3TimelineBlobData {
                     blob_data: BlobDataParseResult::Parsed {
-                        index_part,
+                        index_part: Box::new(index_part),
                         index_part_generation,
                         s3_layers,
                     },
diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs
index e976e66748..e0f99ecd9c 100644
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -200,30 +200,15 @@ impl RootTarget {
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(deny_unknown_fields)]
 pub struct BucketConfig {
     pub region: String,
     pub bucket: String,
     pub prefix_in_bucket: Option<String>,
-
-    /// Use SSO if this is set, else rely on AWS_* environment vars
-    pub sso_account_id: Option<String>,
-}
-
-impl Display for BucketConfig {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{}/{}/{}",
-            self.sso_account_id.as_deref().unwrap_or("<none>"),
-            self.region,
-            self.bucket
-        )
-    }
 }
 
 impl BucketConfig {
     pub fn from_env() -> anyhow::Result<Self> {
-        let sso_account_id = env::var("SSO_ACCOUNT_ID").ok();
         let region = env::var("REGION").context("'REGION' param retrieval")?;
         let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?;
         let prefix_in_bucket = env::var("BUCKET_PREFIX").ok();
@@ -232,7 +217,6 @@ impl BucketConfig {
             region,
             bucket,
             prefix_in_bucket,
-            sso_account_id,
         })
     }
 }
@@ -276,7 +260,7 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
     guard
 }
 
-pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Client {
+pub fn init_s3_client(bucket_region: Region) -> Client {
     let credentials_provider = {
         // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
         let chain = CredentialsProviderChain::first_try(
@@ -290,7 +274,7 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
         );
 
         // Use SSO if we were given an account ID
-        match account_id {
+        match std::env::var("SSO_ACCOUNT_ID").ok() {
             Some(sso_account) => chain.or_else(
                 "sso",
                 SsoCredentialsProvider::builder()
@@ -312,7 +296,10 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
     let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
 
     let mut builder = Config::builder()
-        .behavior_version(BehaviorVersion::v2023_11_09())
+        .behavior_version(
+            #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
+            BehaviorVersion::v2023_11_09(),
+        )
         .region(bucket_region)
         .retry_config(RetryConfig::adaptive().with_max_attempts(3))
         .sleep_impl(SharedAsyncSleep::from(sleep_impl))
@@ -331,7 +318,7 @@ fn init_remote(
 ) -> anyhow::Result<(Arc<Client>, RootTarget)> {
     let bucket_region = Region::new(bucket_config.region);
     let delimiter = "/".to_string();
-    let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region));
+    let s3_client = Arc::new(init_s3_client(bucket_region));
 
     let s3_root = match node_kind {
         NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
diff --git a/s3_scrubber/src/tenant_snapshot.rs b/s3_scrubber/src/tenant_snapshot.rs
index 4eccad381b..a24a1e92ae 100644
--- a/s3_scrubber/src/tenant_snapshot.rs
+++ b/s3_scrubber/src/tenant_snapshot.rs
@@ -12,7 +12,7 @@ use aws_sdk_s3::Client;
 use camino::Utf8PathBuf;
 use futures::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
-use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use pageserver_api::shard::TenantShardId;
 use utils::generation::Generation;
@@ -48,16 +48,16 @@ impl SnapshotDownloader {
     async fn download_layer(
         &self,
         ttid: TenantShardTimelineId,
-        layer_name: LayerFileName,
+        layer_name: LayerName,
         layer_metadata: IndexLayerMetadata,
-    ) -> anyhow::Result<(LayerFileName, IndexLayerMetadata)> {
+    ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> {
         // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format.  They use
         // different layer names (remote-style has the generation suffix)
         let local_path = self.output_path.join(format!(
             "{}/timelines/{}/{}{}",
             ttid.tenant_shard_id,
             ttid.timeline_id,
-            layer_name.file_name(),
+            layer_name,
             layer_metadata.generation.get_suffix()
         ));
 
@@ -76,7 +76,7 @@ impl SnapshotDownloader {
             let remote_layer_path = format!(
                 "{}{}{}",
                 timeline_root.prefix_in_bucket,
-                layer_name.file_name(),
+                layer_name,
                 layer_metadata.generation.get_suffix()
             );
 
@@ -110,7 +110,7 @@ impl SnapshotDownloader {
     async fn download_layers(
         &self,
         ttid: TenantShardTimelineId,
-        layers: Vec<(LayerFileName, IndexLayerMetadata)>,
+        layers: Vec<(LayerName, IndexLayerMetadata)>,
     ) -> anyhow::Result<()> {
         let layer_count = layers.len();
         tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count);
@@ -138,7 +138,7 @@ impl SnapshotDownloader {
                     tracing::info!(
                         "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}",
                         layer_metadata.file_size,
-                        layer_name.file_name()
+                        layer_name
                     );
                 }
                 Err(e) => {
@@ -159,11 +159,11 @@ impl SnapshotDownloader {
     async fn download_timeline(
         &self,
         ttid: TenantShardTimelineId,
-        index_part: IndexPart,
+        index_part: Box<IndexPart>,
         index_part_generation: Generation,
         ancestor_layers: &mut HashMap<
             TenantShardTimelineId,
-            HashMap<LayerFileName, IndexLayerMetadata>,
+            HashMap<LayerName, IndexLayerMetadata>,
         >,
     ) -> anyhow::Result<()> {
         let index_bytes = serde_json::to_string(&index_part).unwrap();
@@ -234,7 +234,7 @@ impl SnapshotDownloader {
         // happen if this tenant has been split at some point)
         let mut ancestor_layers: HashMap<
             TenantShardTimelineId,
-            HashMap<LayerFileName, IndexLayerMetadata>,
+            HashMap<LayerName, IndexLayerMetadata>,
         > = Default::default();
 
         for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 09c565ce71..aee3898ac7 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -20,7 +20,6 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::Uri;
-use tokio::sync::mpsc;
 
 use tracing::*;
 use utils::pid_file;
@@ -30,13 +29,13 @@ use safekeeper::defaults::{
     DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
     DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
 };
+use safekeeper::remove_wal;
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
 use safekeeper::SafeKeeperConf;
 use safekeeper::{broker, WAL_SERVICE_RUNTIME};
 use safekeeper::{control_file, BROKER_RUNTIME};
 use safekeeper::{http, WAL_REMOVER_RUNTIME};
-use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
 use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
@@ -377,8 +376,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let timeline_collector = safekeeper::metrics::TimelineCollector::new();
     metrics::register_internal(Box::new(timeline_collector))?;
 
-    let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
-
     wal_backup::init_remote_storage(&conf);
 
     // Keep handles to main tasks to die if any of them disappears.
@@ -391,19 +388,9 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
     let current_thread_rt = conf
         .current_thread_runtime
         .then(|| Handle::try_current().expect("no runtime in main"));
-    let conf_ = conf.clone();
-    let wal_backup_handle = current_thread_rt
-        .as_ref()
-        .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
-        .spawn(wal_backup::wal_backup_launcher_task_main(
-            conf_,
-            wal_backup_launcher_rx,
-        ))
-        .map(|res| ("WAL backup launcher".to_owned(), res));
-    tasks_handles.push(Box::pin(wal_backup_handle));
 
     // Load all timelines from disk to memory.
-    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?;
+    GlobalTimelines::init(conf.clone()).await?;
 
     let conf_ = conf.clone();
     // Run everything in current thread rt, if asked.
diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index 98f58d3e49..7cc2142291 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -46,6 +46,8 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
         return Ok(());
     }
 
+    let active_timelines_set = GlobalTimelines::get_global_broker_active_set();
+
     let mut client =
         storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
     let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
@@ -57,15 +59,9 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
             // sensitive and there is no risk of deadlock as we don't await while
             // lock is held.
             let now = Instant::now();
-            let all_tlis = GlobalTimelines::get_all();
+            let all_tlis = active_timelines_set.get_all();
             let mut n_pushed_tlis = 0;
             for tli in &all_tlis {
-                // filtering alternative futures::stream::iter(all_tlis)
-                //   .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::<Vec<_>>().await;
-                // doesn't look better, and I'm not sure how to do that without collect.
-                if !tli.is_active().await {
-                    continue;
-                }
                 let sk_info = tli.get_safekeeper_info(&conf).await;
                 yield sk_info;
                 BROKER_PUSHED_UPDATES.inc();
@@ -90,6 +86,7 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
 }
 
 /// Subscribe and fetch all the interesting data from the broker.
+#[instrument(name = "broker pull", skip_all)]
 async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
     let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
 
@@ -186,6 +183,7 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
                         commit_lsn: sk_info.commit_lsn,
                         safekeeper_connstr: sk_info.safekeeper_connstr,
                         availability_zone: sk_info.availability_zone,
+                        standby_horizon: 0,
                     };
 
                     // note this is a blocking call
@@ -319,7 +317,7 @@ async fn task_stats(stats: Arc<BrokerStats>) {
 
                 let now = BrokerStats::now_millis();
                 if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 {
-                    let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
+                    let ts = chrono::DateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
                     info!("no broker updates for some time, last update: {:?}", ts);
                 }
             }
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 9ce26e6c5d..808bb1e490 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -350,6 +350,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
         backup_lsn: sk_info.backup_lsn.0,
         local_start_lsn: sk_info.local_start_lsn.0,
         availability_zone: None,
+        standby_horizon: sk_info.standby_horizon.0,
     };
 
     let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
@@ -519,6 +520,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
         .get("/v1/status", |r| request_span(r, status_handler))
         .put("/v1/failpoints", |r| {
             request_span(r, move |r| async {
+                check_permission(&r, None)?;
                 let cancel = CancellationToken::new();
                 failpoints_handler(r, cancel).await
             })
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 543714a54e..8d8d2cf23e 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -31,6 +31,8 @@ pub mod safekeeper;
 pub mod send_wal;
 pub mod state;
 pub mod timeline;
+pub mod timeline_manager;
+pub mod timelines_set;
 pub mod wal_backup;
 pub mod wal_backup_partial;
 pub mod wal_service;
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 28ae042bb3..1e965393e3 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -11,8 +11,9 @@ use futures::Future;
 use metrics::{
     core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
     proto::MetricFamily,
-    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge,
-    IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec,
+    register_int_counter, register_int_counter_pair, register_int_counter_pair_vec,
+    register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
+    IntGaugeVec,
 };
 use once_cell::sync::Lazy;
 
@@ -162,6 +163,29 @@ pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
     )
     .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
 });
+pub static MANAGER_ITERATIONS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_manager_iterations_total",
+        "Number of iterations of the timeline manager task"
+    )
+    .expect("Failed to register safekeeper_manager_iterations_total counter")
+});
+pub static MANAGER_ACTIVE_CHANGES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_manager_active_changes_total",
+        "Number of timeline active status changes in the timeline manager task"
+    )
+    .expect("Failed to register safekeeper_manager_active_changes_total counter")
+});
+pub static WAL_BACKUP_TASKS: Lazy<IntCounterPair> = Lazy::new(|| {
+    register_int_counter_pair!(
+        "safekeeper_wal_backup_tasks_started_total",
+        "Number of active WAL backup tasks",
+        "safekeeper_wal_backup_tasks_finished_total",
+        "Number of finished WAL backup tasks",
+    )
+    .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter")
+});
 
 pub const LABEL_UNKNOWN: &str = "unknown";
 
@@ -614,8 +638,7 @@ impl Collector for TimelineCollector {
         self.written_wal_seconds.reset();
         self.flushed_wal_seconds.reset();
 
-        let timelines = GlobalTimelines::get_all();
-        let timelines_count = timelines.len();
+        let timelines_count = GlobalTimelines::get_all().len();
         let mut active_timelines_count = 0;
 
         // Prometheus Collector is sync, and data is stored under async lock. To
@@ -746,9 +769,9 @@ impl Collector for TimelineCollector {
 
 async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
     let mut res = vec![];
-    let timelines = GlobalTimelines::get_all();
+    let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all();
 
-    for tli in timelines {
+    for tli in active_timelines {
         if let Some(info) = tli.info_for_metrics().await {
             res.push(info);
         }
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index 015b53bb2e..03cfa882c4 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -45,6 +45,9 @@ const DEFAULT_FEEDBACK_CAPACITY: usize = 8;
 pub struct WalReceivers {
     mutex: Mutex<WalReceiversShared>,
     pageserver_feedback_tx: tokio::sync::broadcast::Sender<PageserverFeedback>,
+
+    num_computes_tx: tokio::sync::watch::Sender<usize>,
+    num_computes_rx: tokio::sync::watch::Receiver<usize>,
 }
 
 /// Id under which walreceiver is registered in shmem.
@@ -55,16 +58,21 @@ impl WalReceivers {
         let (pageserver_feedback_tx, _) =
             tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY);
 
+        let (num_computes_tx, num_computes_rx) = tokio::sync::watch::channel(0usize);
+
         Arc::new(WalReceivers {
             mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
             pageserver_feedback_tx,
+            num_computes_tx,
+            num_computes_rx,
         })
     }
 
     /// Register new walreceiver. Returned guard provides access to the slot and
     /// automatically deregisters in Drop.
     pub fn register(self: &Arc<WalReceivers>, conn_id: Option<ConnectionId>) -> WalReceiverGuard {
-        let slots = &mut self.mutex.lock().slots;
+        let mut shared = self.mutex.lock();
+        let slots = &mut shared.slots;
         let walreceiver = WalReceiverState {
             conn_id,
             status: WalReceiverStatus::Voting,
@@ -78,6 +86,9 @@ impl WalReceivers {
             slots.push(Some(walreceiver));
             pos
         };
+
+        self.update_num(&shared);
+
         WalReceiverGuard {
             id: pos,
             walreceivers: self.clone(),
@@ -99,7 +110,18 @@ impl WalReceivers {
 
     /// Get number of walreceivers (compute connections).
     pub fn get_num(self: &Arc<WalReceivers>) -> usize {
-        self.mutex.lock().slots.iter().flatten().count()
+        self.mutex.lock().get_num()
+    }
+
+    /// Get channel for number of walreceivers.
+    pub fn get_num_rx(self: &Arc<WalReceivers>) -> tokio::sync::watch::Receiver<usize> {
+        self.num_computes_rx.clone()
+    }
+
+    /// Should get called after every update of slots.
+    fn update_num(self: &Arc<WalReceivers>, shared: &MutexGuard<WalReceiversShared>) {
+        let num = shared.get_num();
+        self.num_computes_tx.send_replace(num);
     }
 
     /// Get state of all walreceivers.
@@ -123,6 +145,7 @@ impl WalReceivers {
     fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
         let mut shared = self.mutex.lock();
         shared.slots[id] = None;
+        self.update_num(&shared);
     }
 
     /// Broadcast pageserver feedback to connected walproposers.
@@ -137,6 +160,13 @@ struct WalReceiversShared {
     slots: Vec<Option<WalReceiverState>>,
 }
 
+impl WalReceiversShared {
+    /// Get number of walreceivers (compute connections).
+    fn get_num(&self) -> usize {
+        self.slots.iter().flatten().count()
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalReceiverState {
     /// None means it is recovery initiated by us (this safekeeper).
@@ -183,9 +213,19 @@ impl SafekeeperPostgresHandler {
         &mut self,
         pgb: &mut PostgresBackend<IO>,
     ) -> Result<(), QueryError> {
-        if let Err(end) = self.handle_start_wal_push_guts(pgb).await {
+        let mut tli: Option<Arc<Timeline>> = None;
+        if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
             // Log the result and probably send it to the client, closing the stream.
-            pgb.handle_copy_stream_end(end).await;
+            let handle_end_fut = pgb.handle_copy_stream_end(end);
+            // If we managed to create the timeline, augment logging with current LSNs etc.
+            if let Some(tli) = tli {
+                let info = tli.get_safekeeper_info(&self.conf).await;
+                handle_end_fut
+                    .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn)))
+                    .await;
+            } else {
+                handle_end_fut.await;
+            }
         }
         Ok(())
     }
@@ -193,6 +233,7 @@ impl SafekeeperPostgresHandler {
     pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
         &mut self,
         pgb: &mut PostgresBackend<IO>,
+        tli: &mut Option<Arc<Timeline>>,
     ) -> Result<(), CopyStreamHandlerEnd> {
         // Notify the libpq client that it's allowed to send `CopyData` messages
         pgb.write_message(&BeMessage::CopyBothResponse).await?;
@@ -222,13 +263,17 @@ impl SafekeeperPostgresHandler {
         // Read first message and create timeline if needed.
         let res = network_reader.read_first_message().await;
 
-        let res = if let Ok((tli, next_msg)) = res {
+        let network_res = if let Ok((timeline, next_msg)) = res {
             let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
-                tli.get_walreceivers().pageserver_feedback_tx.subscribe();
+                timeline
+                    .get_walreceivers()
+                    .pageserver_feedback_tx
+                    .subscribe();
+            *tli = Some(timeline.clone());
 
             tokio::select! {
                 // todo: add read|write .context to these errors
-                r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r,
+                r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r,
                 r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
             }
         } else {
@@ -244,13 +289,13 @@ impl SafekeeperPostgresHandler {
         match acceptor_handle {
             None => {
                 // failed even before spawning; read_network should have error
-                Err(res.expect_err("no error with WalAcceptor not spawn"))
+                Err(network_res.expect_err("no error with WalAcceptor not spawn"))
             }
             Some(handle) => {
                 let wal_acceptor_res = handle.await;
 
                 // If there was any network error, return it.
-                res?;
+                network_res?;
 
                 // Otherwise, WalAcceptor thread must have errored.
                 match wal_acceptor_res {
@@ -441,14 +486,7 @@ impl WalAcceptor {
     /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
     /// it must mean that network thread terminated.
     async fn run(&mut self) -> anyhow::Result<()> {
-        // Register the connection and defer unregister.
-        // Order of the next two lines is important: we want first to remove our entry and then
-        // update status which depends on registered connections.
-        let _compute_conn_guard = ComputeConnectionGuard {
-            timeline: Arc::clone(&self.tli),
-        };
         let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
-        self.tli.update_status_notify().await?;
 
         // After this timestamp we will stop processing AppendRequests and send a response
         // to the walproposer. walproposer sends at least one AppendRequest per second,
@@ -514,19 +552,3 @@ impl WalAcceptor {
         }
     }
 }
-
-/// Calls update_status_notify in drop to update timeline status.
-struct ComputeConnectionGuard {
-    timeline: Arc<Timeline>,
-}
-
-impl Drop for ComputeConnectionGuard {
-    fn drop(&mut self) {
-        let tli = self.timeline.clone();
-        tokio::spawn(async move {
-            if let Err(e) = tli.update_status_notify().await {
-                error!("failed to update timeline status: {}", e);
-            }
-        });
-    }
-}
diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs
index e8fa6c55f4..dfa1892c40 100644
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -37,17 +37,11 @@ use crate::{
 #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
 pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
     info!("started");
-    let mut cancellation_rx = match tli.get_cancellation_rx() {
-        Ok(rx) => rx,
-        Err(_) => {
-            info!("timeline canceled during task start");
-            return;
-        }
-    };
 
+    let cancel = tli.cancel.clone();
     select! {
         _ = recovery_main_loop(tli, conf) => { unreachable!() }
-        _ = cancellation_rx.changed() => {
+        _ = cancel.cancelled() => {
             info!("stopped");
         }
     }
diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs
index 9dce06a886..98ce671182 100644
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -7,29 +7,18 @@ use tracing::*;
 
 use crate::{GlobalTimelines, SafeKeeperConf};
 
-const ALLOW_INACTIVE_TIMELINES: bool = true;
-
-pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
+pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
     let wal_removal_interval = Duration::from_millis(5000);
     loop {
         let now = tokio::time::Instant::now();
-        let mut active_timelines = 0;
-
         let tlis = GlobalTimelines::get_all();
         for tli in &tlis {
-            let is_active = tli.is_active().await;
-            if is_active {
-                active_timelines += 1;
-            }
-            if !ALLOW_INACTIVE_TIMELINES && !is_active {
-                continue;
-            }
             let ttid = tli.ttid;
             async {
                 if let Err(e) = tli.maybe_persist_control_file().await {
                     warn!("failed to persist control file: {e}");
                 }
-                if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled).await {
+                if let Err(e) = tli.remove_old_wal().await {
                     error!("failed to remove WAL: {}", e);
                 }
             }
@@ -42,8 +31,8 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
 
         if elapsed > wal_removal_interval {
             info!(
-                "WAL removal is too long, processed {} active timelines ({} total) in {:?}",
-                active_timelines, total_timelines, elapsed
+                "WAL removal is too long, processed {} timelines in {:?}",
+                total_timelines, elapsed
             );
         }
 
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 59a8c595ab..5a9745e1c9 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -23,7 +23,7 @@ use utils::failpoint_support;
 use utils::id::TenantTimelineId;
 use utils::pageserver_feedback::PageserverFeedback;
 
-use std::cmp::min;
+use std::cmp::{max, min};
 use std::net::SocketAddr;
 use std::str;
 use std::sync::Arc;
@@ -85,8 +85,17 @@ impl StandbyReply {
 
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub struct StandbyFeedback {
-    reply: StandbyReply,
-    hs_feedback: HotStandbyFeedback,
+    pub reply: StandbyReply,
+    pub hs_feedback: HotStandbyFeedback,
+}
+
+impl StandbyFeedback {
+    pub fn empty() -> Self {
+        StandbyFeedback {
+            reply: StandbyReply::empty(),
+            hs_feedback: HotStandbyFeedback::empty(),
+        }
+    }
 }
 
 /// WalSenders registry. Timeline holds it (wrapped in Arc).
@@ -162,8 +171,8 @@ impl WalSenders {
     }
 
     /// Get aggregated hot standby feedback (we send it to compute).
-    pub fn get_hotstandby(self: &Arc<WalSenders>) -> HotStandbyFeedback {
-        self.mutex.lock().agg_hs_feedback
+    pub fn get_hotstandby(self: &Arc<WalSenders>) -> StandbyFeedback {
+        self.mutex.lock().agg_standby_feedback
     }
 
     /// Record new pageserver feedback, update aggregated values.
@@ -184,6 +193,10 @@ impl WalSenders {
     fn record_standby_reply(self: &Arc<WalSenders>, id: WalSenderId, reply: &StandbyReply) {
         let mut shared = self.mutex.lock();
         let slot = shared.get_slot_mut(id);
+        debug!(
+            "Record standby reply: ts={} apply_lsn={}",
+            reply.reply_ts, reply.apply_lsn
+        );
         match &mut slot.feedback {
             ReplicationFeedback::Standby(sf) => sf.reply = *reply,
             ReplicationFeedback::Pageserver(_) => {
@@ -208,7 +221,7 @@ impl WalSenders {
                 })
             }
         }
-        shared.update_hs_feedback();
+        shared.update_reply_feedback();
     }
 
     /// Get remote_consistent_lsn reported by the pageserver. Returns None if
@@ -226,13 +239,13 @@ impl WalSenders {
     fn unregister(self: &Arc<WalSenders>, id: WalSenderId) {
         let mut shared = self.mutex.lock();
         shared.slots[id] = None;
-        shared.update_hs_feedback();
+        shared.update_reply_feedback();
     }
 }
 
 struct WalSendersShared {
     // aggregated over all walsenders value
-    agg_hs_feedback: HotStandbyFeedback,
+    agg_standby_feedback: StandbyFeedback,
     // last feedback ever received from any pageserver, empty if none
     last_ps_feedback: PageserverFeedback,
     // total counter of pageserver feedbacks received
@@ -243,7 +256,7 @@ struct WalSendersShared {
 impl WalSendersShared {
     fn new() -> Self {
         WalSendersShared {
-            agg_hs_feedback: HotStandbyFeedback::empty(),
+            agg_standby_feedback: StandbyFeedback::empty(),
             last_ps_feedback: PageserverFeedback::empty(),
             ps_feedback_counter: 0,
             slots: Vec::new(),
@@ -260,10 +273,11 @@ impl WalSendersShared {
         self.slots[id].as_mut().expect("walsender doesn't exist")
     }
 
-    /// Update aggregated hot standy feedback. We just take min of valid xmins
+    /// Update aggregated hot standy and normal reply feedbacks. We just take min of valid xmins
     /// and ts.
-    fn update_hs_feedback(&mut self) {
+    fn update_reply_feedback(&mut self) {
         let mut agg = HotStandbyFeedback::empty();
+        let mut reply_agg = StandbyReply::empty();
         for ws_state in self.slots.iter().flatten() {
             if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
                 let hs_feedback = standby_feedback.hs_feedback;
@@ -276,7 +290,7 @@ impl WalSendersShared {
                     } else {
                         agg.xmin = hs_feedback.xmin;
                     }
-                    agg.ts = min(agg.ts, hs_feedback.ts);
+                    agg.ts = max(agg.ts, hs_feedback.ts);
                 }
                 if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
                     if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
@@ -284,11 +298,43 @@ impl WalSendersShared {
                     } else {
                         agg.catalog_xmin = hs_feedback.catalog_xmin;
                     }
-                    agg.ts = min(agg.ts, hs_feedback.ts);
+                    agg.ts = max(agg.ts, hs_feedback.ts);
+                }
+                let reply = standby_feedback.reply;
+                if reply.write_lsn != Lsn::INVALID {
+                    if reply_agg.write_lsn != Lsn::INVALID {
+                        reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn);
+                    } else {
+                        reply_agg.write_lsn = reply.write_lsn;
+                    }
+                }
+                if reply.flush_lsn != Lsn::INVALID {
+                    if reply_agg.flush_lsn != Lsn::INVALID {
+                        reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn);
+                    } else {
+                        reply_agg.flush_lsn = reply.flush_lsn;
+                    }
+                }
+                if reply.apply_lsn != Lsn::INVALID {
+                    if reply_agg.apply_lsn != Lsn::INVALID {
+                        reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn);
+                    } else {
+                        reply_agg.apply_lsn = reply.apply_lsn;
+                    }
+                }
+                if reply.reply_ts != 0 {
+                    if reply_agg.reply_ts != 0 {
+                        reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts);
+                    } else {
+                        reply_agg.reply_ts = reply.reply_ts;
+                    }
                 }
             }
         }
-        self.agg_hs_feedback = agg;
+        self.agg_standby_feedback = StandbyFeedback {
+            reply: reply_agg,
+            hs_feedback: agg,
+        };
     }
 }
 
@@ -340,12 +386,16 @@ impl SafekeeperPostgresHandler {
         start_pos: Lsn,
         term: Option<Term>,
     ) -> Result<(), QueryError> {
+        let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
         if let Err(end) = self
-            .handle_start_replication_guts(pgb, start_pos, term)
+            .handle_start_replication_guts(pgb, start_pos, term, tli.clone())
             .await
         {
+            let info = tli.get_safekeeper_info(&self.conf).await;
             // Log the result and probably send it to the client, closing the stream.
-            pgb.handle_copy_stream_end(end).await;
+            pgb.handle_copy_stream_end(end)
+            .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.flush_lsn)))
+            .await;
         }
         Ok(())
     }
@@ -355,10 +405,9 @@ impl SafekeeperPostgresHandler {
         pgb: &mut PostgresBackend<IO>,
         start_pos: Lsn,
         term: Option<Term>,
+        tli: Arc<Timeline>,
     ) -> Result<(), CopyStreamHandlerEnd> {
         let appname = self.appname.clone();
-        let tli =
-            GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?;
 
         // Use a guard object to remove our entry from the timeline when we are done.
         let ws_guard = Arc::new(tli.get_walsenders().register(
@@ -707,8 +756,15 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
         match msg.first().cloned() {
             Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
                 // Note: deserializing is on m[1..] because we skip the tag byte.
-                let hs_feedback = HotStandbyFeedback::des(&msg[1..])
+                let mut hs_feedback = HotStandbyFeedback::des(&msg[1..])
                     .context("failed to deserialize HotStandbyFeedback")?;
+                // TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way:
+                // pq_sendint32(&reply_message, xmin);
+                // pq_sendint32(&reply_message, xmin_epoch);
+                // So it is two big endian 32-bit words in low endian order!
+                hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32);
+                hs_feedback.catalog_xmin =
+                    (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32);
                 self.ws_guard
                     .walsenders
                     .record_hs_feedback(self.ws_guard.id, &hs_feedback);
@@ -790,8 +846,11 @@ mod tests {
     fn test_hs_feedback_no_valid() {
         let mut wss = WalSendersShared::new();
         push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
-        wss.update_hs_feedback();
-        assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID);
+        wss.update_reply_feedback();
+        assert_eq!(
+            wss.agg_standby_feedback.hs_feedback.xmin,
+            INVALID_FULL_TRANSACTION_ID
+        );
     }
 
     #[test]
@@ -800,7 +859,7 @@ mod tests {
         push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
         push_feedback(&mut wss, hs_feedback(1, 42));
         push_feedback(&mut wss, hs_feedback(1, 64));
-        wss.update_hs_feedback();
-        assert_eq!(wss.agg_hs_feedback.xmin, 42);
+        wss.update_reply_feedback();
+        assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42);
     }
 }
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 64f764f191..89c157d514 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -6,15 +6,15 @@ use camino::Utf8PathBuf;
 use postgres_ffi::XLogSegNo;
 use serde::{Deserialize, Serialize};
 use tokio::fs;
+use tokio_util::sync::CancellationToken;
 
 use std::cmp::max;
+use std::ops::{Deref, DerefMut};
+use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 use std::time::Duration;
-use tokio::sync::{Mutex, MutexGuard};
-use tokio::{
-    sync::{mpsc::Sender, watch},
-    time::Instant,
-};
+use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
+use tokio::{sync::watch, time::Instant};
 use tracing::*;
 use utils::http::error::ApiError;
 use utils::{
@@ -33,12 +33,13 @@ use crate::safekeeper::{
 };
 use crate::send_wal::WalSenders;
 use crate::state::{TimelineMemState, TimelinePersistentState};
+use crate::timelines_set::TimelinesSet;
 use crate::wal_backup::{self};
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
 
 use crate::metrics::FullTimelineInfo;
 use crate::wal_storage::Storage as wal_storage_iface;
-use crate::{debug_dump, wal_backup_partial, wal_storage};
+use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage};
 use crate::{GlobalTimelines, SafeKeeperConf};
 
 /// Things safekeeper should know about timeline state on peers.
@@ -51,8 +52,7 @@ pub struct PeerInfo {
     /// LSN of the last record.
     pub flush_lsn: Lsn,
     pub commit_lsn: Lsn,
-    /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
-    /// sk since backup_lsn.
+    /// Since which LSN safekeeper has WAL.
     pub local_start_lsn: Lsn,
     /// When info was received. Serde annotations are not very useful but make
     /// the code compile -- we don't rely on this field externally.
@@ -97,25 +97,72 @@ impl PeersInfo {
     }
 }
 
+pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>;
+
+/// WriteGuardSharedState is a wrapper around `RwLockWriteGuard<SharedState>` that
+/// automatically updates `watch::Sender` channels with state on drop.
+pub struct WriteGuardSharedState<'a> {
+    tli: Arc<Timeline>,
+    guard: RwLockWriteGuard<'a, SharedState>,
+}
+
+impl<'a> WriteGuardSharedState<'a> {
+    fn new(tli: Arc<Timeline>, guard: RwLockWriteGuard<'a, SharedState>) -> Self {
+        WriteGuardSharedState { tli, guard }
+    }
+}
+
+impl<'a> Deref for WriteGuardSharedState<'a> {
+    type Target = SharedState;
+
+    fn deref(&self) -> &Self::Target {
+        &self.guard
+    }
+}
+
+impl<'a> DerefMut for WriteGuardSharedState<'a> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.guard
+    }
+}
+
+impl<'a> Drop for WriteGuardSharedState<'a> {
+    fn drop(&mut self) {
+        let term_flush_lsn = TermLsn::from((self.guard.sk.get_term(), self.guard.sk.flush_lsn()));
+        let commit_lsn = self.guard.sk.state.inmem.commit_lsn;
+
+        let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| {
+            if *old != term_flush_lsn {
+                *old = term_flush_lsn;
+                true
+            } else {
+                false
+            }
+        });
+
+        let _ = self.tli.commit_lsn_watch_tx.send_if_modified(|old| {
+            if *old != commit_lsn {
+                *old = commit_lsn;
+                true
+            } else {
+                false
+            }
+        });
+
+        // send notification about shared state update
+        self.tli.shared_state_version_tx.send_modify(|old| {
+            *old += 1;
+        });
+    }
+}
+
 /// Shared state associated with database instance
 pub struct SharedState {
     /// Safekeeper object
-    sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
+    pub(crate) sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
     /// In memory list containing state of peers sent in latest messages from them.
-    peers_info: PeersInfo,
-    /// True when WAL backup launcher oversees the timeline, making sure WAL is
-    /// offloaded, allows to bother launcher less.
-    wal_backup_active: bool,
-    /// True whenever there is at least some pending activity on timeline: live
-    /// compute connection, pageserver is not caughtup (it must have latest WAL
-    /// for new compute start) or WAL backuping is not finished. Practically it
-    /// means safekeepers broadcast info to peers about the timeline, old WAL is
-    /// trimmed.
-    ///
-    /// TODO: it might be better to remove tli completely from GlobalTimelines
-    /// when tli is inactive instead of having this flag.
-    active: bool,
-    last_removed_segno: XLogSegNo,
+    pub(crate) peers_info: PeersInfo,
+    pub(crate) last_removed_segno: XLogSegNo,
 }
 
 impl SharedState {
@@ -152,8 +199,6 @@ impl SharedState {
         Ok(Self {
             sk,
             peers_info: PeersInfo(vec![]),
-            wal_backup_active: false,
-            active: false,
             last_removed_segno: 0,
         })
     }
@@ -171,75 +216,10 @@ impl SharedState {
         Ok(Self {
             sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
             peers_info: PeersInfo(vec![]),
-            wal_backup_active: false,
-            active: false,
             last_removed_segno: 0,
         })
     }
 
-    fn is_active(&self, num_computes: usize) -> bool {
-        self.is_wal_backup_required(num_computes)
-            // FIXME: add tracking of relevant pageservers and check them here individually,
-            // otherwise migration won't work (we suspend too early).
-            || self.sk.state.inmem.remote_consistent_lsn < self.sk.state.inmem.commit_lsn
-    }
-
-    /// Mark timeline active/inactive and return whether s3 offloading requires
-    /// start/stop action. If timeline is deactivated, control file is persisted
-    /// as maintenance task does that only for active timelines.
-    async fn update_status(&mut self, num_computes: usize, ttid: TenantTimelineId) -> bool {
-        let is_active = self.is_active(num_computes);
-        if self.active != is_active {
-            info!(
-                "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}",
-                ttid,
-                is_active,
-                self.sk.state.inmem.remote_consistent_lsn,
-                self.sk.state.inmem.commit_lsn
-            );
-            if !is_active {
-                if let Err(e) = self.sk.state.flush().await {
-                    warn!("control file save in update_status failed: {:?}", e);
-                }
-            }
-        }
-        self.active = is_active;
-        self.is_wal_backup_action_pending(num_computes)
-    }
-
-    /// Should we run s3 offloading in current state?
-    fn is_wal_backup_required(&self, num_computes: usize) -> bool {
-        let seg_size = self.get_wal_seg_size();
-        num_computes > 0 ||
-        // Currently only the whole segment is offloaded, so compare segment numbers.
-            (self.sk.state.inmem.commit_lsn.segment_number(seg_size) >
-             self.sk.state.inmem.backup_lsn.segment_number(seg_size))
-    }
-
-    /// Is current state of s3 offloading is not what it ought to be?
-    fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool {
-        let res = self.wal_backup_active != self.is_wal_backup_required(num_computes);
-        if res {
-            let action_pending = if self.is_wal_backup_required(num_computes) {
-                "start"
-            } else {
-                "stop"
-            };
-            trace!(
-                "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}",
-                self.sk.state.timeline_id, action_pending, num_computes, self.sk.state.inmem.commit_lsn, self.sk.state.inmem.backup_lsn
-            );
-        }
-        res
-    }
-
-    /// Returns whether s3 offloading is required and sets current status as
-    /// matching.
-    fn wal_backup_attend(&mut self, num_computes: usize) -> bool {
-        self.wal_backup_active = self.is_wal_backup_required(num_computes);
-        self.wal_backup_active
-    }
-
     fn get_wal_seg_size(&self) -> usize {
         self.sk.state.server.wal_seg_size as usize
     }
@@ -248,6 +228,7 @@ impl SharedState {
         &self,
         ttid: &TenantTimelineId,
         conf: &SafeKeeperConf,
+        standby_apply_lsn: Lsn,
     ) -> SafekeeperTimelineInfo {
         SafekeeperTimelineInfo {
             safekeeper_id: conf.my_id.0,
@@ -270,13 +251,14 @@ impl SharedState {
             backup_lsn: self.sk.state.inmem.backup_lsn.0,
             local_start_lsn: self.sk.state.local_start_lsn.0,
             availability_zone: conf.availability_zone.clone(),
+            standby_horizon: standby_apply_lsn.0,
         }
     }
 
     /// Get our latest view of alive peers status on the timeline.
     /// We pass our own info through the broker as well, so when we don't have connection
     /// to the broker returned vec is empty.
-    fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
+    pub(crate) fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
         let now = Instant::now();
         self.peers_info
             .0
@@ -292,18 +274,13 @@ impl SharedState {
     /// offloading.
     /// While it is safe to use inmem values for determining horizon,
     /// we use persistent to make possible normal states less surprising.
-    fn get_horizon_segno(
-        &self,
-        wal_backup_enabled: bool,
-        extra_horizon_lsn: Option<Lsn>,
-    ) -> XLogSegNo {
+    fn get_horizon_segno(&self, extra_horizon_lsn: Option<Lsn>) -> XLogSegNo {
         let state = &self.sk.state;
 
         use std::cmp::min;
         let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
-        if wal_backup_enabled {
-            horizon_lsn = min(horizon_lsn, state.backup_lsn);
-        }
+        // we don't want to remove WAL that is not yet offloaded to s3
+        horizon_lsn = min(horizon_lsn, state.backup_lsn);
         if let Some(extra_horizon_lsn) = extra_horizon_lsn {
             horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
         }
@@ -344,11 +321,6 @@ impl From<TimelineError> for ApiError {
 pub struct Timeline {
     pub ttid: TenantTimelineId,
 
-    /// Sending here asks for wal backup launcher attention (start/stop
-    /// offloading). Sending ttid instead of concrete command allows to do
-    /// sending without timeline lock.
-    pub wal_backup_launcher_tx: Sender<TenantTimelineId>,
-
     /// Used to broadcast commit_lsn updates to all background jobs.
     commit_lsn_watch_tx: watch::Sender<Lsn>,
     commit_lsn_watch_rx: watch::Receiver<Lsn>,
@@ -360,19 +332,19 @@ pub struct Timeline {
     term_flush_lsn_watch_tx: watch::Sender<TermLsn>,
     term_flush_lsn_watch_rx: watch::Receiver<TermLsn>,
 
+    /// Broadcasts shared state updates.
+    shared_state_version_tx: watch::Sender<usize>,
+    shared_state_version_rx: watch::Receiver<usize>,
+
     /// Safekeeper and other state, that should remain consistent and
     /// synchronized with the disk. This is tokio mutex as we write WAL to disk
     /// while holding it, ensuring that consensus checks are in order.
-    mutex: Mutex<SharedState>,
+    mutex: RwLock<SharedState>,
     walsenders: Arc<WalSenders>,
     walreceivers: Arc<WalReceivers>,
 
-    /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
-    cancellation_tx: watch::Sender<bool>,
-
-    /// Timeline should not be used after cancellation. Background tasks should
-    /// monitor this channel and stop eventually after receiving `true` from this channel.
-    cancellation_rx: watch::Receiver<bool>,
+    /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
+    pub(crate) cancel: CancellationToken,
 
     /// Directory where timeline state is stored.
     pub timeline_dir: Utf8PathBuf,
@@ -382,15 +354,15 @@ pub struct Timeline {
     /// with different speed.
     // TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
     walsenders_keep_horizon: bool,
+
+    // timeline_manager controlled state
+    pub(crate) broker_active: AtomicBool,
+    pub(crate) wal_backup_active: AtomicBool,
 }
 
 impl Timeline {
     /// Load existing timeline from disk.
-    pub fn load_timeline(
-        conf: &SafeKeeperConf,
-        ttid: TenantTimelineId,
-        wal_backup_launcher_tx: Sender<TenantTimelineId>,
-    ) -> Result<Timeline> {
+    pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Timeline> {
         let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
 
         let shared_state = SharedState::restore(conf, &ttid)?;
@@ -400,23 +372,25 @@ impl Timeline {
             shared_state.sk.get_term(),
             shared_state.sk.flush_lsn(),
         )));
-        let (cancellation_tx, cancellation_rx) = watch::channel(false);
+        let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
 
         let walreceivers = WalReceivers::new();
         Ok(Timeline {
             ttid,
-            wal_backup_launcher_tx,
             commit_lsn_watch_tx,
             commit_lsn_watch_rx,
             term_flush_lsn_watch_tx,
             term_flush_lsn_watch_rx,
-            mutex: Mutex::new(shared_state),
+            shared_state_version_tx,
+            shared_state_version_rx,
+            mutex: RwLock::new(shared_state),
             walsenders: WalSenders::new(walreceivers.clone()),
             walreceivers,
-            cancellation_rx,
-            cancellation_tx,
+            cancel: CancellationToken::default(),
             timeline_dir: conf.timeline_dir(&ttid),
             walsenders_keep_horizon: conf.walsenders_keep_horizon,
+            broker_active: AtomicBool::new(false),
+            wal_backup_active: AtomicBool::new(false),
         })
     }
 
@@ -424,7 +398,6 @@ impl Timeline {
     pub fn create_empty(
         conf: &SafeKeeperConf,
         ttid: TenantTimelineId,
-        wal_backup_launcher_tx: Sender<TenantTimelineId>,
         server_info: ServerInfo,
         commit_lsn: Lsn,
         local_start_lsn: Lsn,
@@ -432,25 +405,28 @@ impl Timeline {
         let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
         let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
             watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
-        let (cancellation_tx, cancellation_rx) = watch::channel(false);
+        let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
+
         let state =
             TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
 
         let walreceivers = WalReceivers::new();
         Ok(Timeline {
             ttid,
-            wal_backup_launcher_tx,
             commit_lsn_watch_tx,
             commit_lsn_watch_rx,
             term_flush_lsn_watch_tx,
             term_flush_lsn_watch_rx,
-            mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
+            shared_state_version_tx,
+            shared_state_version_rx,
+            mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?),
             walsenders: WalSenders::new(walreceivers.clone()),
             walreceivers,
-            cancellation_rx,
-            cancellation_tx,
+            cancel: CancellationToken::default(),
             timeline_dir: conf.timeline_dir(&ttid),
             walsenders_keep_horizon: conf.walsenders_keep_horizon,
+            broker_active: AtomicBool::new(false),
+            wal_backup_active: AtomicBool::new(false),
         })
     }
 
@@ -461,8 +437,9 @@ impl Timeline {
     /// and state on disk should remain unchanged.
     pub async fn init_new(
         self: &Arc<Timeline>,
-        shared_state: &mut MutexGuard<'_, SharedState>,
+        shared_state: &mut WriteGuardSharedState<'_>,
         conf: &SafeKeeperConf,
+        broker_active_set: Arc<TimelinesSet>,
     ) -> Result<()> {
         match fs::metadata(&self.timeline_dir).await {
             Ok(_) => {
@@ -493,16 +470,29 @@ impl Timeline {
 
             return Err(e);
         }
-        self.bootstrap(conf);
+        self.bootstrap(conf, broker_active_set);
         Ok(())
     }
 
-    /// Bootstrap new or existing timeline starting background stasks.
-    pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
+    /// Bootstrap new or existing timeline starting background tasks.
+    pub fn bootstrap(
+        self: &Arc<Timeline>,
+        conf: &SafeKeeperConf,
+        broker_active_set: Arc<TimelinesSet>,
+    ) {
+        // Start manager task which will monitor timeline state and update
+        // background tasks.
+        tokio::spawn(timeline_manager::main_task(
+            self.clone(),
+            conf.clone(),
+            broker_active_set,
+        ));
+
         // Start recovery task which always runs on the timeline.
         if conf.peer_recovery_enabled {
             tokio::spawn(recovery_main(self.clone(), conf.clone()));
         }
+        // TODO: migrate to timeline_manager
         if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
             tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
         }
@@ -515,10 +505,9 @@ impl Timeline {
     /// deletion API endpoint is retriable.
     pub async fn delete(
         &self,
-        shared_state: &mut MutexGuard<'_, SharedState>,
+        shared_state: &mut WriteGuardSharedState<'_>,
         only_local: bool,
-    ) -> Result<(bool, bool)> {
-        let was_active = shared_state.active;
+    ) -> Result<bool> {
         self.cancel(shared_state);
 
         // TODO: It's better to wait for s3 offloader termination before
@@ -532,20 +521,14 @@ impl Timeline {
             wal_backup::delete_timeline(&self.ttid).await?;
         }
         let dir_existed = delete_dir(&self.timeline_dir).await?;
-        Ok((dir_existed, was_active))
+        Ok(dir_existed)
     }
 
     /// Cancel timeline to prevent further usage. Background tasks will stop
     /// eventually after receiving cancellation signal.
-    ///
-    /// Note that we can't notify backup launcher here while holding
-    /// shared_state lock, as this is a potential deadlock: caller is
-    /// responsible for that. Generally we should probably make WAL backup tasks
-    /// to shut down on their own, checking once in a while whether it is the
-    /// time.
-    fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) {
+    fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) {
         info!("timeline {} is cancelled", self.ttid);
-        let _ = self.cancellation_tx.send(true);
+        self.cancel.cancel();
         // Close associated FDs. Nobody will be able to touch timeline data once
         // it is cancelled, so WAL storage won't be opened again.
         shared_state.sk.wal_store.close();
@@ -553,44 +536,16 @@ impl Timeline {
 
     /// Returns if timeline is cancelled.
     pub fn is_cancelled(&self) -> bool {
-        *self.cancellation_rx.borrow()
-    }
-
-    /// Returns watch channel which gets value when timeline is cancelled. It is
-    /// guaranteed to have not cancelled value observed (errors otherwise).
-    pub fn get_cancellation_rx(&self) -> Result<watch::Receiver<bool>> {
-        let rx = self.cancellation_rx.clone();
-        if *rx.borrow() {
-            bail!(TimelineError::Cancelled(self.ttid));
-        }
-        Ok(rx)
+        self.cancel.is_cancelled()
     }
 
     /// Take a writing mutual exclusive lock on timeline shared_state.
-    pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
-        self.mutex.lock().await
+    pub async fn write_shared_state<'a>(self: &'a Arc<Self>) -> WriteGuardSharedState<'a> {
+        WriteGuardSharedState::new(self.clone(), self.mutex.write().await)
     }
 
-    async fn update_status(&self, shared_state: &mut SharedState) -> bool {
-        shared_state
-            .update_status(self.walreceivers.get_num(), self.ttid)
-            .await
-    }
-
-    /// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
-    pub async fn update_status_notify(&self) -> Result<()> {
-        if self.is_cancelled() {
-            bail!(TimelineError::Cancelled(self.ttid));
-        }
-        let is_wal_backup_action_pending: bool = {
-            let mut shared_state = self.write_shared_state().await;
-            self.update_status(&mut shared_state).await
-        };
-        if is_wal_backup_action_pending {
-            // Can fail only if channel to a static thread got closed, which is not normal at all.
-            self.wal_backup_launcher_tx.send(self.ttid).await?;
-        }
-        Ok(())
+    pub async fn read_shared_state(&self) -> ReadGuardSharedState {
+        self.mutex.read().await
     }
 
     /// Returns true if walsender should stop sending WAL to pageserver. We
@@ -602,7 +557,7 @@ impl Timeline {
         if self.is_cancelled() {
             return true;
         }
-        let shared_state = self.write_shared_state().await;
+        let shared_state = self.read_shared_state().await;
         if self.walreceivers.get_num() == 0 {
             return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
             reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
@@ -610,9 +565,9 @@ impl Timeline {
         false
     }
 
-    /// Ensure taht current term is t, erroring otherwise, and lock the state.
-    pub async fn acquire_term(&self, t: Term) -> Result<MutexGuard<SharedState>> {
-        let ss = self.write_shared_state().await;
+    /// Ensure that current term is t, erroring otherwise, and lock the state.
+    pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
+        let ss = self.read_shared_state().await;
         if ss.sk.state.acceptor_state.term != t {
             bail!(
                 "failed to acquire term {}, current term {}",
@@ -623,18 +578,6 @@ impl Timeline {
         Ok(ss)
     }
 
-    /// Returns whether s3 offloading is required and sets current status as
-    /// matching it.
-    pub async fn wal_backup_attend(&self) -> bool {
-        if self.is_cancelled() {
-            return false;
-        }
-
-        self.write_shared_state()
-            .await
-            .wal_backup_attend(self.walreceivers.get_num())
-    }
-
     /// Returns commit_lsn watch channel.
     pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
         self.commit_lsn_watch_rx.clone()
@@ -645,9 +588,14 @@ impl Timeline {
         self.term_flush_lsn_watch_rx.clone()
     }
 
+    /// Returns watch channel for SharedState update version.
+    pub fn get_state_version_rx(&self) -> watch::Receiver<usize> {
+        self.shared_state_version_rx.clone()
+    }
+
     /// Pass arrived message to the safekeeper.
     pub async fn process_msg(
-        &self,
+        self: &Arc<Self>,
         msg: &ProposerAcceptorMessage,
     ) -> Result<Option<AcceptorProposerMessage>> {
         if self.is_cancelled() {
@@ -655,53 +603,36 @@ impl Timeline {
         }
 
         let mut rmsg: Option<AcceptorProposerMessage>;
-        let commit_lsn: Lsn;
-        let term_flush_lsn: TermLsn;
         {
             let mut shared_state = self.write_shared_state().await;
             rmsg = shared_state.sk.process_msg(msg).await?;
 
             // if this is AppendResponse, fill in proper hot standby feedback.
             if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
-                resp.hs_feedback = self.walsenders.get_hotstandby();
+                resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
             }
-
-            commit_lsn = shared_state.sk.state.inmem.commit_lsn;
-            term_flush_lsn =
-                TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
         }
-        self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
-        self.commit_lsn_watch_tx.send(commit_lsn)?;
         Ok(rmsg)
     }
 
     /// Returns wal_seg_size.
     pub async fn get_wal_seg_size(&self) -> usize {
-        self.write_shared_state().await.get_wal_seg_size()
-    }
-
-    /// Returns true only if the timeline is loaded and active.
-    pub async fn is_active(&self) -> bool {
-        if self.is_cancelled() {
-            return false;
-        }
-
-        self.write_shared_state().await.active
+        self.read_shared_state().await.get_wal_seg_size()
     }
 
     /// Returns state of the timeline.
     pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) {
-        let state = self.write_shared_state().await;
+        let state = self.read_shared_state().await;
         (state.sk.state.inmem.clone(), state.sk.state.clone())
     }
 
     /// Returns latest backup_lsn.
     pub async fn get_wal_backup_lsn(&self) -> Lsn {
-        self.write_shared_state().await.sk.state.inmem.backup_lsn
+        self.read_shared_state().await.sk.state.inmem.backup_lsn
     }
 
     /// Sets backup_lsn to the given value.
-    pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
+    pub async fn set_wal_backup_lsn(self: &Arc<Self>, backup_lsn: Lsn) -> Result<()> {
         if self.is_cancelled() {
             bail!(TimelineError::Cancelled(self.ttid));
         }
@@ -715,39 +646,34 @@ impl Timeline {
 
     /// Get safekeeper info for broadcasting to broker and other peers.
     pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
-        let shared_state = self.write_shared_state().await;
-        shared_state.get_safekeeper_info(&self.ttid, conf)
+        let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn;
+        let shared_state = self.read_shared_state().await;
+        shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn)
     }
 
     /// Update timeline state with peer safekeeper data.
-    pub async fn record_safekeeper_info(&self, sk_info: SafekeeperTimelineInfo) -> Result<()> {
-        let is_wal_backup_action_pending: bool;
-        let commit_lsn: Lsn;
+    pub async fn record_safekeeper_info(
+        self: &Arc<Self>,
+        sk_info: SafekeeperTimelineInfo,
+    ) -> Result<()> {
         {
             let mut shared_state = self.write_shared_state().await;
             shared_state.sk.record_safekeeper_info(&sk_info).await?;
             let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
             shared_state.peers_info.upsert(&peer_info);
-            is_wal_backup_action_pending = self.update_status(&mut shared_state).await;
-            commit_lsn = shared_state.sk.state.inmem.commit_lsn;
-        }
-        self.commit_lsn_watch_tx.send(commit_lsn)?;
-        // Wake up wal backup launcher, if it is time to stop the offloading.
-        if is_wal_backup_action_pending {
-            self.wal_backup_launcher_tx.send(self.ttid).await?;
         }
         Ok(())
     }
 
     /// Update in memory remote consistent lsn.
-    pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) {
+    pub async fn update_remote_consistent_lsn(self: &Arc<Self>, candidate: Lsn) {
         let mut shared_state = self.write_shared_state().await;
         shared_state.sk.state.inmem.remote_consistent_lsn =
             max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
     }
 
     pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
-        let shared_state = self.write_shared_state().await;
+        let shared_state = self.read_shared_state().await;
         shared_state.get_peers(conf.heartbeat_timeout)
     }
 
@@ -769,7 +695,7 @@ impl Timeline {
     /// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
     /// Thus we don't try to predict it here.
     pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
-        let ss = self.write_shared_state().await;
+        let ss = self.read_shared_state().await;
         let term = ss.sk.state.acceptor_state.term;
         let last_log_term = ss.sk.get_epoch();
         let flush_lsn = ss.sk.flush_lsn();
@@ -840,12 +766,12 @@ impl Timeline {
 
     /// Returns flush_lsn.
     pub async fn get_flush_lsn(&self) -> Lsn {
-        self.write_shared_state().await.sk.wal_store.flush_lsn()
+        self.read_shared_state().await.sk.wal_store.flush_lsn()
     }
 
     /// Delete WAL segments from disk that are no longer needed. This is determined
     /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
-    pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
+    pub async fn remove_old_wal(self: &Arc<Self>) -> Result<()> {
         if self.is_cancelled() {
             bail!(TimelineError::Cancelled(self.ttid));
         }
@@ -861,9 +787,8 @@ impl Timeline {
 
         let horizon_segno: XLogSegNo;
         let remover = {
-            let shared_state = self.write_shared_state().await;
-            horizon_segno =
-                shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn);
+            let shared_state = self.read_shared_state().await;
+            horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn);
             if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
                 return Ok(()); // nothing to do
             }
@@ -885,7 +810,7 @@ impl Timeline {
     /// passed after the last save. This helps to keep remote_consistent_lsn up
     /// to date so that storage nodes restart doesn't cause many pageserver ->
     /// safekeeper reconnections.
-    pub async fn maybe_persist_control_file(&self) -> Result<()> {
+    pub async fn maybe_persist_control_file(self: &Arc<Self>) -> Result<()> {
         self.write_shared_state()
             .await
             .sk
@@ -893,38 +818,33 @@ impl Timeline {
             .await
     }
 
-    /// Gather timeline data for metrics. If the timeline is not active, returns
-    /// None, we do not collect these.
+    /// Gather timeline data for metrics.
     pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
         if self.is_cancelled() {
             return None;
         }
 
         let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
-        let state = self.write_shared_state().await;
-        if state.active {
-            Some(FullTimelineInfo {
-                ttid: self.ttid,
-                ps_feedback_count,
-                last_ps_feedback,
-                wal_backup_active: state.wal_backup_active,
-                timeline_is_active: state.active,
-                num_computes: self.walreceivers.get_num() as u32,
-                last_removed_segno: state.last_removed_segno,
-                epoch_start_lsn: state.sk.epoch_start_lsn,
-                mem_state: state.sk.state.inmem.clone(),
-                persisted_state: state.sk.state.clone(),
-                flush_lsn: state.sk.wal_store.flush_lsn(),
-                wal_storage: state.sk.wal_store.get_metrics(),
-            })
-        } else {
-            None
-        }
+        let state = self.read_shared_state().await;
+        Some(FullTimelineInfo {
+            ttid: self.ttid,
+            ps_feedback_count,
+            last_ps_feedback,
+            wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
+            timeline_is_active: self.broker_active.load(Ordering::Relaxed),
+            num_computes: self.walreceivers.get_num() as u32,
+            last_removed_segno: state.last_removed_segno,
+            epoch_start_lsn: state.sk.epoch_start_lsn,
+            mem_state: state.sk.state.inmem.clone(),
+            persisted_state: state.sk.state.clone(),
+            flush_lsn: state.sk.wal_store.flush_lsn(),
+            wal_storage: state.sk.wal_store.get_metrics(),
+        })
     }
 
     /// Returns in-memory timeline state to build a full debug dump.
     pub async fn memory_dump(&self) -> debug_dump::Memory {
-        let state = self.write_shared_state().await;
+        let state = self.read_shared_state().await;
 
         let (write_lsn, write_record_lsn, flush_lsn, file_open) =
             state.sk.wal_store.internal_state();
@@ -933,8 +853,8 @@ impl Timeline {
             is_cancelled: self.is_cancelled(),
             peers_info_len: state.peers_info.0.len(),
             walsenders: self.walsenders.get_all(),
-            wal_backup_active: state.wal_backup_active,
-            active: state.active,
+            wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
+            active: self.broker_active.load(Ordering::Relaxed),
             num_computes: self.walreceivers.get_num() as u32,
             last_removed_segno: state.last_removed_segno,
             epoch_start_lsn: state.sk.epoch_start_lsn,
@@ -948,7 +868,7 @@ impl Timeline {
 
     /// Apply a function to the control file state and persist it.
     pub async fn map_control_file<T>(
-        &self,
+        self: &Arc<Self>,
         f: impl FnOnce(&mut TimelinePersistentState) -> Result<T>,
     ) -> Result<T> {
         let mut state = self.write_shared_state().await;
diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs
new file mode 100644
index 0000000000..e74ba37ad8
--- /dev/null
+++ b/safekeeper/src/timeline_manager.rs
@@ -0,0 +1,145 @@
+//! The timeline manager task is responsible for managing the timeline's background tasks.
+//! It is spawned alongside each timeline and exits when the timeline is deleted.
+//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
+//! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
+
+use std::{sync::Arc, time::Duration};
+
+use tracing::{info, instrument, warn};
+use utils::lsn::Lsn;
+
+use crate::{
+    metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
+    timeline::{PeerInfo, ReadGuardSharedState, Timeline},
+    timelines_set::TimelinesSet,
+    wal_backup::{self, WalBackupTaskHandle},
+    SafeKeeperConf,
+};
+
+pub struct StateSnapshot {
+    pub commit_lsn: Lsn,
+    pub backup_lsn: Lsn,
+    pub remote_consistent_lsn: Lsn,
+    pub peers: Vec<PeerInfo>,
+}
+
+impl StateSnapshot {
+    /// Create a new snapshot of the timeline state.
+    fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self {
+        Self {
+            commit_lsn: read_guard.sk.state.inmem.commit_lsn,
+            backup_lsn: read_guard.sk.state.inmem.backup_lsn,
+            remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn,
+            peers: read_guard.get_peers(heartbeat_timeout),
+        }
+    }
+}
+
+/// Control how often the manager task should wake up to check updates.
+/// There is no need to check for updates more often than this.
+const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
+
+/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
+/// background tasks.
+#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))]
+pub async fn main_task(
+    tli: Arc<Timeline>,
+    conf: SafeKeeperConf,
+    broker_active_set: Arc<TimelinesSet>,
+) {
+    scopeguard::defer! {
+        if tli.is_cancelled() {
+            info!("manager task finished");
+        } else {
+            warn!("manager task finished prematurely");
+        }
+    };
+
+    // sets whether timeline is active for broker pushes or not
+    let mut tli_broker_active = broker_active_set.guard(tli.clone());
+
+    let ttid = tli.ttid;
+    let wal_seg_size = tli.get_wal_seg_size().await;
+    let heartbeat_timeout = conf.heartbeat_timeout;
+
+    let mut state_version_rx = tli.get_state_version_rx();
+
+    let walreceivers = tli.get_walreceivers();
+    let mut num_computes_rx = walreceivers.get_num_rx();
+
+    // list of background tasks
+    let mut backup_task: Option<WalBackupTaskHandle> = None;
+
+    let last_state = 'outer: loop {
+        MANAGER_ITERATIONS_TOTAL.inc();
+
+        let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout);
+        let num_computes = *num_computes_rx.borrow();
+
+        let is_wal_backup_required =
+            wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot);
+
+        if conf.is_wal_backup_enabled() {
+            wal_backup::update_task(
+                &conf,
+                ttid,
+                is_wal_backup_required,
+                &state_snapshot,
+                &mut backup_task,
+            )
+            .await;
+        }
+
+        let is_active = is_wal_backup_required
+            || num_computes > 0
+            || state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn;
+
+        // update the broker timeline set
+        if tli_broker_active.set(is_active) {
+            // write log if state has changed
+            info!(
+                "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
+                is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn,
+            );
+
+            MANAGER_ACTIVE_CHANGES.inc();
+
+            if !is_active {
+                // TODO: maybe use tokio::spawn?
+                if let Err(e) = tli.maybe_persist_control_file().await {
+                    warn!("control file save in update_status failed: {:?}", e);
+                }
+            }
+        }
+
+        // update the state in Arc<Timeline>
+        tli.wal_backup_active
+            .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
+        tli.broker_active
+            .store(is_active, std::sync::atomic::Ordering::Relaxed);
+
+        // wait until something changes. tx channels are stored under Arc, so they will not be
+        // dropped until the manager task is finished.
+        tokio::select! {
+            _ = tli.cancel.cancelled() => {
+                // timeline was deleted
+                break 'outer state_snapshot;
+            }
+            _ = async {
+                // don't wake up on every state change, but at most every REFRESH_INTERVAL
+                tokio::time::sleep(REFRESH_INTERVAL).await;
+                let _ = state_version_rx.changed().await;
+            } => {
+                // state was updated
+            }
+            _ = num_computes_rx.changed() => {
+                // number of connected computes was updated
+            }
+        }
+    };
+
+    // shutdown background tasks
+    if conf.is_wal_backup_enabled() {
+        wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await;
+    }
+}
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index 079e706ff8..8d37bd6371 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -4,6 +4,7 @@
 
 use crate::safekeeper::ServerInfo;
 use crate::timeline::{Timeline, TimelineError};
+use crate::timelines_set::TimelinesSet;
 use crate::SafeKeeperConf;
 use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
@@ -11,16 +12,16 @@ use once_cell::sync::Lazy;
 use serde::Serialize;
 use std::collections::HashMap;
 use std::str::FromStr;
+use std::sync::atomic::Ordering;
 use std::sync::{Arc, Mutex};
-use tokio::sync::mpsc::Sender;
 use tracing::*;
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
 use utils::lsn::Lsn;
 
 struct GlobalTimelinesState {
     timelines: HashMap<TenantTimelineId, Arc<Timeline>>,
-    wal_backup_launcher_tx: Option<Sender<TenantTimelineId>>,
     conf: Option<SafeKeeperConf>,
+    broker_active_set: Arc<TimelinesSet>,
     load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
 }
 
@@ -36,11 +37,8 @@ impl GlobalTimelinesState {
     }
 
     /// Get dependencies for a timeline constructor.
-    fn get_dependencies(&self) -> (SafeKeeperConf, Sender<TenantTimelineId>) {
-        (
-            self.get_conf().clone(),
-            self.wal_backup_launcher_tx.as_ref().unwrap().clone(),
-        )
+    fn get_dependencies(&self) -> (SafeKeeperConf, Arc<TimelinesSet>) {
+        (self.get_conf().clone(), self.broker_active_set.clone())
     }
 
     /// Insert timeline into the map. Returns error if timeline with the same id already exists.
@@ -65,8 +63,8 @@ impl GlobalTimelinesState {
 static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
     Mutex::new(GlobalTimelinesState {
         timelines: HashMap::new(),
-        wal_backup_launcher_tx: None,
         conf: None,
+        broker_active_set: Arc::new(TimelinesSet::default()),
         load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
     })
 });
@@ -76,16 +74,11 @@ pub struct GlobalTimelines;
 
 impl GlobalTimelines {
     /// Inject dependencies needed for the timeline constructors and load all timelines to memory.
-    pub async fn init(
-        conf: SafeKeeperConf,
-        wal_backup_launcher_tx: Sender<TenantTimelineId>,
-    ) -> Result<()> {
+    pub async fn init(conf: SafeKeeperConf) -> Result<()> {
         // clippy isn't smart enough to understand that drop(state) releases the
         // lock, so use explicit block
         let tenants_dir = {
             let mut state = TIMELINES_STATE.lock().unwrap();
-            assert!(state.wal_backup_launcher_tx.is_none());
-            state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
             state.conf = Some(conf);
 
             // Iterate through all directories and load tenants for all directories
@@ -129,12 +122,9 @@ impl GlobalTimelines {
     /// this function is called during init when nothing else is running, so
     /// this is fine.
     async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
-        let (conf, wal_backup_launcher_tx) = {
+        let (conf, broker_active_set) = {
             let state = TIMELINES_STATE.lock().unwrap();
-            (
-                state.get_conf().clone(),
-                state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
-            )
+            state.get_dependencies()
         };
 
         let timelines_dir = conf.tenant_dir(&tenant_id);
@@ -147,7 +137,7 @@ impl GlobalTimelines {
                         TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
                     {
                         let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-                        match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) {
+                        match Timeline::load_timeline(&conf, ttid) {
                             Ok(timeline) => {
                                 let tli = Arc::new(timeline);
                                 TIMELINES_STATE
@@ -155,8 +145,7 @@ impl GlobalTimelines {
                                     .unwrap()
                                     .timelines
                                     .insert(ttid, tli.clone());
-                                tli.bootstrap(&conf);
-                                tli.update_status_notify().await.unwrap();
+                                tli.bootstrap(&conf, broker_active_set.clone());
                             }
                             // If we can't load a timeline, it's most likely because of a corrupted
                             // directory. We will log an error and won't allow to delete/recreate
@@ -189,9 +178,9 @@ impl GlobalTimelines {
         _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>,
         ttid: TenantTimelineId,
     ) -> Result<Arc<Timeline>> {
-        let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();
+        let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies();
 
-        match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
+        match Timeline::load_timeline(&conf, ttid) {
             Ok(timeline) => {
                 let tli = Arc::new(timeline);
 
@@ -202,7 +191,7 @@ impl GlobalTimelines {
                     .timelines
                     .insert(ttid, tli.clone());
 
-                tli.bootstrap(&conf);
+                tli.bootstrap(&conf, broker_active_set);
 
                 Ok(tli)
             }
@@ -221,6 +210,10 @@ impl GlobalTimelines {
         TIMELINES_STATE.lock().unwrap().get_conf().clone()
     }
 
+    pub fn get_global_broker_active_set() -> Arc<TimelinesSet> {
+        TIMELINES_STATE.lock().unwrap().broker_active_set.clone()
+    }
+
     /// Create a new timeline with the given id. If the timeline already exists, returns
     /// an existing timeline.
     pub async fn create(
@@ -229,7 +222,7 @@ impl GlobalTimelines {
         commit_lsn: Lsn,
         local_start_lsn: Lsn,
     ) -> Result<Arc<Timeline>> {
-        let (conf, wal_backup_launcher_tx) = {
+        let (conf, broker_active_set) = {
             let state = TIMELINES_STATE.lock().unwrap();
             if let Ok(timeline) = state.get(&ttid) {
                 // Timeline already exists, return it.
@@ -243,7 +236,6 @@ impl GlobalTimelines {
         let timeline = Arc::new(Timeline::create_empty(
             &conf,
             ttid,
-            wal_backup_launcher_tx,
             server_info,
             commit_lsn,
             local_start_lsn,
@@ -264,7 +256,10 @@ impl GlobalTimelines {
             // Write the new timeline to the disk and start background workers.
             // Bootstrap is transactional, so if it fails, the timeline will be deleted,
             // and the state on disk should remain unchanged.
-            if let Err(e) = timeline.init_new(&mut shared_state, &conf).await {
+            if let Err(e) = timeline
+                .init_new(&mut shared_state, &conf, broker_active_set)
+                .await
+            {
                 // Note: the most likely reason for init failure is that the timeline
                 // directory already exists on disk. This happens when timeline is corrupted
                 // and wasn't loaded from disk on startup because of that. We want to preserve
@@ -281,8 +276,6 @@ impl GlobalTimelines {
             // We are done with bootstrap, release the lock, return the timeline.
             // {} block forces release before .await
         }
-        timeline.update_status_notify().await?;
-        timeline.wal_backup_launcher_tx.send(timeline.ttid).await?;
         Ok(timeline)
     }
 
@@ -335,12 +328,13 @@ impl GlobalTimelines {
         let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
         match tli_res {
             Ok(timeline) => {
+                let was_active = timeline.broker_active.load(Ordering::Relaxed);
+
                 // Take a lock and finish the deletion holding this mutex.
                 let mut shared_state = timeline.write_shared_state().await;
 
                 info!("deleting timeline {}, only_local={}", ttid, only_local);
-                let (dir_existed, was_active) =
-                    timeline.delete(&mut shared_state, only_local).await?;
+                let dir_existed = timeline.delete(&mut shared_state, only_local).await?;
 
                 // Remove timeline from the map.
                 // FIXME: re-enable it once we fix the issue with recreation of deleted timelines
@@ -349,7 +343,7 @@ impl GlobalTimelines {
 
                 Ok(TimelineDeleteForceResult {
                     dir_existed,
-                    was_active,
+                    was_active, // TODO: we probably should remove this field
                 })
             }
             Err(_) => {
diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs
new file mode 100644
index 0000000000..ea8e23bb72
--- /dev/null
+++ b/safekeeper/src/timelines_set.rs
@@ -0,0 +1,90 @@
+use std::{collections::HashMap, sync::Arc};
+
+use utils::id::TenantTimelineId;
+
+use crate::timeline::Timeline;
+
+/// Set of timelines, supports operations:
+/// - add timeline
+/// - remove timeline
+/// - clone the set
+///
+/// Usually used for keeping subset of timelines. For example active timelines that require broker push.
+pub struct TimelinesSet {
+    timelines: std::sync::Mutex<HashMap<TenantTimelineId, Arc<Timeline>>>,
+}
+
+impl Default for TimelinesSet {
+    fn default() -> Self {
+        Self {
+            timelines: std::sync::Mutex::new(HashMap::new()),
+        }
+    }
+}
+
+impl TimelinesSet {
+    pub fn insert(&self, tli: Arc<Timeline>) {
+        self.timelines.lock().unwrap().insert(tli.ttid, tli);
+    }
+
+    pub fn delete(&self, ttid: &TenantTimelineId) {
+        self.timelines.lock().unwrap().remove(ttid);
+    }
+
+    /// If present is true, adds timeline to the set, otherwise removes it.
+    pub fn set_present(&self, tli: Arc<Timeline>, present: bool) {
+        if present {
+            self.insert(tli);
+        } else {
+            self.delete(&tli.ttid);
+        }
+    }
+
+    pub fn is_present(&self, ttid: &TenantTimelineId) -> bool {
+        self.timelines.lock().unwrap().contains_key(ttid)
+    }
+
+    /// Returns all timelines in the set.
+    pub fn get_all(&self) -> Vec<Arc<Timeline>> {
+        self.timelines.lock().unwrap().values().cloned().collect()
+    }
+
+    /// Returns a timeline guard for easy presence control.
+    pub fn guard(self: &Arc<Self>, tli: Arc<Timeline>) -> TimelineSetGuard {
+        let is_present = self.is_present(&tli.ttid);
+        TimelineSetGuard {
+            timelines_set: self.clone(),
+            tli,
+            is_present,
+        }
+    }
+}
+
+/// Guard is used to add or remove timeline from the set.
+/// If the timeline present in set, it will be removed from it on drop.
+/// Note: do not use more than one guard for the same timeline, it caches the presence state.
+/// It is designed to be used in the manager task only.
+pub struct TimelineSetGuard {
+    timelines_set: Arc<TimelinesSet>,
+    tli: Arc<Timeline>,
+    is_present: bool,
+}
+
+impl TimelineSetGuard {
+    /// Returns true if the state was changed.
+    pub fn set(&mut self, present: bool) -> bool {
+        if present == self.is_present {
+            return false;
+        }
+        self.is_present = present;
+        self.timelines_set.set_present(self.tli.clone(), present);
+        true
+    }
+}
+
+impl Drop for TimelineSetGuard {
+    fn drop(&mut self) {
+        // remove timeline from the map on drop
+        self.timelines_set.delete(&self.tli.ttid);
+    }
+}
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index e496f07114..84680557f9 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -9,7 +9,7 @@ use utils::backoff;
 use utils::id::NodeId;
 
 use std::cmp::min;
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::sync::Arc;
@@ -29,9 +29,10 @@ use tracing::*;
 
 use utils::{id::TenantTimelineId, lsn::Lsn};
 
-use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS};
+use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
 use crate::timeline::{PeerInfo, Timeline};
-use crate::{GlobalTimelines, SafeKeeperConf};
+use crate::timeline_manager::StateSnapshot;
+use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME};
 
 use once_cell::sync::OnceCell;
 
@@ -41,35 +42,84 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 const BUFFER_SIZE: usize = 32 * 1024;
 
-/// Check whether wal backup is required for timeline. If yes, mark that launcher is
-/// aware of current status and return the timeline.
-async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
-    match GlobalTimelines::get(ttid).ok() {
-        Some(tli) => {
-            tli.wal_backup_attend().await;
-            Some(tli)
-        }
-        None => None,
-    }
-}
-
-struct WalBackupTaskHandle {
+pub struct WalBackupTaskHandle {
     shutdown_tx: Sender<()>,
     handle: JoinHandle<()>,
 }
 
-struct WalBackupTimelineEntry {
-    timeline: Arc<Timeline>,
-    handle: Option<WalBackupTaskHandle>,
+/// Do we have anything to upload to S3, i.e. should safekeepers run backup activity?
+pub fn is_wal_backup_required(
+    wal_seg_size: usize,
+    num_computes: usize,
+    state: &StateSnapshot,
+) -> bool {
+    num_computes > 0 ||
+    // Currently only the whole segment is offloaded, so compare segment numbers.
+    (state.commit_lsn.segment_number(wal_seg_size) > state.backup_lsn.segment_number(wal_seg_size))
 }
 
-async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) {
-    if let Some(wb_handle) = entry.handle.take() {
+/// Based on peer information determine which safekeeper should offload; if it
+/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
+/// is running, kill it.
+pub async fn update_task(
+    conf: &SafeKeeperConf,
+    ttid: TenantTimelineId,
+    need_backup: bool,
+    state: &StateSnapshot,
+    entry: &mut Option<WalBackupTaskHandle>,
+) {
+    let (offloader, election_dbg_str) =
+        determine_offloader(&state.peers, state.backup_lsn, ttid, conf);
+    let elected_me = Some(conf.my_id) == offloader;
+
+    let should_task_run = need_backup && elected_me;
+
+    // start or stop the task
+    if should_task_run != (entry.is_some()) {
+        if should_task_run {
+            info!("elected for backup: {}", election_dbg_str);
+
+            let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
+            let timeline_dir = conf.timeline_dir(&ttid);
+
+            let async_task = backup_task_main(
+                ttid,
+                timeline_dir,
+                conf.workdir.clone(),
+                conf.backup_parallel_jobs,
+                shutdown_rx,
+            );
+
+            let handle = if conf.current_thread_runtime {
+                tokio::spawn(async_task)
+            } else {
+                WAL_BACKUP_RUNTIME.spawn(async_task)
+            };
+
+            *entry = Some(WalBackupTaskHandle {
+                shutdown_tx,
+                handle,
+            });
+        } else {
+            if !need_backup {
+                // don't need backup at all
+                info!("stepping down from backup, need_backup={}", need_backup);
+            } else {
+                // someone else has been elected
+                info!("stepping down from backup: {}", election_dbg_str);
+            }
+            shut_down_task(entry).await;
+        }
+    }
+}
+
+async fn shut_down_task(entry: &mut Option<WalBackupTaskHandle>) {
+    if let Some(wb_handle) = entry.take() {
         // Tell the task to shutdown. Error means task exited earlier, that's ok.
         let _ = wb_handle.shutdown_tx.send(()).await;
         // Await the task itself. TODO: restart panicked tasks earlier.
         if let Err(e) = wb_handle.handle.await {
-            warn!("WAL backup task for {} panicked: {}", ttid, e);
+            warn!("WAL backup task panicked: {}", e);
         }
     }
 }
@@ -126,49 +176,6 @@ fn determine_offloader(
     }
 }
 
-/// Based on peer information determine which safekeeper should offload; if it
-/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
-/// is running, kill it.
-async fn update_task(
-    conf: &SafeKeeperConf,
-    ttid: TenantTimelineId,
-    entry: &mut WalBackupTimelineEntry,
-) {
-    let alive_peers = entry.timeline.get_peers(conf).await;
-    let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await;
-    let (offloader, election_dbg_str) =
-        determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf);
-    let elected_me = Some(conf.my_id) == offloader;
-
-    if elected_me != (entry.handle.is_some()) {
-        if elected_me {
-            info!("elected for backup: {}", election_dbg_str);
-
-            let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
-            let timeline_dir = conf.timeline_dir(&ttid);
-
-            let handle = tokio::spawn(
-                backup_task_main(
-                    ttid,
-                    timeline_dir,
-                    conf.workdir.clone(),
-                    conf.backup_parallel_jobs,
-                    shutdown_rx,
-                )
-                .in_current_span(),
-            );
-
-            entry.handle = Some(WalBackupTaskHandle {
-                shutdown_tx,
-                handle,
-            });
-        } else {
-            info!("stepping down from backup: {}", election_dbg_str);
-            shut_down_task(ttid, entry).await;
-        }
-    }
-}
-
 static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
 
 // Storage must be configured and initialized when this is called.
@@ -190,67 +197,6 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) {
     });
 }
 
-const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
-
-/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
-/// tasks. Having this in separate task simplifies locking, allows to reap
-/// panics and separate elections from offloading itself.
-pub async fn wal_backup_launcher_task_main(
-    conf: SafeKeeperConf,
-    mut wal_backup_launcher_rx: Receiver<TenantTimelineId>,
-) -> anyhow::Result<()> {
-    info!(
-        "WAL backup launcher started, remote config {:?}",
-        conf.remote_storage
-    );
-
-    // Presence in this map means launcher is aware s3 offloading is needed for
-    // the timeline, but task is started only if it makes sense for to offload
-    // from this safekeeper.
-    let mut tasks: HashMap<TenantTimelineId, WalBackupTimelineEntry> = HashMap::new();
-
-    let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC));
-    loop {
-        tokio::select! {
-            ttid = wal_backup_launcher_rx.recv() => {
-                // channel is never expected to get closed
-                let ttid = ttid.unwrap();
-                if !conf.is_wal_backup_enabled() {
-                    continue; /* just drain the channel and do nothing */
-                }
-                async {
-                    let timeline = is_wal_backup_required(ttid).await;
-                    // do we need to do anything at all?
-                    if timeline.is_some() != tasks.contains_key(&ttid) {
-                        if let Some(timeline) = timeline {
-                            // need to start the task
-                            let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry {
-                                timeline,
-                                handle: None,
-                            });
-                            update_task(&conf, ttid, entry).await;
-                        } else {
-                            // need to stop the task
-                            info!("stopping WAL backup task");
-                            let mut entry = tasks.remove(&ttid).unwrap();
-                            shut_down_task(ttid, &mut entry).await;
-                        }
-                    }
-                }.instrument(info_span!("WAL backup", ttid = %ttid)).await;
-            }
-            // For each timeline needing offloading, check if this safekeeper
-            // should do the job and start/stop the task accordingly.
-            _ = ticker.tick() => {
-                for (ttid, entry) in tasks.iter_mut() {
-                    update_task(&conf, *ttid, entry)
-                        .instrument(info_span!("WAL backup", ttid = %ttid))
-                        .await;
-                }
-            }
-        }
-    }
-}
-
 struct WalBackupTask {
     timeline: Arc<Timeline>,
     timeline_dir: Utf8PathBuf,
@@ -261,6 +207,7 @@ struct WalBackupTask {
 }
 
 /// Offload single timeline.
+#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))]
 async fn backup_task_main(
     ttid: TenantTimelineId,
     timeline_dir: Utf8PathBuf,
@@ -268,6 +215,8 @@ async fn backup_task_main(
     parallel_jobs: usize,
     mut shutdown_rx: Receiver<()>,
 ) {
+    let _guard = WAL_BACKUP_TASKS.guard();
+
     info!("started");
     let res = GlobalTimelines::get(ttid);
     if let Err(e) = res {
diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs
index 200096ac5c..29e944bff3 100644
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -277,14 +277,6 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
     debug!("started");
     let await_duration = conf.partial_backup_timeout;
 
-    let mut cancellation_rx = match tli.get_cancellation_rx() {
-        Ok(rx) => rx,
-        Err(_) => {
-            info!("timeline canceled during task start");
-            return;
-        }
-    };
-
     // sleep for random time to avoid thundering herd
     {
         let randf64 = rand::thread_rng().gen_range(0.0..1.0);
@@ -327,7 +319,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
                 && flush_lsn_rx.borrow().term == seg.term
             {
                 tokio::select! {
-                    _ = cancellation_rx.changed() => {
+                    _ = backup.tli.cancel.cancelled() => {
                         info!("timeline canceled");
                         return;
                     }
@@ -340,7 +332,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
         // if we don't have any data and zero LSNs, wait for something
         while flush_lsn_rx.borrow().lsn == Lsn(0) {
             tokio::select! {
-                _ = cancellation_rx.changed() => {
+                _ = backup.tli.cancel.cancelled() => {
                     info!("timeline canceled");
                     return;
                 }
@@ -357,7 +349,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
         // waiting until timeout expires OR segno changes
         'inner: loop {
             tokio::select! {
-                _ = cancellation_rx.changed() => {
+                _ = backup.tli.cancel.cancelled() => {
                     info!("timeline canceled");
                     return;
                 }
diff --git a/scripts/check_allowed_errors.sh b/scripts/check_allowed_errors.sh
new file mode 100755
index 0000000000..87e52c1e64
--- /dev/null
+++ b/scripts/check_allowed_errors.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+set -eu
+
+HELPER_DIR="$(dirname "${BASH_SOURCE[0]}")"
+SCRIPT="test_runner/fixtures/pageserver/allowed_errors.py"
+
+# first run to understand all of the errors:
+#
+# example: ./scripts/check_allowed_errors.sh -i - < pageserver.log
+# example: ./scripts/check_allowed_errors.sh -i pageserver.log
+#
+# then edit the test local allowed_errors to the
+# test_runner/fixtures/pageserver/allowed_errors.py, then re-run to make sure
+# they are handled.
+#
+# finally revert any local changes to allowed_errors.py.
+poetry run python3 "$HELPER_DIR/../$SCRIPT" $*
diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py
index 878840fcee..919a9278a9 100755
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -5,10 +5,11 @@ import json
 import logging
 import os
 from collections import defaultdict
-from typing import DefaultDict, Dict
+from typing import Any, DefaultDict, Dict, Optional
 
 import psycopg2
 import psycopg2.extras
+import toml
 
 FLAKY_TESTS_QUERY = """
     SELECT
@@ -58,6 +59,24 @@ def main(args: argparse.Namespace):
     else:
         pageserver_virtual_file_io_engine_parameter = ""
 
+    # re-use existing records of flaky tests from before parametrization by compaction_algorithm
+    def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]:
+        """Duplicated from parametrize.py"""
+        toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
+        if toml_table is None:
+            return None
+        v = toml.loads(toml_table)
+        assert isinstance(v, dict)
+        return v
+
+    pageserver_default_tenant_config_compaction_algorithm_parameter = ""
+    if (
+        explicit_default := get_pageserver_default_tenant_config_compaction_algorithm()
+    ) is not None:
+        pageserver_default_tenant_config_compaction_algorithm_parameter = (
+            f"-{explicit_default['kind']}"
+        )
+
     for row in rows:
         # We don't want to automatically rerun tests in a performance suite
         if row["parent_suite"] != "test_runner.regress":
@@ -66,10 +85,10 @@ def main(args: argparse.Namespace):
         if row["name"].endswith("]"):
             parametrized_test = row["name"].replace(
                 "[",
-                f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}-",
+                f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-",
             )
         else:
-            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}]"
+            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]"
 
         res[row["parent_suite"]][row["suite"]][parametrized_test] = True
 
diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs
index d66cbefa45..1a6fb7fedf 100644
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -147,6 +147,7 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
                 http_connstr: "zenith-1-sk-1.local:7677".to_owned(),
                 local_start_lsn: 0,
                 availability_zone: None,
+                standby_horizon: 0,
             };
             counter += 1;
             yield info;
diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto
index 7d1b63d23f..a420fd9c66 100644
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -42,6 +42,7 @@ message SafekeeperTimelineInfo {
     uint64 remote_consistent_lsn = 7;
     uint64 peer_horizon_lsn = 8;
     uint64 local_start_lsn = 9;
+    uint64 standby_horizon = 14;
     // A connection string to use for WAL receiving.
     string safekeeper_connstr = 10;
     // HTTP endpoint connection string
@@ -105,4 +106,6 @@ message SafekeeperDiscoveryResponse {
     string safekeeper_connstr = 4;
     // Availability zone of a safekeeper.
     optional string availability_zone = 5;
+    // Replica apply LSN
+    uint64 standby_horizon = 6;
 }
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index 8c88b61abc..0a4af543ab 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -736,6 +736,7 @@ mod tests {
             http_connstr: "neon-1-sk-1.local:7677".to_owned(),
             local_start_lsn: 0,
             availability_zone: None,
+            standby_horizon: 0,
         })
     }
 
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index f1454af533..ce8f8d0cdd 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -66,6 +66,10 @@ struct Cli {
     #[arg(long)]
     max_unavailable_interval: Option<humantime::Duration>,
 
+    /// Size threshold for automatically splitting shards (disabled by default)
+    #[arg(long)]
+    split_threshold: Option<u64>,
+
     /// Maximum number of reconcilers that may run in parallel
     #[arg(long)]
     reconciler_concurrency: Option<usize>,
@@ -255,6 +259,7 @@ async fn async_main() -> anyhow::Result<()> {
         reconciler_concurrency: args
             .reconciler_concurrency
             .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
+        split_threshold: args.split_threshold,
     };
 
     // After loading secrets & config, but before starting anything else, apply database migrations
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index 25b6b67e12..769aba80ca 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -2,7 +2,7 @@ use pageserver_api::{
     models::{
         LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
         TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse,
-        TimelineCreateRequest, TimelineInfo,
+        TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
     },
     shard::TenantShardId,
 };
@@ -234,4 +234,16 @@ impl PageserverClient {
             self.inner.get_utilization().await
         )
     }
+
+    pub(crate) async fn top_tenant_shards(
+        &self,
+        request: TopTenantShardsRequest,
+    ) -> Result<TopTenantShardsResponse> {
+        measured_request!(
+            "top_tenants",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner.top_tenant_shards(request).await
+        )
+    }
 }
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index dca37166ba..67c05296d5 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -173,7 +173,7 @@ impl Persistence {
     /// Wraps `with_conn` in order to collect latency and error metrics
     async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
     where
-        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
         R: Send + 'static,
     {
         let latency = &METRICS_REGISTRY
@@ -199,13 +199,48 @@ impl Persistence {
     /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
     async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
     where
-        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
         R: Send + 'static,
     {
+        // A generous allowance for how many times we may retry serializable transactions
+        // before giving up.  This is not expected to be hit: it is a defensive measure in case we
+        // somehow engineer a situation where duelling transactions might otherwise live-lock.
+        const MAX_RETRIES: usize = 128;
+
         let mut conn = self.connection_pool.get()?;
-        tokio::task::spawn_blocking(move || -> DatabaseResult<R> { func(&mut conn) })
-            .await
-            .expect("Task panic")
+        tokio::task::spawn_blocking(move || -> DatabaseResult<R> {
+            let mut retry_count = 0;
+            loop {
+                match conn.build_transaction().serializable().run(|c| func(c)) {
+                    Ok(r) => break Ok(r),
+                    Err(
+                        err @ DatabaseError::Query(diesel::result::Error::DatabaseError(
+                            diesel::result::DatabaseErrorKind::SerializationFailure,
+                            _,
+                        )),
+                    ) => {
+                        retry_count += 1;
+                        if retry_count > MAX_RETRIES {
+                            tracing::error!(
+                                "Exceeded max retries on SerializationFailure errors: {err:?}"
+                            );
+                            break Err(err);
+                        } else {
+                            // Retry on serialization errors: these are expected, because even though our
+                            // transactions don't fight for the same rows, they will occasionally collide
+                            // on index pages (e.g. increment_generation for unrelated shards can collide)
+                            tracing::debug!(
+                                "Retrying transaction on serialization failure {err:?}"
+                            );
+                            continue;
+                        }
+                    }
+                    Err(e) => break Err(e),
+                }
+            }
+        })
+        .await
+        .expect("Task panic")
     }
 
     /// When a node is first registered, persist it before using it for anything
@@ -358,14 +393,11 @@ impl Persistence {
         self.with_measured_conn(
             DatabaseOperation::InsertTenantShards,
             move |conn| -> DatabaseResult<()> {
-                conn.transaction(|conn| -> QueryResult<()> {
-                    for tenant in &shards {
-                        diesel::insert_into(tenant_shards)
-                            .values(tenant)
-                            .execute(conn)?;
-                    }
-                    Ok(())
-                })?;
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
                 Ok(())
             },
         )
@@ -533,8 +565,11 @@ impl Persistence {
             let update = ShardUpdate {
                 generation: input_generation.map(|g| g.into().unwrap() as i32),
                 placement_policy: input_placement_policy
+                    .as_ref()
                     .map(|p| serde_json::to_string(&p).unwrap()),
-                config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
+                config: input_config
+                    .as_ref()
+                    .map(|c| serde_json::to_string(&c).unwrap()),
                 scheduling_policy: input_scheduling_policy
                     .map(|p| serde_json::to_string(&p).unwrap()),
             };
@@ -581,55 +616,51 @@ impl Persistence {
     ) -> DatabaseResult<()> {
         use crate::schema::tenant_shards::dsl::*;
         self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
-            conn.transaction(|conn| -> DatabaseResult<()> {
-                // Mark parent shards as splitting
+            // Mark parent shards as splitting
 
-                let updated = diesel::update(tenant_shards)
-                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.literal() as i32))
-                    .set((splitting.eq(1),))
-                    .execute(conn)?;
-                if u8::try_from(updated)
-                    .map_err(|_| DatabaseError::Logical(
-                        format!("Overflow existing shard count {} while splitting", updated))
-                    )? != old_shard_count.count() {
-                    // Perhaps a deletion or another split raced with this attempt to split, mutating
-                    // the parent shards that we intend to split. In this case the split request should fail.
-                    return Err(DatabaseError::Logical(
-                        format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
-                    ));
+            let updated = diesel::update(tenant_shards)
+                .filter(tenant_id.eq(split_tenant_id.to_string()))
+                .filter(shard_count.eq(old_shard_count.literal() as i32))
+                .set((splitting.eq(1),))
+                .execute(conn)?;
+            if u8::try_from(updated)
+                .map_err(|_| DatabaseError::Logical(
+                    format!("Overflow existing shard count {} while splitting", updated))
+                )? != old_shard_count.count() {
+                // Perhaps a deletion or another split raced with this attempt to split, mutating
+                // the parent shards that we intend to split. In this case the split request should fail.
+                return Err(DatabaseError::Logical(
+                    format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
+                ));
+            }
+
+            // FIXME: spurious clone to sidestep closure move rules
+            let parent_to_children = parent_to_children.clone();
+
+            // Insert child shards
+            for (parent_shard_id, children) in parent_to_children {
+                let mut parent = crate::schema::tenant_shards::table
+                    .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
+                    .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
+                    .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
+                    .load::<TenantShardPersistence>(conn)?;
+                let parent = if parent.len() != 1 {
+                    return Err(DatabaseError::Logical(format!(
+                        "Parent shard {parent_shard_id} not found"
+                    )));
+                } else {
+                    parent.pop().unwrap()
+                };
+                for mut shard in children {
+                    // Carry the parent's generation into the child
+                    shard.generation = parent.generation;
+
+                    debug_assert!(shard.splitting == SplitState::Splitting);
+                    diesel::insert_into(tenant_shards)
+                        .values(shard)
+                        .execute(conn)?;
                 }
-
-                // FIXME: spurious clone to sidestep closure move rules
-                let parent_to_children = parent_to_children.clone();
-
-                // Insert child shards
-                for (parent_shard_id, children) in parent_to_children {
-                    let mut parent = crate::schema::tenant_shards::table
-                        .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
-                        .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
-                        .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
-                        .load::<TenantShardPersistence>(conn)?;
-                    let parent = if parent.len() != 1 {
-                        return Err(DatabaseError::Logical(format!(
-                            "Parent shard {parent_shard_id} not found"
-                        )));
-                    } else {
-                        parent.pop().unwrap()
-                    };
-                    for mut shard in children {
-                        // Carry the parent's generation into the child
-                        shard.generation = parent.generation;
-
-                        debug_assert!(shard.splitting == SplitState::Splitting);
-                        diesel::insert_into(tenant_shards)
-                            .values(shard)
-                            .execute(conn)?;
-                    }
-                }
-
-                Ok(())
-            })?;
+            }
 
             Ok(())
         })
@@ -647,22 +678,18 @@ impl Persistence {
         self.with_measured_conn(
             DatabaseOperation::CompleteShardSplit,
             move |conn| -> DatabaseResult<()> {
-                conn.transaction(|conn| -> QueryResult<()> {
-                    // Drop parent shards
-                    diesel::delete(tenant_shards)
-                        .filter(tenant_id.eq(split_tenant_id.to_string()))
-                        .filter(shard_count.eq(old_shard_count.literal() as i32))
-                        .execute(conn)?;
+                // Drop parent shards
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(old_shard_count.literal() as i32))
+                    .execute(conn)?;
 
-                    // Clear sharding flag
-                    let updated = diesel::update(tenant_shards)
-                        .filter(tenant_id.eq(split_tenant_id.to_string()))
-                        .set((splitting.eq(0),))
-                        .execute(conn)?;
-                    debug_assert!(updated > 0);
-
-                    Ok(())
-                })?;
+                // Clear sharding flag
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .set((splitting.eq(0),))
+                    .execute(conn)?;
+                debug_assert!(updated > 0);
 
                 Ok(())
             },
@@ -681,39 +708,34 @@ impl Persistence {
         self.with_measured_conn(
             DatabaseOperation::AbortShardSplit,
             move |conn| -> DatabaseResult<AbortShardSplitStatus> {
-                let aborted =
-                    conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
-                        // Clear the splitting state on parent shards
-                        let updated = diesel::update(tenant_shards)
-                            .filter(tenant_id.eq(split_tenant_id.to_string()))
-                            .filter(shard_count.ne(new_shard_count.literal() as i32))
-                            .set((splitting.eq(0),))
-                            .execute(conn)?;
+                // Clear the splitting state on parent shards
+                let updated = diesel::update(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.ne(new_shard_count.literal() as i32))
+                    .set((splitting.eq(0),))
+                    .execute(conn)?;
 
-                        // Parent shards are already gone: we cannot abort.
-                        if updated == 0 {
-                            return Ok(AbortShardSplitStatus::Complete);
-                        }
+                // Parent shards are already gone: we cannot abort.
+                if updated == 0 {
+                    return Ok(AbortShardSplitStatus::Complete);
+                }
 
-                        // Sanity check: if parent shards were present, their cardinality should
-                        // be less than the number of child shards.
-                        if updated >= new_shard_count.count() as usize {
-                            return Err(DatabaseError::Logical(format!(
-                                "Unexpected parent shard count {updated} while aborting split to \
+                // Sanity check: if parent shards were present, their cardinality should
+                // be less than the number of child shards.
+                if updated >= new_shard_count.count() as usize {
+                    return Err(DatabaseError::Logical(format!(
+                        "Unexpected parent shard count {updated} while aborting split to \
                             count {new_shard_count:?} on tenant {split_tenant_id}"
-                            )));
-                        }
+                    )));
+                }
 
-                        // Erase child shards
-                        diesel::delete(tenant_shards)
-                            .filter(tenant_id.eq(split_tenant_id.to_string()))
-                            .filter(shard_count.eq(new_shard_count.literal() as i32))
-                            .execute(conn)?;
+                // Erase child shards
+                diesel::delete(tenant_shards)
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(shard_count.eq(new_shard_count.literal() as i32))
+                    .execute(conn)?;
 
-                        Ok(AbortShardSplitStatus::Aborted)
-                    })?;
-
-                Ok(aborted)
+                Ok(AbortShardSplitStatus::Aborted)
             },
         )
         .await
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index d3a53066c9..f914f4e0bb 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -32,10 +32,10 @@ use pageserver_api::{
         TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
         UtilizationScore,
     },
-    models::{SecondaryProgress, TenantConfigRequest},
+    models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
 };
 use reqwest::StatusCode;
-use tracing::instrument;
+use tracing::{instrument, Instrument};
 
 use crate::pageserver_client::PageserverClient;
 use pageserver_api::{
@@ -222,6 +222,10 @@ pub struct Config {
 
     /// How many Reconcilers may be spawned concurrently
     pub reconciler_concurrency: usize,
+
+    /// How large must a shard grow in bytes before we split it?
+    /// None disables auto-splitting.
+    pub split_threshold: Option<u64>,
 }
 
 impl From<DatabaseError> for ApiError {
@@ -699,7 +703,7 @@ impl Service {
     /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible
     /// for those retries.
     #[instrument(skip_all)]
-    async fn background_reconcile(&self) {
+    async fn background_reconcile(self: &Arc<Self>) {
         self.startup_complete.clone().wait().await;
 
         const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
@@ -711,7 +715,11 @@ impl Service {
                 let reconciles_spawned = self.reconcile_all();
                 if reconciles_spawned == 0 {
                     // Run optimizer only when we didn't find any other work to do
-                    self.optimize_all().await;
+                    let optimizations = self.optimize_all().await;
+                    if optimizations == 0 {
+                        // Run new splits only when no optimizations are pending
+                        self.autosplit_tenants().await;
+                    }
                 }
             }
               _ = self.cancel.cancelled() => return
@@ -4745,7 +4753,7 @@ impl Service {
                     // them in an optimization
                     const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024;
 
-                    if progress.bytes_total == 0
+                    if progress.heatmap_mtime.is_none()
                         || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD
                             && progress.bytes_downloaded != progress.bytes_total
                         || progress.bytes_total - progress.bytes_downloaded
@@ -4766,6 +4774,104 @@ impl Service {
         validated_work
     }
 
+    /// Look for shards which are oversized and in need of splitting
+    async fn autosplit_tenants(self: &Arc<Self>) {
+        let Some(split_threshold) = self.config.split_threshold else {
+            // Auto-splitting is disabled
+            return;
+        };
+
+        let nodes = self.inner.read().unwrap().nodes.clone();
+
+        const SPLIT_TO_MAX: ShardCount = ShardCount::new(8);
+
+        let mut top_n = Vec::new();
+
+        // Call into each node to look for big tenants
+        let top_n_request = TopTenantShardsRequest {
+            // We currently split based on logical size, for simplicity: logical size is a signal of
+            // the user's intent to run a large database, whereas physical/resident size can be symptoms
+            // of compaction issues.  Eventually we should switch to using resident size to bound the
+            // disk space impact of one shard.
+            order_by: models::TenantSorting::MaxLogicalSize,
+            limit: 10,
+            where_shards_lt: Some(SPLIT_TO_MAX),
+            where_gt: Some(split_threshold),
+        };
+        for node in nodes.values() {
+            let request_ref = &top_n_request;
+            match node
+                .with_client_retries(
+                    |client| async move {
+                        let request = request_ref.clone();
+                        client.top_tenant_shards(request.clone()).await
+                    },
+                    &self.config.jwt_token,
+                    3,
+                    3,
+                    Duration::from_secs(5),
+                    &self.cancel,
+                )
+                .await
+            {
+                Some(Ok(node_top_n)) => {
+                    top_n.extend(node_top_n.shards.into_iter());
+                }
+                Some(Err(mgmt_api::Error::Cancelled)) => {
+                    continue;
+                }
+                Some(Err(e)) => {
+                    tracing::warn!("Failed to fetch top N tenants from {node}: {e}");
+                    continue;
+                }
+                None => {
+                    // Node is shutting down
+                    continue;
+                }
+            };
+        }
+
+        // Pick the biggest tenant to split first
+        top_n.sort_by_key(|i| i.resident_size);
+        let Some(split_candidate) = top_n.into_iter().next() else {
+            tracing::debug!("No split-elegible shards found");
+            return;
+        };
+
+        // We spawn a task to run this, so it's exactly like some external API client requesting it.  We don't
+        // want to block the background reconcile loop on this.
+        tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}");
+
+        let this = self.clone();
+        tokio::spawn(
+            async move {
+                match this
+                    .tenant_shard_split(
+                        split_candidate.id.tenant_id,
+                        TenantShardSplitRequest {
+                            // Always split to the max number of shards: this avoids stepping through
+                            // intervening shard counts and encountering the overrhead of a split+cleanup
+                            // each time as a tenant grows, and is not too expensive because our max shard
+                            // count is relatively low anyway.
+                            // This policy will be adjusted in future once we support higher shard count.
+                            new_shard_count: SPLIT_TO_MAX.literal(),
+                            new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE),
+                        },
+                    )
+                    .await
+                {
+                    Ok(_) => {
+                        tracing::info!("Successful auto-split");
+                    }
+                    Err(e) => {
+                        tracing::error!("Auto-split failed: {e}");
+                    }
+                }
+            }
+            .instrument(tracing::info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)),
+        );
+    }
+
     /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but
     /// also wait for any generated Reconcilers to complete.  Calling this until it returns zero should
     /// put the system into a quiescent state where future background reconciliations won't do anything.
diff --git a/test_runner/README.md b/test_runner/README.md
index 051897744a..fd68cfff79 100644
--- a/test_runner/README.md
+++ b/test_runner/README.md
@@ -92,6 +92,166 @@ Exit after the first test failure:
 `./scripts/pytest -x ...`
 (there are many more pytest options; run `pytest -h` to see them.)
 
+#### Running Python tests against real S3 or S3-compatible services
+
+Neon's `libs/remote_storage` supports multiple implementations of remote storage.
+At the time of writing, that is
+```rust
+pub enum RemoteStorageKind {
+    /// Storage based on local file system.
+    /// Specify a root folder to place all stored files into.
+    LocalFs(Utf8PathBuf),
+    /// AWS S3 based storage, storing all files in the S3 bucket
+    /// specified by the config
+    AwsS3(S3Config),
+    /// Azure Blob based storage, storing all files in the container
+    /// specified by the config
+    AzureContainer(AzureConfig),
+}
+```
+
+The test suite has a Python enum with equal name but different meaning:
+
+```python
+@enum.unique
+class RemoteStorageKind(str, enum.Enum):
+    LOCAL_FS = "local_fs"
+    MOCK_S3 = "mock_s3"
+    REAL_S3 = "real_s3"
+```
+
+* `LOCAL_FS` => `LocalFs`
+* `MOCK_S3`: starts [`moto`](https://github.com/getmoto/moto)'s S3 implementation, then configures Pageserver with `AwsS3`
+* `REAL_S3` => configure `AwsS3` as detailed below
+
+When a test in the test suite needs an `AwsS3`, it is supposed to call `remote_storage.s3_storage()`.
+That function checks env var `ENABLE_REAL_S3_REMOTE_STORAGE`:
+* If it is not set, use `MOCK_S3`
+* If it is set, use `REAL_S3`.
+
+For `REAL_S3`, the test suite creates the dict/toml representation of the `RemoteStorageKind::AwsS3` based on env vars:
+
+```rust
+pub struct S3Config {
+    // test suite env var: REMOTE_STORAGE_S3_BUCKET
+    pub bucket_name: String,
+    // test suite env var: REMOTE_STORAGE_S3_REGION
+    pub bucket_region: String,
+    // test suite determines this
+    pub prefix_in_bucket: Option<String>,
+    // no env var exists; test suite sets it for MOCK_S3, because that's how moto works
+    pub endpoint: Option<String>,
+    ...
+}
+```
+
+*Credentials* are not part of the config, but discovered by the AWS SDK.
+See the `libs/remote_storage` Rust code.
+We're documenting two mechanism here:
+
+The test suite supports two mechanisms (`remote_storage.py`):
+
+**Credential mechanism 1**: env vars `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.
+Populate the env vars with AWS access keys that you created in IAM.
+Our CI uses this mechanism.
+However, it is _not_ recommended for interactive use by developers ([learn more](https://docs.aws.amazon.com/sdkref/latest/guide/access-users.html#credentials-long-term)).
+Instead, use profiles (next section).
+
+**Credential mechanism 2**: env var `AWS_PROFILE`.
+This uses the AWS SDK's (and CLI's) profile mechanism.
+Learn more about it [in the official docs](https://docs.aws.amazon.com/sdkref/latest/guide/file-format.html).
+After configuring a profile (e.g. via the aws CLI), set the env var to its name.
+
+In conclusion, the full command line is:
+
+```bash
+# with long-term AWS access keys
+ENABLE_REAL_S3_REMOTE_STORAGE=true \
+REMOTE_STORAGE_S3_BUCKET=mybucket \
+REMOTE_STORAGE_S3_REGION=eu-central-1 \
+AWS_ACCESS_KEY_ID=... \
+AWS_SECRET_ACCESS_KEY=... \
+./scripts/pytest
+```
+<!-- Don't forget to update the Minio example when changing these -->
+```bash
+# with AWS PROFILE
+ENABLE_REAL_S3_REMOTE_STORAGE=true \
+REMOTE_STORAGE_S3_BUCKET=mybucket \
+REMOTE_STORAGE_S3_REGION=eu-central-1 \
+AWS_PROFILE=... \
+./scripts/pytest
+```
+
+If you're using SSO, make sure to `aws sso login --profile $AWS_PROFILE` first.
+
+##### Minio
+
+If you want to run test without the cloud setup, we recommend [minio](https://min.io/docs/minio/linux/index.html).
+
+```bash
+# Start in Terminal 1
+mkdir /tmp/minio_data
+minio server /tmp/minio_data --console-address 127.0.0.1:9001 --address 127.0.0.1:9000
+```
+
+In another terminal, create an `aws` CLI profile for it:
+
+```ini
+# append to ~/.aws/config
+[profile local-minio]
+services = local-minio-services
+[services local-minio-services]
+s3 =
+  endpoint_url=http://127.0.0.1:9000/
+```
+
+
+Now configure the credentials (this is going to write `~/.aws/credentials` for you).
+It's an interactive prompt.
+
+```bash
+# Terminal 2
+$ aws --profile local-minio configure
+AWS Access Key ID [None]: minioadmin
+AWS Secret Access Key [None]: minioadmin
+Default region name [None]:
+Default output format [None]:
+```
+
+Now create a bucket `testbucket` using the CLI.
+
+```bash
+# (don't forget to have AWS_PROFILE env var set; or use --profile)
+aws --profile local-minio s3 mb s3://mybucket
+```
+
+(If it doesn't work, make sure you update your AWS CLI to a recent version.
+ The [service-specific endpoint feature](https://docs.aws.amazon.com/sdkref/latest/guide/feature-ss-endpoints.html)
+ that we're using is quite new.)
+
+```bash
+# with AWS PROFILE
+ENABLE_REAL_S3_REMOTE_STORAGE=true \
+REMOTE_STORAGE_S3_BUCKET=mybucket \
+REMOTE_STORAGE_S3_REGION=doesntmatterforminio \
+AWS_PROFILE=local-minio \
+./scripts/pytest
+```
+
+NB: you can avoid the `--profile` by setting the `AWS_PROFILE` variable.
+Just like the AWS SDKs, the `aws` CLI is sensible to it.
+
+#### Running Rust tests against real S3 or S3-compatible services
+
+We have some Rust tests that only run against real S3, e.g., [here](https://github.com/neondatabase/neon/blob/c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a/libs/remote_storage/tests/test_real_s3.rs#L392-L397).
+
+They use the same env vars as the Python test suite (see previous section)
+but interpret them on their own.
+However, at this time, the interpretation is identical.
+
+So, above instructions apply to the Rust test as well.
+
 ### Writing a test
 
 Every test needs a Neon Environment, or NeonEnv to operate in. A Neon Environment
diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py
index c32748f6f0..038f557cc8 100644
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -19,9 +19,9 @@ from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
 from _pytest.terminal import TerminalReporter
 
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonPageserver
-from fixtures.types import TenantId, TimelineId
 
 """
 This file contains fixtures for micro-benchmarks.
diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/common_types.py
similarity index 100%
rename from test_runner/fixtures/types.py
rename to test_runner/fixtures/common_types.py
diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py
index a883d94f73..66fc35b6aa 100644
--- a/test_runner/fixtures/compute_reconfigure.py
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -5,8 +5,8 @@ import pytest
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
+from fixtures.common_types import TenantId
 from fixtures.log_helper import log
-from fixtures.types import TenantId
 
 
 class ComputeReconfigure:
diff --git a/test_runner/fixtures/endpoint/__init__.py b/test_runner/fixtures/endpoint/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py
new file mode 100644
index 0000000000..42f0539c19
--- /dev/null
+++ b/test_runner/fixtures/endpoint/http.py
@@ -0,0 +1,23 @@
+import requests
+from requests.adapters import HTTPAdapter
+
+
+class EndpointHttpClient(requests.Session):
+    def __init__(
+        self,
+        port: int,
+    ):
+        super().__init__()
+        self.port = port
+
+        self.mount("http://", HTTPAdapter())
+
+    def dbs_and_roles(self):
+        res = self.get(f"http://localhost:{self.port}/dbs_and_roles")
+        res.raise_for_status()
+        return res.json()
+
+    def database_schema(self, database: str):
+        res = self.get(f"http://localhost:{self.port}/database_schema?database={database}")
+        res.raise_for_status()
+        return res.text
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 7d34e12ca3..8b8075f8c1 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -142,6 +142,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_resident_physical_size",
     "pageserver_io_operations_bytes_total",
     "pageserver_last_record_lsn",
+    "pageserver_standby_horizon",
     "pageserver_smgr_query_seconds_bucket",
     "pageserver_smgr_query_seconds_count",
     "pageserver_smgr_query_seconds_sum",
@@ -149,6 +150,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
     "pageserver_storage_operations_seconds_sum_total",
     "pageserver_evictions_total",
     "pageserver_evictions_with_low_residence_duration_total",
+    "pageserver_aux_file_estimated_size",
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
     # "pageserver_directory_entries_count", -- only used if above a certain threshold
     # "pageserver_broken_tenants_count" -- used only for broken
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 30cec4c726..796ae7217b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -14,7 +14,7 @@ import textwrap
 import threading
 import time
 import uuid
-from contextlib import ExitStack, closing, contextmanager
+from contextlib import closing, contextmanager
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
@@ -47,17 +47,20 @@ from urllib3.util.retry import Retry
 
 from fixtures import overlayfs
 from fixtures.broker import NeonBroker
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
+from fixtures.endpoint.http import EndpointHttpClient
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
     DEFAULT_PAGESERVER_ALLOWED_ERRORS,
     DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
 )
+from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_layer_file_name
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.pageserver.types import IndexPartDump, LayerFileName, parse_layer_file_name
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_for_upload_queue_empty,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
@@ -72,13 +75,13 @@ from fixtures.remote_storage import (
 )
 from fixtures.safekeeper.http import SafekeeperHttpClient
 from fixtures.safekeeper.utils import are_walreceivers_absent
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
     allure_add_grafana_links,
     allure_attach_from_dir,
     assert_no_errors,
     get_self_dir,
+    print_gc_result,
     subprocess_capture,
     wait_until,
 )
@@ -452,7 +455,7 @@ class NeonEnvBuilder:
         test_overlay_dir: Optional[Path] = None,
         pageserver_remote_storage: Optional[RemoteStorage] = None,
         # toml that will be decomposed into `--config-override` flags during `pageserver --init`
-        pageserver_config_override: Optional[str] = None,
+        pageserver_config_override: Optional[str | Callable[[Dict[str, Any]], None]] = None,
         num_safekeepers: int = 1,
         num_pageservers: int = 1,
         # Use non-standard SK ids to check for various parsing bugs
@@ -467,6 +470,7 @@ class NeonEnvBuilder:
         initial_timeline: Optional[TimelineId] = None,
         pageserver_virtual_file_io_engine: Optional[str] = None,
         pageserver_aux_file_policy: Optional[AuxFileStore] = None,
+        pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None,
     ):
         self.repo_dir = repo_dir
         self.rust_log_override = rust_log_override
@@ -507,6 +511,14 @@ class NeonEnvBuilder:
 
         self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
 
+        self.pageserver_default_tenant_config_compaction_algorithm: Optional[
+            Dict[str, Any]
+        ] = pageserver_default_tenant_config_compaction_algorithm
+        if self.pageserver_default_tenant_config_compaction_algorithm is not None:
+            log.debug(
+                f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}"
+            )
+
         self.pageserver_get_vectored_impl: Optional[str] = None
         if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored":
             self.pageserver_get_vectored_impl = "vectored"
@@ -701,6 +713,10 @@ class NeonEnvBuilder:
         config["default_tenant_id"] = snapshot_config["default_tenant_id"]
         config["branch_name_mappings"] = snapshot_config["branch_name_mappings"]
 
+        # Update the config with new neon + postgres path in case of compat test
+        config["pg_distrib_dir"] = str(self.pg_distrib_dir)
+        config["neon_distrib_dir"] = str(self.neon_binpath)
+
         with (self.repo_dir / "config").open("w") as f:
             toml.dump(config, f)
 
@@ -1054,14 +1070,14 @@ class NeonEnv:
         self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
         self.pageserver_aux_file_policy = config.pageserver_aux_file_policy
 
-        # Create a config file corresponding to the options
+        # Create the neon_local's `NeonLocalInitConf`
         cfg: Dict[str, Any] = {
             "default_tenant_id": str(self.initial_tenant),
             "broker": {
                 "listen_addr": self.broker.listen_addr(),
             },
-            "pageservers": [],
             "safekeepers": [],
+            "pageservers": [],
         }
 
         if self.control_plane_api is not None:
@@ -1099,6 +1115,26 @@ class NeonEnv:
                 ps_cfg["get_impl"] = config.pageserver_get_impl
             if config.pageserver_validate_vectored_get is not None:
                 ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get
+            if config.pageserver_default_tenant_config_compaction_algorithm is not None:
+                tenant_config = ps_cfg.setdefault("tenant_config", {})
+                tenant_config[
+                    "compaction_algorithm"
+                ] = config.pageserver_default_tenant_config_compaction_algorithm
+
+            if self.pageserver_remote_storage is not None:
+                ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
+                    self.pageserver_remote_storage
+                )
+
+            if config.pageserver_config_override is not None:
+                if callable(config.pageserver_config_override):
+                    config.pageserver_config_override(ps_cfg)
+                else:
+                    assert isinstance(config.pageserver_config_override, str)
+                    for o in config.pageserver_config_override.split(";"):
+                        override = toml.loads(o)
+                        for key, value in override.items():
+                            ps_cfg[key] = value
 
             # Create a corresponding NeonPageserver object
             self.pageservers.append(
@@ -1136,7 +1172,6 @@ class NeonEnv:
         self.neon_cli.init(
             cfg,
             force=config.config_init_force,
-            pageserver_config_override=config.pageserver_config_override,
         )
 
     def start(self):
@@ -1290,6 +1325,7 @@ def _shared_simple_env(
     pg_version: PgVersion,
     pageserver_virtual_file_io_engine: str,
     pageserver_aux_file_policy: Optional[AuxFileStore],
+    pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
 ) -> Iterator[NeonEnv]:
     """
     # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
@@ -1321,6 +1357,7 @@ def _shared_simple_env(
         test_output_dir=test_output_dir,
         pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
         pageserver_aux_file_policy=pageserver_aux_file_policy,
+        pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
     ) as builder:
         env = builder.init_start()
 
@@ -1360,7 +1397,8 @@ def neon_env_builder(
     test_overlay_dir: Path,
     top_output_dir: Path,
     pageserver_virtual_file_io_engine: str,
-    pageserver_aux_file_policy: Optional[AuxFileStore] = None,
+    pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
+    pageserver_aux_file_policy: Optional[AuxFileStore],
 ) -> Iterator[NeonEnvBuilder]:
     """
     Fixture to create a Neon environment for test.
@@ -1395,6 +1433,7 @@ def neon_env_builder(
         test_output_dir=test_output_dir,
         test_overlay_dir=test_overlay_dir,
         pageserver_aux_file_policy=pageserver_aux_file_policy,
+        pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm,
     ) as builder:
         yield builder
 
@@ -1586,7 +1625,7 @@ class NeonCli(AbstractNeonCli):
             args.extend(["-c", "switch_aux_file_policy:v1"])
 
         if aux_file_v2 is AuxFileStore.CrossValidation:
-            args.extend(["-c", "switch_aux_file_policy:cross_validation"])
+            args.extend(["-c", "switch_aux_file_policy:cross-validation"])
 
         if set_default:
             args.append("--set-default")
@@ -1722,46 +1761,22 @@ class NeonCli(AbstractNeonCli):
 
     def init(
         self,
-        config: Dict[str, Any],
+        init_config: Dict[str, Any],
         force: Optional[str] = None,
-        pageserver_config_override: Optional[str] = None,
     ) -> "subprocess.CompletedProcess[str]":
-        remote_storage = self.env.pageserver_remote_storage
-
-        ps_config = {}
-        if remote_storage is not None:
-            ps_config["remote_storage"] = remote_storage_to_toml_dict(remote_storage)
-
-        if pageserver_config_override is not None:
-            for o in pageserver_config_override.split(";"):
-                override = toml.loads(o)
-                for key, value in override.items():
-                    ps_config[key] = value
-
-        with ExitStack() as stack:
-            ps_config_file = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+"))
-            ps_config_file.write(toml.dumps(ps_config))
-            ps_config_file.flush()
-
-            neon_local_config = stack.enter_context(tempfile.NamedTemporaryFile(mode="w+"))
-            neon_local_config.write(toml.dumps(config))
-            neon_local_config.flush()
+        with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile:
+            init_config_tmpfile.write(toml.dumps(init_config))
+            init_config_tmpfile.flush()
 
             cmd = [
                 "init",
-                f"--config={neon_local_config.name}",
-                "--pg-version",
-                self.env.pg_version,
-                f"--pageserver-config={ps_config_file.name}",
+                f"--config={init_config_tmpfile.name}",
             ]
 
             if force is not None:
                 cmd.extend(["--force", force])
 
-            s3_env_vars = None
-            if isinstance(remote_storage, S3Storage):
-                s3_env_vars = remote_storage.access_env_vars()
-            res = self.raw_cli(cmd, extra_env_vars=s3_env_vars)
+            res = self.raw_cli(cmd)
             res.check_returncode()
         return res
 
@@ -2678,7 +2693,7 @@ class NeonPageserver(PgProtocol, LogUtils):
         )
 
     def layer_exists(
-        self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerFileName
+        self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerName
     ) -> bool:
         layers = self.list_layers(tenant_id, timeline_id)
         return layer_name in [parse_layer_file_name(p.name) for p in layers]
@@ -2706,7 +2721,12 @@ class PgBin:
         env.update(env_add)
         return env
 
-    def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
+    def run(
+        self,
+        command: List[str],
+        env: Optional[Env] = None,
+        cwd: Optional[Union[str, Path]] = None,
+    ):
         """
         Run one of the postgres binaries.
 
@@ -2768,6 +2788,28 @@ class PgBin:
         log.info(f"last checkpoint at {checkpoint_lsn}")
         return Lsn(checkpoint_lsn)
 
+    def take_fullbackup(
+        self,
+        pageserver: NeonPageserver,
+        tenant: TenantId,
+        timeline: TimelineId,
+        lsn: Lsn,
+        output: Path,
+    ):
+        """
+        Request fullbackup from pageserver, store it at 'output'.
+        """
+        cmd = [
+            "psql",
+            "--no-psqlrc",
+            pageserver.connstr(),
+            "-c",
+            f"fullbackup {tenant} {timeline} {lsn}",
+            "-o",
+            str(output),
+        ]
+        self.run_capture(cmd)
+
 
 @pytest.fixture(scope="function")
 def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin:
@@ -3363,6 +3405,13 @@ class Endpoint(PgProtocol):
         self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
         # path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf
 
+    def http_client(
+        self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
+    ) -> EndpointHttpClient:
+        return EndpointHttpClient(
+            port=self.http_port,
+        )
+
     def create(
         self,
         branch_name: str,
@@ -4123,7 +4172,12 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
 
 
 # pg is the existing and running compute node, that we want to compare with a basebackup
-def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
+def check_restored_datadir_content(
+    test_output_dir: Path,
+    env: NeonEnv,
+    endpoint: Endpoint,
+    ignored_files: Optional[list[str]] = None,
+):
     pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
 
     # Get the timeline ID. We need it for the 'basebackup' command
@@ -4176,6 +4230,10 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
             if not f.startswith("pg_xact") and not f.startswith("pg_multixact")
         ]
 
+    if ignored_files:
+        pgdata_files = [f for f in pgdata_files if f not in ignored_files]
+        restored_files = [f for f in restored_files if f not in ignored_files]
+
     # check that file sets are equal
     assert pgdata_files == restored_files
 
@@ -4266,6 +4324,17 @@ def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint):
         time.sleep(1)
 
 
+def log_replica_lag(primary: Endpoint, secondary: Endpoint):
+    last_replay_lsn = Lsn(
+        secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False)
+    )
+    primary_lsn = Lsn(
+        primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False)
+    )
+    lag = primary_lsn - last_replay_lsn
+    log.info(f"primary_lsn={primary_lsn}, replay_lsn={last_replay_lsn}, lag={lag}")
+
+
 def wait_for_last_flush_lsn(
     env: NeonEnv,
     endpoint: Endpoint,
@@ -4411,3 +4480,79 @@ def parse_project_git_version_output(s: str) -> str:
         return commit
 
     raise ValueError(f"unable to parse --version output: '{s}'")
+
+
+def generate_uploads_and_deletions(
+    env: NeonEnv,
+    *,
+    init: bool = True,
+    tenant_id: Optional[TenantId] = None,
+    timeline_id: Optional[TimelineId] = None,
+    data: Optional[str] = None,
+    pageserver: NeonPageserver,
+):
+    """
+    Using the environment's default tenant + timeline, generate a load pattern
+    that results in some uploads and some deletions to remote storage.
+    """
+
+    if tenant_id is None:
+        tenant_id = env.initial_tenant
+    assert tenant_id is not None
+
+    if timeline_id is None:
+        timeline_id = env.initial_timeline
+    assert timeline_id is not None
+
+    ps_http = pageserver.http_client()
+
+    with env.endpoints.create_start(
+        "main", tenant_id=tenant_id, pageserver_id=pageserver.id
+    ) as endpoint:
+        if init:
+            endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+            last_flush_lsn_upload(
+                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+            )
+
+        def churn(data):
+            endpoint.safe_psql_many(
+                [
+                    f"""
+                INSERT INTO foo (id, val)
+                SELECT g, '{data}'
+                FROM generate_series(1, 200) g
+                ON CONFLICT (id) DO UPDATE
+                SET val = EXCLUDED.val
+                """,
+                    # to ensure that GC can actually remove some layers
+                    "VACUUM foo",
+                ]
+            )
+            assert tenant_id is not None
+            assert timeline_id is not None
+            # We are waiting for uploads as well as local flush, in order to avoid leaving the system
+            # in a state where there are "future layers" in remote storage that will generate deletions
+            # after a restart.
+            last_flush_lsn_upload(
+                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
+            )
+
+        # Compaction should generate some GC-elegible layers
+        for i in range(0, 2):
+            churn(f"{i if data is None else data}")
+
+        gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
+        print_gc_result(gc_result)
+        assert gc_result["layers_removed"] > 0
+
+        # Stop endpoint and flush all data to pageserver, then checkpoint it: this
+        # ensures that the pageserver is in a fully idle state: there will be no more
+        # background ingest, no more uploads pending, and therefore no non-determinism
+        # in subsequent actions like pageserver restarts.
+        final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id)
+        # Finish uploads
+        wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
+        # Finish all remote writes (including deletions)
+        wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index e560844944..fa6e4eaafd 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -70,6 +70,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     # this is expected given our collaborative shutdown approach for the UploadQueue
     ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
     ".*Compaction failed.*, retrying in .*: ShuttingDown",
+    ".*Compaction failed.*, retrying in .*: timeline shutting down.*",
     # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
     ".*Error processing HTTP request: NotFound: Timeline .* was not found",
     ".*took more than expected to complete.*",
@@ -88,9 +89,13 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*Flushed oversized open layer with size.*",
     # During teardown, we stop the storage controller before the pageservers, so pageservers
     # can experience connection errors doing background deletion queue work.
-    ".*WARN deletion backend: calling control plane generation validation API failed.*Connection refused.*",
+    ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*",
     # Can happen when the test shuts down the storage controller while it is calling the utilization API
     ".*WARN.*path=/v1/utilization .*request was dropped before completing",
+    # Can happen during shutdown
+    ".*scheduling deletion on drop failed: queue is in state Stopped.*",
+    # Can happen during shutdown
+    ".*ignoring failure to find gc cutoffs: timeline shutting down.*",
 )
 
 
@@ -131,9 +136,10 @@ if __name__ == "__main__":
         "-i",
         "--input",
         type=argparse.FileType("r"),
-        default=sys.stdin,
-        help="Pageserver logs file. Reads from stdin if no file is provided.",
+        help="Pageserver logs file. Use '-' for stdin.",
+        required=True,
     )
+
     args = parser.parse_args()
     errors = _check_allowed_errors(args.input)
 
diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/common_types.py
similarity index 79%
rename from test_runner/fixtures/pageserver/types.py
rename to test_runner/fixtures/pageserver/common_types.py
index fd018cb778..a6c327a8a0 100644
--- a/test_runner/fixtures/pageserver/types.py
+++ b/test_runner/fixtures/pageserver/common_types.py
@@ -2,7 +2,7 @@ import re
 from dataclasses import dataclass
 from typing import Any, Dict, Tuple, Union
 
-from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn
+from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn
 
 
 @dataclass
@@ -12,7 +12,7 @@ class IndexLayerMetadata:
 
 
 @dataclass(frozen=True)
-class ImageLayerFileName:
+class ImageLayerName:
     lsn: Lsn
     key_start: Key
     key_end: Key
@@ -26,7 +26,7 @@ class ImageLayerFileName:
 
 
 @dataclass(frozen=True)
-class DeltaLayerFileName:
+class DeltaLayerName:
     lsn_start: Lsn
     lsn_end: Lsn
     key_start: Key
@@ -41,14 +41,16 @@ class DeltaLayerFileName:
         return ret
 
 
-LayerFileName = Union[ImageLayerFileName, DeltaLayerFileName]
+LayerName = Union[ImageLayerName, DeltaLayerName]
 
 
 class InvalidFileName(Exception):
     pass
 
 
-IMAGE_LAYER_FILE_NAME = re.compile("^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-[a-f0-9]{8})?$")
+IMAGE_LAYER_FILE_NAME = re.compile(
+    "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-v1-[a-f0-9]{8})?$"
+)
 
 
 def parse_image_layer(f_name: str) -> Tuple[int, int, int]:
@@ -62,7 +64,7 @@ def parse_image_layer(f_name: str) -> Tuple[int, int, int]:
 
 
 DELTA_LAYER_FILE_NAME = re.compile(
-    "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-[a-f0-9]{8})?$"
+    "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-v1-[a-f0-9]{8})?$"
 )
 
 
@@ -80,16 +82,16 @@ def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]:
     )
 
 
-def parse_layer_file_name(file_name: str) -> LayerFileName:
+def parse_layer_file_name(file_name: str) -> LayerName:
     try:
         key_start, key_end, lsn = parse_image_layer(file_name)
-        return ImageLayerFileName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end))
+        return ImageLayerName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end))
     except InvalidFileName:
         pass
 
     try:
         key_start, key_end, lsn_start, lsn_end = parse_delta_layer(file_name)
-        return DeltaLayerFileName(
+        return DeltaLayerName(
             lsn_start=Lsn(lsn_start),
             lsn_end=Lsn(lsn_end),
             key_start=Key(key_start),
@@ -101,18 +103,15 @@ def parse_layer_file_name(file_name: str) -> LayerFileName:
     raise InvalidFileName("neither image nor delta layer")
 
 
-def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn):
+def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn):
     """
     Determines if this layer file is considered to be in future meaning we will discard these
     layers during timeline initialization from the given disk_consistent_lsn.
     """
-    if (
-        isinstance(layer_file_name, ImageLayerFileName)
-        and layer_file_name.lsn > disk_consistent_lsn
-    ):
+    if isinstance(layer_file_name, ImageLayerName) and layer_file_name.lsn > disk_consistent_lsn:
         return True
     elif (
-        isinstance(layer_file_name, DeltaLayerFileName)
+        isinstance(layer_file_name, DeltaLayerName)
         and layer_file_name.lsn_end > disk_consistent_lsn + 1
     ):
         return True
@@ -122,7 +121,7 @@ def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn):
 
 @dataclass
 class IndexPartDump:
-    layer_metadata: Dict[LayerFileName, IndexLayerMetadata]
+    layer_metadata: Dict[LayerName, IndexLayerMetadata]
     disk_consistent_lsn: Lsn
 
     @classmethod
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index b06972056c..f1f96f6d5f 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -11,10 +11,10 @@ import requests
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pg_version import PgVersion
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import Fn
 
 
@@ -56,20 +56,30 @@ class InMemoryLayerInfo:
 class HistoricLayerInfo:
     kind: str
     layer_file_name: str
-    layer_file_size: Optional[int]
+    layer_file_size: int
     lsn_start: str
     lsn_end: Optional[str]
     remote: bool
+    # None for image layers, true if pageserver thinks this is an L0 delta layer
+    l0: Optional[bool]
 
     @classmethod
     def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
+        # instead of parsing the key range lets keep the definition of "L0" in pageserver
+        l0_ness = d.get("l0")
+        assert l0_ness is None or isinstance(l0_ness, bool)
+
+        size = d["layer_file_size"]
+        assert isinstance(size, int)
+
         return HistoricLayerInfo(
             kind=d["kind"],
             layer_file_name=d["layer_file_name"],
-            layer_file_size=d.get("layer_file_size"),
+            layer_file_size=size,
             lsn_start=d["lsn_start"],
             lsn_end=d.get("lsn_end"),
             remote=d["remote"],
+            l0=l0_ness,
         )
 
 
@@ -583,6 +593,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         timeline_id: TimelineId,
         force_repartition=False,
         force_image_layer_creation=False,
+        wait_until_uploaded=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
@@ -590,6 +601,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
             query["force_repartition"] = "true"
         if force_image_layer_creation:
             query["force_image_layer_creation"] = "true"
+        if wait_until_uploaded:
+            query["wait_until_uploaded"] = "true"
 
         log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
@@ -656,6 +669,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         timeline_id: TimelineId,
         force_repartition=False,
         force_image_layer_creation=False,
+        wait_until_uploaded=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
@@ -663,6 +677,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
             query["force_repartition"] = "true"
         if force_image_layer_creation:
             query["force_image_layer_creation"] = "true"
+        if wait_until_uploaded:
+            query["wait_until_uploaded"] = "true"
 
         log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
@@ -890,3 +906,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         assert current_logical_size == non_incremental
         assert isinstance(current_logical_size, int)
         return current_logical_size
+
+    def top_tenants(
+        self, order_by: str, limit: int, where_shards_lt: int, where_gt: int
+    ) -> dict[Any, Any]:
+        res = self.post(
+            f"http://localhost:{self.port}/v1/top_tenants",
+            json={
+                "order_by": order_by,
+                "limit": limit,
+                "where_shards_lt": where_shards_lt,
+                "where_gt": where_gt,
+            },
+        )
+        self.verbose_error(res)
+        return res.json()  # type: ignore
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index f47a3ea043..def80a1c3e 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -3,6 +3,7 @@ import time
 from typing import Any, Callable, Dict, Tuple
 
 import fixtures.pageserver.remote_storage
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -12,7 +13,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_state,
 )
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.types import TenantId, TimelineId
 
 
 def single_timeline(
diff --git a/test_runner/fixtures/pageserver/remote_storage.py b/test_runner/fixtures/pageserver/remote_storage.py
index e6cd9b4614..0c3612716a 100644
--- a/test_runner/fixtures/pageserver/remote_storage.py
+++ b/test_runner/fixtures/pageserver/remote_storage.py
@@ -6,13 +6,13 @@ import threading
 from pathlib import Path
 from typing import Any, List, Tuple
 
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.neon_fixtures import NeonEnv, Pagectl
-from fixtures.pageserver.types import (
+from fixtures.pageserver.common_types import (
     InvalidFileName,
     parse_layer_file_name,
 )
 from fixtures.remote_storage import LocalFsStorage
-from fixtures.types import TenantId, TimelineId
 
 
 def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId):
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 4b0dd7a815..91435e8a1f 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -8,10 +8,10 @@ from mypy_boto3_s3.type_defs import (
     ObjectTypeDef,
 )
 
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import wait_until
 
 
diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py
index 77523a542b..0227285822 100644
--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -1,7 +1,8 @@
 import os
-from typing import Optional
+from typing import Any, Dict, Optional
 
 import pytest
+import toml
 from _pytest.python import Metafunc
 
 from fixtures.pg_version import PgVersion
@@ -37,6 +38,20 @@ def pageserver_aux_file_policy() -> Optional[AuxFileStore]:
     return None
 
 
+def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]:
+    toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
+    if toml_table is None:
+        return None
+    v = toml.loads(toml_table)
+    assert isinstance(v, dict)
+    return v
+
+
+@pytest.fixture(scope="function", autouse=True)
+def pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]:
+    return get_pageserver_default_tenant_config_compaction_algorithm()
+
+
 def pytest_generate_tests(metafunc: Metafunc):
     if (bt := os.getenv("BUILD_TYPE")) is None:
         build_types = ["debug", "release"]
@@ -60,6 +75,16 @@ def pytest_generate_tests(metafunc: Metafunc):
     ):
         metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])
 
+    # Same hack for pageserver_default_tenant_config_compaction_algorithm
+    if (
+        explicit_default := get_pageserver_default_tenant_config_compaction_algorithm()
+    ) is not None:
+        metafunc.parametrize(
+            "pageserver_default_tenant_config_compaction_algorithm",
+            [explicit_default],
+            ids=[explicit_default["kind"]],
+        )
+
     # For performance tests, parametrize also by platform
     if (
         "test_runner/performance" in metafunc.definition._nodeid
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 925e1b450f..ee18c53b52 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -12,8 +12,8 @@ import boto3
 import toml
 from mypy_boto3_s3 import S3Client
 
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.types import TenantId, TimelineId
 
 TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
 TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json"
@@ -50,7 +50,7 @@ class MockS3Server:
         # XXX: do not use `shell=True` or add `exec ` to the command here otherwise.
         # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux
         # if a process is started from the shell process.
-        self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"])
+        self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", f"-p{port}"])
         error = None
         try:
             return_code = self.subprocess.poll()
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index b9c1986818..82148d0556 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -6,8 +6,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import pytest
 import requests
 
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.types import Lsn, TenantId, TimelineId
 
 
 # Walreceiver as returned by sk's timeline status endpoint.
diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py
index 2818a493d6..0e4b5d7883 100644
--- a/test_runner/fixtures/safekeeper/utils.py
+++ b/test_runner/fixtures/safekeeper/utils.py
@@ -1,6 +1,6 @@
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.safekeeper.http import SafekeeperHttpClient
-from fixtures.types import TenantId, TimelineId
 
 
 def are_walreceivers_absent(
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 6470621900..22bb43c580 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -4,10 +4,13 @@ import json
 import os
 import re
 import subprocess
+import tarfile
 import threading
 import time
+from hashlib import sha256
 from pathlib import Path
 from typing import (
+    IO,
     TYPE_CHECKING,
     Any,
     Callable,
@@ -15,8 +18,10 @@ from typing import (
     Iterable,
     List,
     Optional,
+    Set,
     Tuple,
     TypeVar,
+    Union,
 )
 from urllib.parse import urlencode
 
@@ -25,14 +30,14 @@ import zstandard
 from psycopg2.extensions import cursor
 
 from fixtures.log_helper import log
-from fixtures.pageserver.types import (
+from fixtures.pageserver.common_types import (
     parse_delta_layer,
     parse_image_layer,
 )
 
 if TYPE_CHECKING:
     from fixtures.neon_fixtures import PgBin
-from fixtures.types import TimelineId
+from fixtures.common_types import TimelineId
 
 Fn = TypeVar("Fn", bound=Callable[..., Any])
 
@@ -452,6 +457,7 @@ def humantime_to_ms(humantime: str) -> float:
 
 
 def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]:
+    # FIXME: this duplicates test_runner/fixtures/pageserver/allowed_errors.py
     error_or_warn = re.compile(r"\s(ERROR|WARN)")
     errors = []
     for lineno, line in enumerate(input, start=1):
@@ -484,17 +490,62 @@ def assert_no_errors(log_file, service, allowed_errors):
     for _lineno, error in errors:
         log.info(f"not allowed {service} error: {error.strip()}")
 
-    assert not errors, f"Log errors on {service}: {errors[0]}"
+    assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add"
 
 
 @enum.unique
 class AuxFileStore(str, enum.Enum):
-    V1 = "V1"
-    V2 = "V2"
-    CrossValidation = "CrossValidation"
+    V1 = "v1"
+    V2 = "v2"
+    CrossValidation = "cross-validation"
 
     def __repr__(self) -> str:
         return f"'aux-{self.value}'"
 
     def __str__(self) -> str:
         return f"'aux-{self.value}'"
+
+
+def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str]):
+    """
+    This is essentially:
+
+    lines=$(comm -3 \
+        <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
+        <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \
+        | wc -l)
+    [ "$lines" = "0" ]
+
+    But in a more mac friendly fashion.
+    """
+    started_at = time.time()
+
+    def hash_extracted(reader: Union[IO[bytes], None]) -> bytes:
+        assert reader is not None
+        digest = sha256(usedforsecurity=False)
+        while True:
+            buf = reader.read(64 * 1024)
+            if not buf:
+                break
+            digest.update(buf)
+        return digest.digest()
+
+    def build_hash_list(p: Path) -> List[Tuple[str, bytes]]:
+        with tarfile.open(p) as f:
+            matching_files = (info for info in f if info.isreg() and info.name not in skip_files)
+            ret = list(
+                map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files)
+            )
+            ret.sort(key=lambda t: t[0])
+            return ret
+
+    left_list, right_list = map(build_hash_list, [left, right])
+
+    try:
+        assert len(left_list) == len(right_list)
+
+        for left_tuple, right_tuple in zip(left_list, right_list):
+            assert left_tuple == right_tuple
+    finally:
+        elapsed = time.time() - started_at
+        log.info(f"assert_pageserver_backups_equal completed in {elapsed}s")
diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py
index c44628ce06..dfd9caba3e 100644
--- a/test_runner/fixtures/workload.py
+++ b/test_runner/fixtures/workload.py
@@ -1,6 +1,7 @@
 import threading
 from typing import Any, Optional
 
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -10,7 +11,6 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
 )
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
-from fixtures.types import TenantId, TimelineId
 
 # neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex
 # to ensure we don't do that: this enables running lots of Workloads in parallel safely.
diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
new file mode 100644
index 0000000000..644c1f559b
--- /dev/null
+++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
@@ -0,0 +1,175 @@
+import json
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
+from fixtures.pageserver.utils import wait_for_upload_queue_empty
+from fixtures.remote_storage import s3_storage
+from fixtures.utils import humantime_to_ms
+
+
+@pytest.mark.parametrize("duration", [30])
+@pytest.mark.parametrize("io_engine", ["tokio-epoll-uring", "std-fs"])
+@pytest.mark.parametrize("concurrency_per_target", [1, 10, 100])
+@pytest.mark.timeout(1000)
+def test_download_churn(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    io_engine: str,
+    concurrency_per_target: int,
+    duration: int,
+):
+    def record(metric, **kwargs):
+        zenbenchmark.record(metric_name=f"pageserver_ondemand_download_churn.{metric}", **kwargs)
+
+    params: Dict[str, Tuple[Any, Dict[str, Any]]] = {}
+
+    # params from fixtures
+    params.update(
+        {
+            # we don't capture `duration`, but instead use the `runtime` output field from pagebench
+        }
+    )
+
+    # configure cache sizes like in prod
+    page_cache_size = 16384
+    max_file_descriptors = 500000
+    neon_env_builder.pageserver_config_override = (
+        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}"
+    )
+    params.update(
+        {
+            "pageserver_config_override.page_cache_size": (
+                page_cache_size * 8192,
+                {"unit": "byte"},
+            ),
+            "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+        }
+    )
+
+    for param, (value, kwargs) in params.items():
+        record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
+
+    # Setup env
+    env = setup_env(neon_env_builder, pg_bin)
+    env.pageserver.allowed_errors.append(
+        f".*path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
+    )
+
+    run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration)
+
+
+def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    # We configure tenant conf such that SQL query below produces a lot of layers.
+    # We don't care what's in the layers really, we just care that layers are created.
+    bytes_per_layer = 10 * (1024**2)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "pitr_interval": "1000d",  # let's not make it get in the way
+            "gc_period": "0s",  # disable periodic gc to avoid noise
+            "compaction_period": "0s",  # disable L0=>L1 compaction
+            "checkpoint_timeout": "10years",  # rely solely on checkpoint_distance
+            "checkpoint_distance": bytes_per_layer,  # 10M instead of 256M to create more smaller layers
+            "image_creation_threshold": 100000,  # don't create image layers ever
+        }
+    )
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    client = env.pageserver.http_client()
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as ep:
+        ep.safe_psql("CREATE TABLE data (random_text text)")
+        bytes_per_row = 512  # make big enough so WAL record size doesn't dominate
+        desired_layers = 300
+        desired_bytes = bytes_per_layer * desired_layers
+        nrows = desired_bytes / bytes_per_row
+        ep.safe_psql(
+            f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)})  as i",
+            options="-c statement_timeout=0",
+        )
+        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
+    # TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here
+    wait_for_upload_queue_empty(client, tenant_id, timeline_id)
+
+    return env
+
+
+def run_benchmark(
+    env: NeonEnv,
+    pg_bin: PgBin,
+    record,
+    io_engine: str,
+    concurrency_per_target: int,
+    duration_secs: int,
+):
+    ps_http = env.pageserver.http_client()
+    cmd = [
+        str(env.neon_binpath / "pagebench"),
+        "ondemand-download-churn",
+        "--mgmt-api-endpoint",
+        ps_http.base_url,
+        "--runtime",
+        f"{duration_secs}s",
+        "--set-io-engine",
+        f"{io_engine}",
+        "--concurrency-per-target",
+        f"{concurrency_per_target}",
+        # don't specify the targets explicitly, let pagebench auto-discover them
+    ]
+
+    log.info(f"command: {' '.join(cmd)}")
+    basepath = pg_bin.run_capture(cmd, with_command_header=False)
+    results_path = Path(basepath + ".stdout")
+    log.info(f"Benchmark results at: {results_path}")
+
+    with open(results_path, "r") as f:
+        results = json.load(f)
+    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+
+    metric = "downloads_count"
+    record(
+        metric,
+        metric_value=results[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "downloads_bytes"
+    record(
+        metric,
+        metric_value=results[metric],
+        unit="byte",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "evictions_count"
+    record(
+        metric,
+        metric_value=results[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "timeline_restarts"
+    record(
+        metric,
+        metric_value=results[metric],
+        unit="",
+        report=MetricReport.LOWER_IS_BETTER,
+    )
+
+    metric = "runtime"
+    record(
+        metric,
+        metric_value=humantime_to_ms(results[metric]) / 1000,
+        unit="s",
+        report=MetricReport.TEST_PARAM,
+    )
diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py
index 009d62c9ba..f31cd9a9f8 100644
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -5,13 +5,13 @@ Utilities used by all code in this sub-directory
 from typing import Any, Callable, Dict, Tuple
 
 import fixtures.pageserver.many_tenants as many_tenants
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
 )
 from fixtures.pageserver.utils import wait_until_all_tenants_state
-from fixtures.types import TenantId, TimelineId
 
 
 def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index 7687b8417f..b3866f1813 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -9,11 +9,11 @@ from typing import List
 
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonPageserver
 from fixtures.pageserver.utils import wait_for_last_record_lsn
-from fixtures.types import Lsn
 from fixtures.utils import wait_until
 from prometheus_client.samples import Sample
 
diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py
index 1df3f2f5f1..3f56da7c1d 100644
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -2,10 +2,10 @@ from contextlib import closing
 
 import pytest
 from fixtures.benchmark_fixture import MetricReport
+from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare, PgCompare
 from fixtures.pageserver.utils import wait_tenant_status_404
 from fixtures.pg_version import PgVersion
-from fixtures.types import Lsn
 
 
 #
diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py
new file mode 100644
index 0000000000..9cd83f0959
--- /dev/null
+++ b/test_runner/performance/test_sharding_autosplit.py
@@ -0,0 +1,280 @@
+import concurrent.futures
+import re
+from pathlib import Path
+
+import pytest
+from fixtures.common_types import TenantId, TimelineId
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PgBin,
+    tenant_get_shards,
+)
+
+
+@pytest.mark.timeout(600)
+def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    """
+    Check that sharding, including auto-splitting, "just works" under pgbench workloads.
+
+    This is not a benchmark, but it lives in the same place as benchmarks in order to be run
+    on a dedicated node that can sustain some significant throughput.
+
+    Other tests validate the details of shard splitting, error cases etc.  This test is
+    the sanity check that it all really works as expected with realistic amounts of data
+    and under load.
+
+    Success conditions:
+    - Tenants auto-split when their capacity grows
+    - Client workloads are not interrupted while that happens
+    """
+
+    neon_env_builder.num_pageservers = 8
+    neon_env_builder.storage_controller_config = {
+        # Split tenants at 500MB: it's up to the storage controller how it interprets this (logical
+        # sizes, physical sizes, etc).  We will write this much data logically, therefore other sizes
+        # will reliably be greater.
+        "split_threshold": 1024 * 1024 * 500
+    }
+
+    tenant_conf = {
+        # We want layer rewrites to happen as soon as possible (this is the most stressful
+        # case for the system), so set PITR interval to something tiny.
+        "pitr_interval": "5s",
+        # Scaled down thresholds.  We will run at ~1GB scale but would like to emulate
+        # the behavior of a system running at ~100GB scale.
+        "checkpoint_distance": f"{1024 * 1024}",
+        "compaction_threshold": "1",
+        "compaction_target_size": f"{1024 * 1024}",
+        "image_creation_threshold": "2",
+        "image_layer_creation_check_threshold": "0",
+    }
+
+    env = neon_env_builder.init_start()
+
+    for ps in env.pageservers:
+        ps.allowed_errors.extend(
+            [
+                # We shut down pageservers while they might have some compaction work going on
+                ".*Compaction failed.*shutting down.*"
+            ]
+        )
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # The neon_local functionality for updating computes is flaky for unknown reasons
+            ".*Local notification hook failed.*",
+            ".*Marking shard.*for notification retry.*",
+            ".*Failed to notify compute.*",
+        ]
+    )
+
+    # Total tenants
+    tenant_count = 4
+
+    # Transaction rate: we set this rather than running at full-speed because we
+    # might run on a slow node that doesn't cope well with many full-speed pgbenches running concurrently.
+    transaction_rate = 100
+
+    class TenantState:
+        def __init__(self, timeline_id, endpoint):
+            self.timeline_id = timeline_id
+            self.endpoint = endpoint
+
+    # Create tenants
+    tenants = {}
+    for tenant_id in set(TenantId.generate() for _i in range(0, tenant_count)):
+        timeline_id = TimelineId.generate()
+        env.neon_cli.create_tenant(tenant_id, timeline_id, conf=tenant_conf)
+        endpoint = env.endpoints.create("main", tenant_id=tenant_id)
+        tenants[tenant_id] = TenantState(timeline_id, endpoint)
+        endpoint.start()
+
+    def run_pgbench_init(endpoint):
+        pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-i",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+    def check_pgbench_output(out_path: str):
+        """
+        When we run pgbench, we want not just an absence of errors, but also continuous evidence
+        of I/O progressing: our shard splitting and migration should not interrrupt the benchmark.
+        """
+        matched_lines = 0
+        stderr = Path(f"{out_path}.stderr").read_text()
+
+        low_watermark = None
+
+        # Apply this as a threshold for what we consider an unacceptable interruption to I/O
+        min_tps = transaction_rate // 10
+
+        for line in stderr.split("\n"):
+            match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .* ([0-9]+) failed", line)
+            if match is None:
+                # Fall back to older-version pgbench output (omits failure count)
+                match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .*", line)
+                if match is None:
+                    continue
+                else:
+                    (_time, tps) = match.groups()
+                    tps = float(tps)
+                    failed = 0
+            else:
+                (_time, tps, failed) = match.groups()  # type: ignore
+                tps = float(tps)
+                failed = int(failed)
+
+            matched_lines += 1
+
+            if failed > 0:
+                raise RuntimeError(
+                    f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has failed > 0"
+                )
+
+            if low_watermark is None or low_watermark > tps:
+                low_watermark = tps
+
+            # Temporarily disabled: have seen some 0 tps regions on Hetzner runners, but not
+            # at the same time as a shard split.
+            # if tps < min_tps:
+            #     raise RuntimeError(
+            #         f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has tps < {min_tps}"
+            #     )
+
+        log.info(f"Checked {matched_lines} progress lines, lowest TPS was {min_tps}")
+
+        if matched_lines == 0:
+            raise RuntimeError(f"pgbench output at {out_path} contained no progress lines")
+
+    def run_pgbench_main(endpoint):
+        out_path = pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-T",
+                "180",
+                "-R",
+                f"{transaction_rate}",
+                "-P",
+                "1",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+        check_pgbench_output(out_path)
+
+    def run_pgbench_read(endpoint):
+        out_path = pg_bin.run_capture(
+            [
+                "pgbench",
+                "-s50",
+                "-T",
+                "30",
+                "-R",
+                f"{transaction_rate}",
+                "-S",
+                "-P",
+                "1",
+                f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres",
+            ]
+        )
+
+        check_pgbench_output(out_path)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench inits")
+        for fut in pgbench_futs:
+            fut.result()
+
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read/write pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    def assert_all_split():
+        for tenant_id in tenants.keys():
+            shards = tenant_get_shards(env, tenant_id)
+            assert len(shards) == 8
+
+    # This is not a wait_until, because we wanted the splits to happen _while_ pgbench is running: otherwise
+    # this test is not properly doing its job of validating that splits work nicely under load.
+    assert_all_split()
+
+    env.storage_controller.assert_log_contains(".*Successful auto-split.*")
+
+    # Log timeline sizes, useful for debug, and implicitly validates that the shards
+    # are available in the places the controller thinks they should be.
+    for tenant_id, tenant_state in tenants.items():
+        (shard_zero_id, shard_zero_ps) = tenant_get_shards(env, tenant_id)[0]
+        timeline_info = shard_zero_ps.http_client().timeline_detail(
+            shard_zero_id, tenant_state.timeline_id
+        )
+        log.info(f"{shard_zero_id} timeline: {timeline_info}")
+
+    # Run compaction for all tenants, restart endpoint so that on subsequent reads we will
+    # definitely hit pageserver for reads.  This compaction passis expected to drop unwanted
+    # layers but not do any rewrites (we're still in the same generation)
+    for tenant_id, tenant_state in tenants.items():
+        tenant_state.endpoint.stop()
+        for shard_id, shard_ps in tenant_get_shards(env, tenant_id):
+            shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None)
+            shard_ps.http_client().timeline_compact(shard_id, tenant_state.timeline_id)
+        tenant_state.endpoint.start()
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    env.storage_controller.consistency_check()
+
+    # Restart the storage controller
+    env.storage_controller.stop()
+    env.storage_controller.start()
+
+    env.storage_controller.consistency_check()
+
+    # Restart all pageservers
+    for ps in env.pageservers:
+        ps.stop()
+        ps.start()
+
+    # Freshen gc_info in Timeline, so that when compaction runs in the background in the
+    # subsequent pgbench period, the last_gc_cutoff is updated and enables the conditions for a rewrite to pass.
+    for tenant_id, tenant_state in tenants.items():
+        for shard_id, shard_ps in tenant_get_shards(env, tenant_id):
+            shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None)
+
+    # One last check data remains readable after everything has restarted
+    with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads:
+        pgbench_futs = []
+        for tenant_state in tenants.values():
+            fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint)
+            pgbench_futs.append(fut)
+
+        log.info("Waiting for pgbench read pass")
+        for fut in pgbench_futs:
+            fut.result()
+
+    # Assert that some rewrites happened
+    # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged
+    # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers)
diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
index 17dc96dabe..cb013ae8c3 100644
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -3,6 +3,7 @@ import random
 import time
 
 import pytest
+from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
@@ -10,7 +11,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pg_version import PgVersion
-from fixtures.types import TenantId, TenantShardId, TimelineId
 
 
 @pytest.mark.timeout(3600)  # super long running test: should go down as we optimize
@@ -102,6 +102,9 @@ def test_storage_controller_many_tenants(
                 tenant_id,
                 shard_count,
                 stripe_size,
+                # Upload heatmaps fast, so that secondary downloads happen promptly, enabling
+                # the controller's optimization migrations to proceed promptly.
+                tenant_config={"heatmap_period": "10s"},
                 placement_policy={"Attached": 1},
             )
             futs.append(f)
diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py
index 7eb244d378..513ebc74c3 100644
--- a/test_runner/performance/test_wal_backpressure.py
+++ b/test_runner/performance/test_wal_backpressure.py
@@ -6,10 +6,10 @@ from typing import Any, Callable, List
 
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.common_types import Lsn
 from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin
-from fixtures.types import Lsn
 
 from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix
 
diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
index 50243e3ea7..edf2a01337 100644
--- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
+++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj
@@ -8,7 +8,7 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Npgsql" Version="8.0.2" />
+    <PackageReference Include="Npgsql" Version="8.0.3" />
   </ItemGroup>
 
 </Project>
diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py
index d16d2d6a24..7e40081aa2 100644
--- a/test_runner/regress/test_ancestor_branch.py
+++ b/test_runner/regress/test_ancestor_branch.py
@@ -1,6 +1,6 @@
+from fixtures.common_types import TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
-from fixtures.types import TimelineId
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 693add422f..8c60b454d8 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -2,13 +2,13 @@ from dataclasses import dataclass
 from typing import Generator, Optional
 
 import pytest
+from fixtures.common_types import TenantId
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverApiException, TenantConfig
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.types import TenantId
 from fixtures.utils import wait_until
 
 
@@ -162,7 +162,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "checkpoint_distance": 10000,
         "checkpoint_timeout": "13m",
         "compaction_algorithm": {
-            "kind": "Tiered",
+            "kind": "tiered",
         },
         "eviction_policy": {
             "kind": "LayerAccessThreshold",
@@ -190,7 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "trace_read_requests": True,
         "walreceiver_connect_timeout": "13m",
         "image_layer_creation_check_threshold": 1,
-        "switch_aux_file_policy": "CrossValidation",
+        "switch_aux_file_policy": "cross-validation",
     }
 
     ps_http = env.pageserver.http_client()
diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py
index bb622c0d59..035ab2796f 100644
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -4,13 +4,13 @@ from pathlib import Path
 
 import psycopg2
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
     PgProtocol,
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
-from fixtures.types import TenantId, TimelineId
 
 
 def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient):
diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py
new file mode 100644
index 0000000000..5328aef156
--- /dev/null
+++ b/test_runner/regress/test_aux_files.py
@@ -0,0 +1,76 @@
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    AuxFileStore,
+    NeonEnvBuilder,
+    logical_replication_sync,
+)
+
+
+def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
+    env = neon_env_builder.init_start()
+    endpoint = env.endpoints.create_start("main")
+    client = env.pageserver.http_client()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    tenant_config = client.tenant_config(tenant_id).effective_config
+    tenant_config["switch_aux_file_policy"] = AuxFileStore.V2
+    client.set_tenant_config(tenant_id, tenant_config)
+    # aux file v2 is enabled on the write path, so for now, it should be unset (or null)
+    assert (
+        client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"]
+        is None
+    )
+
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()
+
+    cur.execute("create table t(pk integer primary key, payload integer)")
+    cur.execute(
+        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));"
+    )
+    cur.execute("create publication pub1 for table t, replication_example")
+
+    # now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils)
+    # instead of going through the full logical replication process.
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
+    vanilla_pg.safe_psql(
+        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);"
+    )
+    connstr = endpoint.connstr().replace("'", "''")
+    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
+    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
+
+    # Wait logical replication channel to be established
+    logical_replication_sync(vanilla_pg, endpoint)
+    vanilla_pg.stop()
+    endpoint.stop()
+
+    with env.pageserver.http_client() as client:
+        # aux file v2 flag should be enabled at this point
+        assert (
+            client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"]
+            == AuxFileStore.V2
+        )
+    with env.pageserver.http_client() as client:
+        tenant_config = client.tenant_config(tenant_id).effective_config
+        tenant_config["switch_aux_file_policy"] = "V1"
+        client.set_tenant_config(tenant_id, tenant_config)
+        # the flag should still be enabled
+        assert (
+            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
+                "last_aux_file_policy"
+            ]
+            == AuxFileStore.V2
+        )
+    env.pageserver.restart()
+    with env.pageserver.http_client() as client:
+        # aux file v2 flag should be persisted
+        assert (
+            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
+                "last_aux_file_policy"
+            ]
+            == AuxFileStore.V2
+        )
diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index ddd02238ea..eb503ddbfa 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -2,10 +2,10 @@ import threading
 import time
 
 import pytest
+from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
 from fixtures.pageserver.http import TimelineCreate406
-from fixtures.types import Lsn, TimelineId
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py
index b79cad979f..0a5336f5a2 100644
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -1,8 +1,8 @@
 import pytest
+from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.pageserver.http import TimelineCreate406
-from fixtures.types import Lsn, TimelineId
 from fixtures.utils import print_gc_result, query_scalar
 
 
@@ -11,8 +11,7 @@ from fixtures.utils import print_gc_result, query_scalar
 #
 def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     # Disable pitr, because here we want to test branch creation after GC
-    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
 
     error_regexes = [
         ".*invalid branch start lsn.*",
diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py
index 9fe9f77fea..03d6946c15 100644
--- a/test_runner/regress/test_branching.py
+++ b/test_runner/regress/test_branching.py
@@ -5,6 +5,7 @@ from concurrent.futures import ThreadPoolExecutor
 from typing import List
 
 import pytest
+from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -14,7 +15,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import wait_until_tenant_active
-from fixtures.types import Lsn, TimelineId
 from fixtures.utils import query_scalar
 from performance.test_perf_pgbench import get_scales_matrix
 from requests import RequestException
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index 1279c1bf81..61afd820ca 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -3,6 +3,7 @@ import os
 from typing import List, Tuple
 
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -11,7 +12,6 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
 )
 from fixtures.pg_version import PgVersion
-from fixtures.types import TenantId, TimelineId
 
 
 # Test restarting page server, while safekeeper and compute node keep
@@ -56,14 +56,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
     (tenant0, timeline0, pg0) = tenant_timelines[0]
     log.info(f"Timeline {tenant0}/{timeline0} is left intact")
 
-    (tenant1, timeline1, pg1) = tenant_timelines[1]
-    metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata"
-    with open(metadata_path, "w") as f:
-        f.write("overwritten with garbage!")
-    log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled")
-
-    (tenant2, timeline2, pg2) = tenant_timelines[2]
-    timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/"
+    (tenant1, timeline1, pg1) = tenant_timelines[2]
+    timeline_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/"
     for filename in os.listdir(timeline_path):
         if filename.startswith("00000"):
             # Looks like a layer file. Corrupt it
@@ -72,7 +66,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
             with open(p, "wb") as f:
                 f.truncate(0)
                 f.truncate(size)
-    log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled")
+    log.info(f"Timeline {tenant1}/{timeline1} got its local layer files spoiled")
 
     env.pageserver.start()
 
@@ -80,19 +74,15 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
     pg0.start()
     assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100
 
-    # Tenant with corrupt local metadata works: remote storage is authoritative for metadata
-    pg1.start()
-    assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100
-
     # Second timeline will fail during basebackup, because the local layer file is corrupt.
     # It will fail when we try to read (and reconstruct) a page from it, ergo the error message.
     # (We don't check layer file contents on startup, when loading the timeline)
     #
     # This will change when we implement checksums for layers
     with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err:
-        pg2.start()
+        pg1.start()
     log.info(
-        f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
+        f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}"
     )
 
 
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 43a3323462..4850a5c688 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -1,10 +1,12 @@
+import enum
 import json
 import os
 from typing import Optional
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.workload import Workload
 
 AGGRESIVE_COMPACTION_TENANT_CONF = {
@@ -163,7 +165,6 @@ def test_sharding_compaction(
                 image_layer_sizes[layer.layer_file_name] = layer.layer_file_size
 
                 # Pageserver should assert rather than emit an empty layer file, but double check here
-                assert layer.layer_file_size is not None
                 assert layer.layer_file_size > 0
 
         shard_has_image_layers.append(len(image_layer_sizes) > 1)
@@ -176,7 +177,7 @@ def test_sharding_compaction(
             #
             # We only do this check with tiny stripes, because large stripes may not give all shards enough
             # data to have statistically significant image layers
-            avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes)  # type: ignore
+            avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes)
             log.info(f"Shard {shard_id} average image layer size: {avg_size}")
             assert avg_size > compaction_target_size / 2
 
@@ -190,3 +191,61 @@ def test_sharding_compaction(
 
     # Assert that everything is still readable
     workload.validate()
+
+
+class CompactionAlgorithm(str, enum.Enum):
+    LEGACY = "legacy"
+    TIERED = "tiered"
+
+
+@pytest.mark.parametrize(
+    "compaction_algorithm", [CompactionAlgorithm.LEGACY, CompactionAlgorithm.TIERED]
+)
+def test_uploads_and_deletions(
+    neon_env_builder: NeonEnvBuilder,
+    compaction_algorithm: CompactionAlgorithm,
+):
+    """
+    :param compaction_algorithm: the compaction algorithm to use.
+    """
+
+    tenant_conf = {
+        # small checkpointing and compaction targets to ensure we generate many upload operations
+        "checkpoint_distance": f"{128 * 1024}",
+        "compaction_threshold": "1",
+        "compaction_target_size": f"{128 * 1024}",
+        # no PITR horizon, we specify the horizon when we request on-demand GC
+        "pitr_interval": "0s",
+        # disable background compaction and GC. We invoke it manually when we want it to happen.
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # create image layers eagerly, so that GC can remove some layers
+        "image_creation_threshold": "1",
+        "image_layer_creation_check_threshold": "0",
+        "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}),
+    }
+    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
+
+    # TODO remove these allowed errors
+    # https://github.com/neondatabase/neon/issues/7707
+    # https://github.com/neondatabase/neon/issues/7759
+    allowed_errors = [
+        ".*duplicated L1 layer.*",
+        ".*delta layer created with.*duplicate values.*",
+        ".*assertion failed: self.lsn_range.start <= lsn.*",
+        ".*HTTP request handler task panicked: task.*panicked.*",
+    ]
+    if compaction_algorithm == CompactionAlgorithm.TIERED:
+        env.pageserver.allowed_errors.extend(allowed_errors)
+
+    try:
+        generate_uploads_and_deletions(env, pageserver=env.pageserver)
+    except PageserverApiException as e:
+        log.info(f"Obtained PageserverApiException: {e}")
+
+    # The errors occur flakily and no error is ensured to occur,
+    # however at least one of them occurs.
+    if compaction_algorithm == CompactionAlgorithm.TIERED:
+        found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors)
+        if not found_allowed_error:
+            raise Exception("None of the allowed_errors occured in the log")
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index 787c114fc1..65649e0c0a 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -1,4 +1,5 @@
 import os
+import re
 import shutil
 import subprocess
 import tempfile
@@ -7,6 +8,7 @@ from typing import List, Optional
 
 import pytest
 import toml
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -21,7 +23,6 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn
 
 #
 # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases.
@@ -245,14 +246,34 @@ def test_forward_compatibility(
             compatibility_snapshot_dir / "repo",
         )
 
+        # not using env.pageserver.version because it was initialized before
+        prev_pageserver_version_str = env.get_binary_version("pageserver")
+        prev_pageserver_version_match = re.search(
+            "Neon page server git-env:(.*) failpoints: (.*), features: (.*)",
+            prev_pageserver_version_str,
+        )
+        if prev_pageserver_version_match is not None:
+            prev_pageserver_version = prev_pageserver_version_match.group(1)
+        else:
+            raise AssertionError(
+                "cannot find git hash in the version string: " + prev_pageserver_version_str
+            )
+
+        # does not include logs from previous runs
+        assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version)
+
         neon_env_builder.start()
 
+        # ensure the specified pageserver is running
+        assert env.pageserver.log_contains("git-env:" + prev_pageserver_version)
+
         check_neon_works(
             env,
             test_output_dir=test_output_dir,
             sql_dump_path=compatibility_snapshot_dir / "dump.sql",
             repo_dir=env.repo_dir,
         )
+
     except Exception:
         if breaking_changes_allowed:
             pytest.xfail(
diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py
new file mode 100644
index 0000000000..dd36190fcd
--- /dev/null
+++ b/test_runner/regress/test_compute_catalog.py
@@ -0,0 +1,34 @@
+import requests
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_compute_catalog(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_config", "empty")
+
+    endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"])
+    client = endpoint.http_client()
+
+    objects = client.dbs_and_roles()
+
+    # Assert that 'cloud_admin' role exists in the 'roles' list
+    assert any(
+        role["name"] == "cloud_admin" for role in objects["roles"]
+    ), "The 'cloud_admin' role is missing"
+
+    # Assert that 'postgres' database exists in the 'databases' list
+    assert any(
+        db["name"] == "postgres" for db in objects["databases"]
+    ), "The 'postgres' database is missing"
+
+    ddl = client.database_schema(database="postgres")
+
+    assert "-- PostgreSQL database dump" in ddl
+
+    try:
+        client.database_schema(database="nonexistentdb")
+        raise AssertionError("Expected HTTPError was not raised")
+    except requests.exceptions.HTTPError as e:
+        assert (
+            e.response.status_code == 404
+        ), f"Expected 404 status code, but got {e.response.status_code}"
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 5e9efa7cce..7ae2352c06 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -5,6 +5,7 @@ from dataclasses import dataclass
 from typing import Any, Dict, Iterable, Tuple
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -16,7 +17,6 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_upload_queue_empty
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
 GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
@@ -623,15 +623,16 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
             ratio = count_now / original_count
             abs_diff = abs(ratio - expected_ratio)
             assert original_count > count_now
-            log.info(
-                f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < 0.1"
-            )
 
+            expectation = 0.06
+            log.info(
+                f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}"
+            )
             # in this test case both relative_spare and relative_equal produce
             # the same outcomes; this must be a quantization effect of similar
             # sizes (-s4 and -s6) and small (5MB) layer size.
             # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02
-            assert abs_diff < 0.05
+            assert abs_diff < expectation
 
 
 @pytest.mark.parametrize(
diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py
index 7471338ce5..0ebb99c712 100644
--- a/test_runner/regress/test_duplicate_layers.py
+++ b/test_runner/regress/test_duplicate_layers.py
@@ -2,7 +2,7 @@ import time
 
 import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
-from fixtures.pageserver.types import parse_layer_file_name
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
     wait_for_upload_queue_empty,
diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py
index d5f898492b..e6d51a77a6 100644
--- a/test_runner/regress/test_fullbackup.py
+++ b/test_runner/regress/test_fullbackup.py
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -8,7 +9,6 @@ from fixtures.neon_fixtures import (
     VanillaPostgres,
 )
 from fixtures.port_distributor import PortDistributor
-from fixtures.types import Lsn, TimelineId
 from fixtures.utils import query_scalar, subprocess_capture
 
 num_rows = 1000
@@ -19,17 +19,16 @@ def test_fullbackup(
     neon_env_builder: NeonEnvBuilder,
     pg_bin: PgBin,
     port_distributor: PortDistributor,
-    pg_distrib_dir: Path,
     test_output_dir: Path,
 ):
     env = neon_env_builder.init_start()
 
-    env.neon_cli.create_branch("test_fullbackup")
-    endpoint_main = env.endpoints.create_start("test_fullbackup")
+    # endpoint needs to be alive until the fullbackup so that we have
+    # prev_record_lsn for the vanilla_pg to start in read-write mode
+    # for some reason this does not happen if endpoint is shutdown.
+    endpoint_main = env.endpoints.create_start("main")
 
     with endpoint_main.cursor() as cur:
-        timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
-
         # data loading may take a while, so increase statement timeout
         cur.execute("SET statement_timeout='300s'")
         cur.execute(
@@ -41,17 +40,13 @@ def test_fullbackup(
         lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
         log.info(f"start_backup_lsn = {lsn}")
 
-    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
-    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
     # Get and unpack fullbackup from pageserver
     restored_dir_path = env.repo_dir / "restored_datadir"
     os.mkdir(restored_dir_path, 0o750)
-    query = f"fullbackup {env.initial_tenant} {timeline} {lsn}"
     tar_output_file = test_output_dir / "fullbackup.tar"
-    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, env.initial_timeline, lsn, tar_output_file
+    )
     subprocess_capture(
         env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)]
     )
@@ -61,7 +56,7 @@ def test_fullbackup(
     # use resetwal to overwrite it
     pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
     cmd = [pg_resetwal_path, "-D", str(restored_dir_path)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.run_capture(cmd)
 
     # Restore from the backup and find the data we inserted
     port = port_distributor.get_port()
diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py
index c5070ee815..44133f2350 100644
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -2,6 +2,7 @@ import asyncio
 import concurrent.futures
 import random
 
+from fixtures.common_types import TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -10,7 +11,6 @@ from fixtures.neon_fixtures import (
     wait_for_last_flush_lsn,
 )
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import TimelineId
 
 # Test configuration
 #
@@ -67,8 +67,7 @@ async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId):
 #
 def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
     # Disable pitr, because here we want to test branch creation after GC
-    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
     timeline = env.neon_cli.create_branch("test_gc_aggressive", "main")
     endpoint = env.endpoints.create_start("test_gc_aggressive")
 
@@ -94,13 +93,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
 
 #
 def test_gc_index_upload(neon_env_builder: NeonEnvBuilder):
-    # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
-    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
     num_index_uploads = 0
 
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
-
-    env = neon_env_builder.init_start()
+    # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
+    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
     tenant_id = env.initial_tenant
     timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main")
     endpoint = env.endpoints.create_start("test_gc_index_upload")
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 179cc273ec..cf7a1c56ee 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,9 +1,21 @@
+import asyncio
 import os
 import re
+import threading
 import time
+from functools import partial
 
+import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    log_replica_lag,
+    tenant_get_shards,
+    wait_replica_caughtup,
+)
+from fixtures.utils import wait_until
 
 
 # Check for corrupted WAL messages which might otherwise go unnoticed if
@@ -104,19 +116,28 @@ def test_2_replicas_start(neon_simple_env: NeonEnv):
                 wait_replica_caughtup(primary, secondary2)
 
 
-# We had an issue that a standby server made GetPage requests with an
-# old LSN, based on the last-written LSN cache, to avoid waits in the
-# pageserver.  However, requesting a page with a very old LSN, such
-# that the GC horizon has already advanced past it, results in an
-# error from the pageserver:
-# "Bad request: tried to request a page version that was garbage collected"
+# Test two different scenarios related to gc of data needed by hot standby.
 #
-# To avoid that, the compute<-> pageserver protocol was updated so
-# that that the standby now sends two LSNs, the old last-written LSN
-# and the current replay LSN.
+# When pause_apply is False, standby is mostly caught up with the primary.
+# However, in compute <-> pageserver protocol version 1 only one LSN had been
+# sent to the pageserver in page request, and to avoid waits in the pageserver
+# it was last-written LSN cache value. If page hasn't been updated for a long
+# time that resulted in an error from the pageserver: "Bad request: tried to
+# request a page version that was garbage collected". For primary this wasn't a
+# problem because pageserver always bumped LSN to the newest one; for standy
+# that would be incorrect since we might get page fresher then apply LSN. Hence,
+# in protocol version v2 two LSNs were introduced: main request_lsn (apply LSN
+# in case of standby) and not_modified_since which could be used as an
+# optimization to avoid waiting.
 #
 # https://github.com/neondatabase/neon/issues/6211
-def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
+#
+# When pause_apply is True we model standby lagging behind primary (e.g. due to
+# high max_standby_streaming_delay). To prevent pageserver from removing data
+# still needed by the standby apply LSN is propagated in standby -> safekeepers
+# -> broker -> pageserver flow so that pageserver could hold off gc for it.
+@pytest.mark.parametrize("pause_apply", [False, True])
+def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
     tenant_conf = {
         # set PITR interval to be small, so we can do GC
         "pitr_interval": "0 s",
@@ -160,6 +181,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
             # so we still remember the LSNs of the pages.
             s_cur.execute("SELECT clear_buffer_cache()")
 
+            if pause_apply:
+                s_cur.execute("SELECT pg_wal_replay_pause()")
+
             # Do other stuff on the primary, to advance the WAL
             p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g")
 
@@ -176,6 +200,155 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
             # generates use old not_modified_since LSNs, older than
             # the GC cutoff, but new request LSNs. (In protocol
             # version 1 there was only one LSN, and this failed.)
+            log_replica_lag(primary, secondary)
             s_cur.execute("SELECT COUNT(*) FROM test")
+            log_replica_lag(primary, secondary)
             res = s_cur.fetchone()
             assert res[0] == 10000
+
+
+def run_pgbench(connstr: str, pg_bin: PgBin):
+    log.info(f"Start a pgbench workload on pg {connstr}")
+    # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
+    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
+    log.info("pgbench init done")
+    pg_bin.run_capture(["pgbench", "-T60", connstr])
+
+
+# assert that pgbench_accounts and its index are created.
+def pgbench_accounts_initialized(ep):
+    ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass")
+
+
+# Test that hot_standby_feedback works in neon (it is forwarded through
+# safekeepers). That is, ensure queries on standby don't fail during load on
+# primary under the following conditions:
+# - pgbench bombards primary with updates.
+# - On the secondary we run long select of the updated table.
+# - Set small max_standby_streaming_delay: hs feedback should prevent conflicts
+#   so apply doesn't need to wait.
+# - Do agressive vacuum on primary which still shouldn't create conflicts.
+#   Actually this appears to be redundant due to microvacuum existence.
+#
+# Without hs feedback enabled we'd see 'User query might have needed to see row
+# versions that must be removed.' errors.
+def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    env = neon_env_builder.init_start()
+    agressive_vacuum_conf = [
+        "log_autovacuum_min_duration = 0",
+        "autovacuum_naptime = 10s",
+        "autovacuum_vacuum_threshold = 25",
+        "autovacuum_vacuum_scale_factor = 0.1",
+        "autovacuum_vacuum_cost_delay = -1",
+    ]
+    with env.endpoints.create_start(
+        branch_name="main", endpoint_id="primary", config_lines=agressive_vacuum_conf
+    ) as primary:
+        # It would be great to have more strict max_standby_streaming_delay=0s here, but then sometimes it fails with
+        # 'User was holding shared buffer pin for too long.'.
+        with env.endpoints.new_replica_start(
+            origin=primary,
+            endpoint_id="secondary",
+            config_lines=[
+                "max_standby_streaming_delay=2s",
+                "neon.protocol_version=2",
+                "hot_standby_feedback=true",
+            ],
+        ) as secondary:
+            log.info(
+                f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}"
+            )
+            t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin))
+            t.start()
+            # Wait until pgbench_accounts is created + filled on replica *and*
+            # index is created. Otherwise index creation would conflict with
+            # read queries and hs feedback won't save us.
+            wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary))
+
+            # Test should fail if hs feedback is disabled anyway, but cross
+            # check that walproposer sets some xmin.
+            def xmin_is_not_null():
+                slot_xmin = primary.safe_psql_scalar(
+                    "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
+                    log_query=False,
+                )
+                log.info(f"xmin is {slot_xmin}")
+                assert int(slot_xmin) > 0
+
+            wait_until(10, 1.0, xmin_is_not_null)
+            for _ in range(1, 5):
+                # in debug mode takes about 5-7s
+                balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts")
+                log.info(f"balance={balance}")
+                log_replica_lag(primary, secondary)
+            t.join()
+
+        # check xmin is reset when standby is gone
+        def xmin_is_null():
+            slot_xmin = primary.safe_psql_scalar(
+                "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
+                log_query=False,
+            )
+            log.info(f"xmin is {slot_xmin}")
+            assert slot_xmin is None
+
+        wait_until(10, 1.0, xmin_is_null)
+
+
+# Test race condition between WAL replay and backends performing queries
+# https://github.com/neondatabase/neon/issues/7791
+def test_replica_query_race(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    primary_ep = env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+    )
+
+    with primary_ep.connect() as p_con:
+        with p_con.cursor() as p_cur:
+            p_cur.execute("CREATE EXTENSION neon_test_utils")
+            p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter")
+
+    standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby")
+    time.sleep(1)
+
+    # In primary, run a lot of UPDATEs on a single page
+    finished = False
+    writecounter = 1
+
+    async def primary_workload():
+        nonlocal writecounter, finished
+        conn = await primary_ep.connect_async()
+        while writecounter < 10000:
+            writecounter += 1
+            await conn.execute(f"UPDATE test SET counter = {writecounter}")
+        finished = True
+
+    # In standby, at the same time, run queries on it. And repeatedly drop caches
+    async def standby_workload():
+        nonlocal writecounter, finished
+        conn = await standby_ep.connect_async()
+        reads = 0
+        while not finished:
+            readcounter = await conn.fetchval("SELECT counter FROM test")
+
+            # Check that the replica is keeping up with the primary. In local
+            # testing, the lag between primary and standby is much smaller, in
+            # the ballpark of 2-3 counter values. But be generous in case there's
+            # some hiccup.
+            # assert(writecounter - readcounter < 1000)
+            assert readcounter <= writecounter
+            if reads % 100 == 0:
+                log.info(f"read {reads}: counter {readcounter}, last update {writecounter}")
+            reads += 1
+
+            await conn.execute("SELECT clear_buffer_cache()")
+
+    async def both():
+        await asyncio.gather(
+            primary_workload(),
+            standby_workload(),
+        )
+
+    asyncio.run(both())
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 132427ba2d..62229ebfe7 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -7,6 +7,7 @@ from contextlib import closing
 from pathlib import Path
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -20,8 +21,7 @@ from fixtures.pageserver.utils import (
     wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import subprocess_capture
+from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture
 
 
 def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
@@ -248,15 +248,9 @@ def _import(
     path to the backup archive file"""
     log.info(f"start_backup_lsn = {lsn}")
 
-    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
-    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
     # Get a fullbackup from pageserver
-    query = f"fullbackup { env.initial_tenant} {timeline} {lsn}"
     tar_output_file = test_output_dir / "fullbackup.tar"
-    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(env.pageserver, env.initial_tenant, timeline, lsn, tar_output_file)
 
     # Stop the first pageserver instance, erase all its data
     env.endpoints.stop_all()
@@ -305,22 +299,11 @@ def _import(
     assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]
 
     # Take another fullbackup
-    query = f"fullbackup { tenant} {timeline} {lsn}"
     new_tar_output_file = test_output_dir / "fullbackup-new.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        query,
-        "-o",
-        str(new_tar_output_file),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(env.pageserver, tenant, timeline, lsn, new_tar_output_file)
 
     # Check it's the same as the first fullbackup
-    # TODO pageserver should be checking checksum
-    assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file)
+    assert_pageserver_backups_equal(tar_output_file, new_tar_output_file, set())
 
     # Check that gc works
     pageserver_http = env.pageserver.http_client()
diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py
index 5c967fd72e..193149ea03 100644
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -7,7 +7,7 @@ from fixtures.neon_fixtures import (
     flush_ep_to_pageserver,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.types import parse_layer_file_name
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import wait_for_upload
 from fixtures.remote_storage import RemoteStorageKind
 
@@ -159,7 +159,9 @@ def test_basic_eviction(
 def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
-    env = neon_env_builder.init_start()
+    # don't create initial tenant, we'll create it manually with custom config
+    env = neon_env_builder.init_configs()
+    env.start()
 
     tenant_config = {
         "pitr_interval": "1s",  # set to non-zero, so GC actually does something
@@ -270,14 +272,14 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
             resident_physical_size_metric == 0
         ), "ensure that resident_physical_size metric is zero"
         assert resident_physical_size_metric == sum(
-            layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote
+            layer.layer_file_size for layer in info.historic_layers if not layer.remote
         ), "ensure that resident_physical_size metric corresponds to layer map dump"
 
         remote_physical_size_metric = ps_http.get_timeline_metric(
             tenant_id, timeline_id, "pageserver_remote_physical_size"
         )
         assert remote_physical_size_metric == sum(
-            layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
+            layer.layer_file_size for layer in info.historic_layers if layer.remote
         ), "ensure that remote_physical_size metric corresponds to layer map dump"
 
     log.info("before runnning GC, ensure that remote_physical size is zero")
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index f311a8bf2c..18e5111786 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -1,10 +1,11 @@
 import time
 
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
-from fixtures.pageserver.types import (
-    DeltaLayerFileName,
-    ImageLayerFileName,
+from fixtures.pageserver.common_types import (
+    DeltaLayerName,
+    ImageLayerName,
     is_future_layer,
 )
 from fixtures.pageserver.utils import (
@@ -13,7 +14,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_active,
 )
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar, wait_until
 
 
@@ -81,7 +81,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     current = get_index_part()
     assert len(set(current.layer_metadata.keys())) == 1
     layer_file_name = list(current.layer_metadata.keys())[0]
-    assert isinstance(layer_file_name, DeltaLayerFileName)
+    assert isinstance(layer_file_name, DeltaLayerName)
     assert layer_file_name.is_l0(), f"{layer_file_name}"
 
     log.info("force image layer creation in the future by writing some data into in-memory layer")
@@ -146,7 +146,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     future_layers = get_future_layers()
     assert len(future_layers) == 1
     future_layer = future_layers[0]
-    assert isinstance(future_layer, ImageLayerFileName)
+    assert isinstance(future_layer, ImageLayerName)
     assert future_layer.lsn == last_record_lsn
     log.info(
         f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}"
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index 57d3447cae..a657d5a035 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -4,6 +4,7 @@ from random import choice
 from string import ascii_lowercase
 
 import pytest
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     AuxFileStore,
@@ -12,7 +13,6 @@ from fixtures.neon_fixtures import (
     logical_replication_sync,
     wait_for_last_flush_lsn,
 )
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar, wait_until
 
 
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index 225622868d..83d52d4c4c 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -2,10 +2,10 @@ import re
 import time
 from datetime import datetime, timedelta, timezone
 
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py
index cb69f0ef39..ba170cfb4c 100644
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -5,6 +5,7 @@ from typing import cast
 
 import pytest
 import requests
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.neon_fixtures import (
     DEFAULT_BRANCH_NAME,
     NeonEnv,
@@ -13,7 +14,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pg_version import PgVersion, skip_on_postgres
-from fixtures.types import TenantId, TimelineId
 
 
 def helper_compare_timeline_list(
diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py
index e880445c4d..98fb06a0d6 100644
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -3,13 +3,13 @@ import os
 import time
 from pathlib import Path
 
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn
+from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_wal_insert_lsn
 from fixtures.pageserver.utils import (
     wait_for_last_record_lsn,
 )
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar
 
 
@@ -71,22 +71,17 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder):
 def test_import_at_2bil(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
-    pg_distrib_dir: Path,
-    pg_bin,
+    pg_bin: PgBin,
     vanilla_pg,
 ):
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()
     ps_http = env.pageserver.http_client()
 
-    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
-    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
     # Reset the vanilla Postgres instance to somewhat before 2 billion transactions.
     pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
     cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.run_capture(cmd)
 
     vanilla_pg.start()
     vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py
index 391305c58a..f1dd3fb67d 100644
--- a/test_runner/regress/test_old_request_lsn.py
+++ b/test_runner/regress/test_old_request_lsn.py
@@ -1,6 +1,6 @@
+from fixtures.common_types import TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
-from fixtures.types import TimelineId
 from fixtures.utils import print_gc_result, query_scalar
 
 
@@ -16,8 +16,7 @@ from fixtures.utils import print_gc_result, query_scalar
 #
 def test_old_request_lsn(neon_env_builder: NeonEnvBuilder):
     # Disable pitr, because here we want to test branch creation after GC
-    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
     env.neon_cli.create_branch("test_old_request_lsn", "main")
     endpoint = env.endpoints.create_start("test_old_request_lsn")
 
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 6c2556f6a2..b137fb3a5c 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -5,6 +5,7 @@ import time
 from collections import defaultdict
 from typing import Any, DefaultDict, Dict, Tuple
 
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -21,7 +22,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_active,
 )
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar, wait_until
 
 
@@ -540,7 +540,6 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder:
 
     for layer in layers.historic_layers:
         log.info(f"pre-compact:  {layer}")
-        assert layer.layer_file_size is not None, "we must know layer file sizes"
         layer_sizes += layer.layer_file_size
         pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
 
diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py
index 0b36b32552..4af7dcdfc3 100644
--- a/test_runner/regress/test_ondemand_slru_download.py
+++ b/test_runner/regress/test_ondemand_slru_download.py
@@ -1,9 +1,9 @@
 from typing import Optional
 
 import pytest
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index be351db429..abbea59113 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -2,19 +2,22 @@ import subprocess
 from pathlib import Path
 from typing import Optional
 
+import toml
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.neon_fixtures import (
     DEFAULT_BRANCH_NAME,
     NeonEnv,
     NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
 
-def test_pageserver_init_node_id(
-    neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path
-):
+def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path):
+    """
+    NB: The neon_local doesn't use `--init` mode anymore, but our production
+    deployment still does => https://github.com/neondatabase/aws/pull/1322
+    """
     workdir = neon_simple_env.pageserver.workdir
     pageserver_config = workdir / "pageserver.toml"
     pageserver_bin = neon_binpath / "pageserver"
@@ -28,18 +31,38 @@ def test_pageserver_init_node_id(
             stderr=subprocess.PIPE,
         )
 
-    # remove initial config and stop existing pageserver
-    pageserver_config.unlink()
     neon_simple_env.pageserver.stop()
 
-    bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'])
+    with open(neon_simple_env.pageserver.config_toml_path, "r") as f:
+        ps_config = toml.load(f)
+
+    required_config_keys = [
+        "pg_distrib_dir",
+        "listen_pg_addr",
+        "listen_http_addr",
+        "pg_auth_type",
+        "http_auth_type",
+        # TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748
+        # "tenant_config",
+    ]
+    required_config_overrides = [
+        f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys
+    ]
+
+    pageserver_config.unlink()
+
+    bad_init = run_pageserver(["--init", *required_config_overrides])
     assert (
         bad_init.returncode == 1
     ), "pageserver should not be able to init new config without the node id"
     assert 'missing config value "id"' in bad_init.stderr
     assert not pageserver_config.exists(), "config file should not be created after init error"
 
-    good_init_cmd = ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']
+    good_init_cmd = [
+        "--init",
+        f"--config-override=id={ps_config['id']}",
+        *required_config_overrides,
+    ]
     completed_init = run_pageserver(good_init_cmd)
     assert (
         completed_init.returncode == 0
diff --git a/test_runner/regress/test_pageserver_config.py b/test_runner/regress/test_pageserver_config.py
deleted file mode 100644
index c04348b488..0000000000
--- a/test_runner/regress/test_pageserver_config.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import pytest
-from fixtures.neon_fixtures import (
-    NeonEnvBuilder,
-    last_flush_lsn_upload,
-)
-
-
-@pytest.mark.parametrize("kind", ["sync", "async"])
-def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str):
-    neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'"
-    # ensure it starts
-    env = neon_env_builder.init_start()
-    # ensure the metric is set
-    ps_http = env.pageserver.http_client()
-    metrics = ps_http.get_metrics()
-    samples = metrics.query_all("pageserver_wal_redo_process_kind")
-    assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)]
-    # ensure default tenant's config kind matches
-    # => write some data to force-spawn walredo
-    ep = env.endpoints.create_start("main")
-    with ep.connect() as conn:
-        with conn.cursor() as cur:
-            cur.execute("create table foo(bar text)")
-            cur.execute("insert into foo select from generate_series(1, 100)")
-    last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline)
-    ep.stop()
-    ep.start()
-    with ep.connect() as conn:
-        with conn.cursor() as cur:
-            cur.execute("select count(*) from foo")
-            [(count,)] = cur.fetchall()
-            assert count == 100
-
-    status = ps_http.tenant_status(env.initial_tenant)
-    assert status["walredo"]["process"]["kind"] == kind
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index adcf7de8d4..0235cf6d20 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -16,29 +16,27 @@ import time
 from typing import Optional
 
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
-    NeonPageserver,
     PgBin,
     S3Scrubber,
-    flush_ep_to_pageserver,
-    last_flush_lsn_upload,
+    generate_uploads_and_deletions,
 )
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     list_prefix,
     wait_for_last_record_lsn,
     wait_for_upload,
-    wait_for_upload_queue_empty,
 )
 from fixtures.remote_storage import (
     RemoteStorageKind,
 )
-from fixtures.types import TenantId, TimelineId
-from fixtures.utils import print_gc_result, wait_until
+from fixtures.utils import wait_until
 from fixtures.workload import Workload
 
 # A tenant configuration that is convenient for generating uploads and deletions
@@ -59,82 +57,6 @@ TENANT_CONF = {
 }
 
 
-def generate_uploads_and_deletions(
-    env: NeonEnv,
-    *,
-    init: bool = True,
-    tenant_id: Optional[TenantId] = None,
-    timeline_id: Optional[TimelineId] = None,
-    data: Optional[str] = None,
-    pageserver: NeonPageserver,
-):
-    """
-    Using the environment's default tenant + timeline, generate a load pattern
-    that results in some uploads and some deletions to remote storage.
-    """
-
-    if tenant_id is None:
-        tenant_id = env.initial_tenant
-    assert tenant_id is not None
-
-    if timeline_id is None:
-        timeline_id = env.initial_timeline
-    assert timeline_id is not None
-
-    ps_http = pageserver.http_client()
-
-    with env.endpoints.create_start(
-        "main", tenant_id=tenant_id, pageserver_id=pageserver.id
-    ) as endpoint:
-        if init:
-            endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
-            last_flush_lsn_upload(
-                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
-            )
-
-        def churn(data):
-            endpoint.safe_psql_many(
-                [
-                    f"""
-                INSERT INTO foo (id, val)
-                SELECT g, '{data}'
-                FROM generate_series(1, 200) g
-                ON CONFLICT (id) DO UPDATE
-                SET val = EXCLUDED.val
-                """,
-                    # to ensure that GC can actually remove some layers
-                    "VACUUM foo",
-                ]
-            )
-            assert tenant_id is not None
-            assert timeline_id is not None
-            # We are waiting for uploads as well as local flush, in order to avoid leaving the system
-            # in a state where there are "future layers" in remote storage that will generate deletions
-            # after a restart.
-            last_flush_lsn_upload(
-                env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id
-            )
-
-        # Compaction should generate some GC-elegible layers
-        for i in range(0, 2):
-            churn(f"{i if data is None else data}")
-
-        gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0)
-        print_gc_result(gc_result)
-        assert gc_result["layers_removed"] > 0
-
-        # Stop endpoint and flush all data to pageserver, then checkpoint it: this
-        # ensures that the pageserver is in a fully idle state: there will be no more
-        # background ingest, no more uploads pending, and therefore no non-determinism
-        # in subsequent actions like pageserver restarts.
-        final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id)
-        ps_http.timeline_checkpoint(tenant_id, timeline_id)
-        # Finish uploads
-        wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn)
-        # Finish all remote writes (including deletions)
-        wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id)
-
-
 def read_all(
     env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None
 ):
@@ -703,7 +625,6 @@ def test_multi_attach(
     workload.validate(pageservers[2].id)
 
 
-@pytest.mark.skip(reason="To be enabled after release with new local path style")
 def test_upgrade_generationless_local_file_paths(
     neon_env_builder: NeonEnvBuilder,
 ):
@@ -712,39 +633,86 @@ def test_upgrade_generationless_local_file_paths(
     generation numbers: it should accept these layer files, and avoid doing
     a delete/download cycle on them.
     """
-    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(
+        tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}'
+    )
 
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
     workload.write_rows(1000)
 
-    env.pageserver.stop()
+    attached_pageserver = env.get_tenant_pageserver(tenant_id)
+    secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[
+        0
+    ]
+
+    attached_pageserver.http_client().tenant_heatmap_upload(tenant_id)
+    secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
 
     # Rename the local paths to legacy format, to simulate what
-    # we would see when upgrading
-    timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
-    files_renamed = 0
-    for filename in os.listdir(timeline_dir):
-        path = os.path.join(timeline_dir, filename)
-        log.info(f"Found file {path}")
-        if path.endswith("-00000001"):
-            new_path = path[:-9]
-            os.rename(path, new_path)
-            log.info(f"Renamed {path} -> {new_path}")
-            files_renamed += 1
+    # we would see when upgrading.  Do this on both attached and secondary locations, as we will
+    # test the behavior of both.
+    for pageserver in env.pageservers:
+        pageserver.stop()
+        timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
+        files_renamed = 0
+        for filename in os.listdir(timeline_dir):
+            path = os.path.join(timeline_dir, filename)
+            log.info(f"Found file {path}")
+            if path.endswith("-v1-00000001"):
+                new_path = path[:-12]
+                os.rename(path, new_path)
+                log.info(f"Renamed {path} -> {new_path}")
+                files_renamed += 1
 
-    assert files_renamed > 0
+        assert files_renamed > 0
 
-    env.pageserver.start()
+        pageserver.start()
 
     workload.validate()
 
     # Assert that there were no on-demand downloads
     assert (
-        env.pageserver.http_client().get_metric_value(
+        attached_pageserver.http_client().get_metric_value(
             "pageserver_remote_ondemand_downloaded_layers_total"
         )
         == 0
     )
+
+    # Do a secondary download and ensure there were no layer downloads
+    secondary_pageserver.http_client().tenant_secondary_download(tenant_id)
+    assert (
+        secondary_pageserver.http_client().get_metric_value(
+            "pageserver_secondary_download_layer_total"
+        )
+        == 0
+    )
+
+    # Check that when we evict and promote one of the legacy-named layers, everything works as
+    # expected
+    local_layers = list(
+        (
+            parse_layer_file_name(path.name),
+            os.path.join(attached_pageserver.timeline_dir(tenant_id, timeline_id), path),
+        )
+        for path in attached_pageserver.list_layers(tenant_id, timeline_id)
+    )
+    (victim_layer_name, victim_path) = local_layers[0]
+    assert os.path.exists(victim_path)
+
+    attached_pageserver.http_client().evict_layer(
+        tenant_id, timeline_id, victim_layer_name.to_str()
+    )
+    assert not os.path.exists(victim_path)
+
+    attached_pageserver.http_client().download_layer(
+        tenant_id, timeline_id, victim_layer_name.to_str()
+    )
+    # We should download into the same local path we started with
+    assert os.path.exists(victim_path)
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index 42cc28efee..111285b40c 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -2,10 +2,10 @@ import json
 import uuid
 
 from anyio import Path
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 from fixtures.pg_version import PgVersion
-from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
 
 
diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py
index c5dc0f2919..66b6185aaa 100644
--- a/test_runner/regress/test_pageserver_layer_rolling.py
+++ b/test_runner/regress/test_pageserver_layer_rolling.py
@@ -5,6 +5,7 @@ from typing import Optional, Tuple
 
 import psutil
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -13,7 +14,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
 TIMELINE_COUNT = 10
@@ -287,7 +287,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
             total_historic_bytes += sum(
                 layer.layer_file_size
                 for layer in layer_map.historic_layers
-                if layer.layer_file_size is not None and Lsn(layer.lsn_start) > initdb_lsn
+                if Lsn(layer.lsn_start) > initdb_lsn
             )
             total_ephemeral_layers += len(layer_map.in_memory_layers)
 
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index c34ef46d07..b0465f2a96 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -7,6 +7,7 @@ from pathlib import Path
 from queue import SimpleQueue
 from typing import Any, Dict, Set
 
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -17,7 +18,6 @@ from fixtures.remote_storage import (
     RemoteStorageKind,
     remote_storage_to_toml_inline_table,
 )
-from fixtures.types import TenantId, TimelineId
 from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index c40bb962f2..25a3f8521c 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -5,9 +5,10 @@ import time
 from typing import Any, Dict, Optional
 
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
-from fixtures.pageserver.types import parse_layer_file_name
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
     assert_prefix_empty,
     poll_for_remote_storage_iterations,
@@ -15,7 +16,6 @@ from fixtures.pageserver.utils import (
     wait_for_upload_queue_empty,
 )
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage
-from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
 
@@ -575,7 +575,10 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
     tenant_timelines = {}
 
     # This mirrors a constant in `downloader.rs`
-    freshen_interval_secs = 60
+    default_download_period_secs = 60
+
+    # The upload period, which will also be the download once the secondary has seen its first heatmap
+    upload_period_secs = 30
 
     for _i in range(0, tenant_count):
         tenant_id = TenantId.generate()
@@ -587,17 +590,32 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
             placement_policy='{"Attached":1}',
             # Run with a low heatmap period so that we can avoid having to do synthetic API calls
             # to trigger the upload promptly.
-            conf={"heatmap_period": "1s"},
+            conf={"heatmap_period": f"{upload_period_secs}s"},
         )
         env.neon_cli.create_timeline("main2", tenant_id, timeline_b)
 
         tenant_timelines[tenant_id] = [timeline_a, timeline_b]
 
+    def await_log(pageserver, deadline, expression):
+        """
+        Wrapper around assert_log_contains that waits with a deadline rather than timeout
+        """
+        now = time.time()
+        if now > deadline:
+            raise RuntimeError(f"Timed out waiting for {expression}")
+        else:
+            timeout = int(deadline - now) + 1
+            try:
+                wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression))  # type: ignore
+            except:
+                log.error(f"Timed out waiting for '{expression}'")
+                raise
+
     t_start = time.time()
 
     # Wait long enough that the background downloads should happen; we expect all the inital layers
     # of all the initial timelines to show up on the secondary location of each tenant.
-    time.sleep(freshen_interval_secs * 1.5)
+    initial_download_deadline = time.time() + default_download_period_secs * 3
 
     for tenant_id, timelines in tenant_timelines.items():
         attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -605,16 +623,32 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
         # We only have two: the other one must be secondary
         ps_secondary = next(p for p in env.pageservers if p != ps_attached)
 
+        now = time.time()
+        if now > initial_download_deadline:
+            raise RuntimeError("Timed out waiting for initial secondary download")
+        else:
+            for timeline_id in timelines:
+                log.info(
+                    f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}"
+                )
+                await_log(
+                    ps_secondary,
+                    initial_download_deadline,
+                    f".*{timeline_id}.*Wrote timeline_detail.*",
+                )
+
         for timeline_id in timelines:
-            log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}")
+            log.info(
+                f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}"
+            )
             # One or more layers should be present for all timelines
             assert ps_secondary.list_layers(tenant_id, timeline_id)
 
         # Delete the second timeline: this should be reflected later on the secondary
         env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])
 
-    # Wait long enough for the secondary locations to see the deletion
-    time.sleep(freshen_interval_secs * 1.5)
+    # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor
+    deletion_deadline = time.time() + upload_period_secs * 3
 
     for tenant_id, timelines in tenant_timelines.items():
         attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -622,11 +656,24 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
         # We only have two: the other one must be secondary
         ps_secondary = next(p for p in env.pageservers if p != ps_attached)
 
+        expect_del_timeline = timelines[1]
+        log.info(
+            f"Waiting for deletion of timeline {expect_del_timeline} on secondary pageserver {ps_secondary.id}"
+        )
+        await_log(
+            ps_secondary,
+            deletion_deadline,
+            f".*Timeline no longer in heatmap.*{expect_del_timeline}.*",
+        )
+
         # This one was not deleted
         assert ps_secondary.list_layers(tenant_id, timelines[0])
 
         # This one was deleted
-        assert not ps_secondary.list_layers(tenant_id, timelines[1])
+        log.info(
+            f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}"
+        )
+        assert not ps_secondary.list_layers(tenant_id, expect_del_timeline)
 
     t_end = time.time()
 
@@ -640,7 +687,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
 
     download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start)
 
-    expect_download_rate = 1.0 / freshen_interval_secs
+    expect_download_rate = 1.0 / upload_period_secs
     log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min")
 
     assert download_rate < expect_download_rate * 2
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index 2b1b7fff34..885a94a557 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -1,16 +1,25 @@
 #
 # This file runs pg_regress-based tests.
 #
+from __future__ import annotations
+
 from pathlib import Path
-from typing import Optional
+from typing import TYPE_CHECKING, cast
 
 import pytest
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     check_restored_datadir_content,
 )
+from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import s3_storage
 
+if TYPE_CHECKING:
+    from typing import Optional
+
+    from fixtures.neon_fixtures import PgBin
+    from pytest import CaptureFixture
+
 
 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
@@ -19,12 +28,14 @@ def test_pg_regress(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
     build_type: str,
-    pg_bin,
-    capsys,
+    pg_bin: PgBin,
+    capsys: CaptureFixture[str],
     base_dir: Path,
     pg_distrib_dir: Path,
     shard_count: Optional[int],
 ):
+    DBNAME = "regression"
+
     """
     :param shard_count: if None, create an unsharded tenant.  Otherwise create a tenant with this
                         many shards.
@@ -42,7 +53,7 @@ def test_pg_regress(
 
     # Connect to postgres and create a database called "regression".
     endpoint = env.endpoints.create_start("main")
-    endpoint.safe_psql("CREATE DATABASE regression")
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
 
     # Create some local directories for pg_regress to run in.
     runpath = test_output_dir / "regress"
@@ -77,7 +88,67 @@ def test_pg_regress(
     with capsys.disabled():
         pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
 
-        check_restored_datadir_content(test_output_dir, env, endpoint)
+        ignored_files: Optional[list[str]] = None
+
+        # Neon handles unlogged relations in a special manner. During a
+        # basebackup, we ship the init fork as the main fork. This presents a
+        # problem in that the endpoint's data directory and the basebackup will
+        # have differences and will fail the eventual file comparison.
+        #
+        # Unlogged tables were introduced in version 9.1. ALTER TABLE grew
+        # support for setting the persistence of a table in 9.5. The reason that
+        # this doesn't affect versions < 15 (but probably would between 9.1 and
+        # 9.5) is that all the regression tests that deal with unlogged tables
+        # up until that point dropped the unlogged tables or set them to logged
+        # at some point during the test.
+        #
+        # In version 15, Postgres grew support for unlogged sequences, and with
+        # that came a few more regression tests. These tests did not all drop
+        # the unlogged tables/sequences prior to finishing.
+        #
+        # But unlogged sequences came with a bug in that, sequences didn't
+        # inherit the persistence of their "parent" tables if they had one. This
+        # was fixed and backported to 15, thus exacerbating our problem a bit.
+        #
+        # So what we can do is just ignore file differences between the data
+        # directory and basebackup for unlogged relations.
+        results = cast(
+            "list[tuple[str, str]]",
+            endpoint.safe_psql(
+                """
+            SELECT
+                relkind,
+                pg_relation_filepath(
+                    pg_filenode_relation(reltablespace, relfilenode)
+                ) AS unlogged_relation_paths
+            FROM pg_class
+            WHERE relpersistence = 'u'
+            """,
+                dbname=DBNAME,
+            ),
+        )
+
+        unlogged_relation_files: list[str] = []
+        for r in results:
+            unlogged_relation_files.append(r[1])
+            # This is related to the following Postgres commit:
+            #
+            # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
+            # Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+            # Date:   2023-08-23 09:21:31 -0500
+            #
+            # Use the buffer cache when initializing an unlogged index.
+            #
+            # This patch was backpatched to 16. Without it, the LSN in the
+            # page header would be 0/0 in the data directory, which wouldn't
+            # match the LSN generated during the basebackup, thus creating
+            # a difference.
+            if env.pg_version <= PgVersion.V15 and r[0] == "i":
+                unlogged_relation_files.append(f"{r[1]}_init")
+
+        ignored_files = unlogged_relation_files
+
+        check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
 
 
 # Run the PostgreSQL "isolation" tests, in src/test/isolation.
@@ -86,8 +157,8 @@ def test_pg_regress(
 def test_isolation(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
-    pg_bin,
-    capsys,
+    pg_bin: PgBin,
+    capsys: CaptureFixture[str],
     base_dir: Path,
     pg_distrib_dir: Path,
     shard_count: Optional[int],
@@ -142,8 +213,8 @@ def test_isolation(
 def test_sql_regress(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
-    pg_bin,
-    capsys,
+    pg_bin: PgBin,
+    capsys: CaptureFixture[str],
     base_dir: Path,
     pg_distrib_dir: Path,
     shard_count: Optional[int],
diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py
index 539ef3eda7..7e676b5515 100644
--- a/test_runner/regress/test_pitr_gc.py
+++ b/test_runner/regress/test_pitr_gc.py
@@ -1,6 +1,6 @@
+from fixtures.common_types import TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
-from fixtures.types import TimelineId
 from fixtures.utils import print_gc_result, query_scalar
 
 
@@ -10,11 +10,9 @@ from fixtures.utils import print_gc_result, query_scalar
 #
 def test_pitr_gc(neon_env_builder: NeonEnvBuilder):
     # Set pitr interval such that we need to keep the data
-    neon_env_builder.pageserver_config_override = (
-        "tenant_config={pitr_interval = '1 day', gc_horizon = 0}"
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={"pitr_interval": "1 day", "gc_horizon": "0"}
     )
-
-    env = neon_env_builder.init_start()
     endpoint_main = env.endpoints.create_start("main")
 
     main_pg_conn = endpoint_main.connect()
diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py
index 6d1cb9765a..6211446a40 100644
--- a/test_runner/regress/test_proxy_websockets.py
+++ b/test_runner/regress/test_proxy_websockets.py
@@ -135,7 +135,14 @@ async def test_websockets_pipelined(static_proxy: NeonProxy):
         query_message = "SELECT 1".encode("utf-8") + b"\0"
         length2 = (4 + len(query_message)).to_bytes(4, byteorder="big")
         await websocket.send(
-            [length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message]
+            length0
+            + startup_message
+            + b"p"
+            + length1
+            + auth_message
+            + b"Q"
+            + length2
+            + query_message
         )
 
         startup_response = await websocket.recv()
diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py
index e6b3ccd7ec..cc5853b727 100644
--- a/test_runner/regress/test_read_trace.py
+++ b/test_runner/regress/test_read_trace.py
@@ -1,8 +1,8 @@
 from contextlib import closing
 
+from fixtures.common_types import Lsn
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.pageserver.utils import wait_for_last_record_lsn
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index b7c8f36107..ba8b91e84d 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -1,8 +1,8 @@
 import pytest
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv
 from fixtures.pageserver.utils import wait_for_last_record_lsn
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
 
diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py
index ab5c8be256..e21f9bb6f6 100644
--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -10,9 +10,11 @@ from fixtures.neon_fixtures import NeonEnvBuilder
 #
 def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
     # Override default checkpointer settings to run it more often
-    neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
-
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": "1048576",
+        }
+    )
     env.pageserver.is_testing_enabled_or_skip()
 
     # We expect the pageserver to exit, which will cause storage storage controller
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 70c025c225..7f79bf5d5c 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -6,13 +6,14 @@ import time
 from typing import Dict, List, Optional, Tuple
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     wait_for_last_flush_lsn,
 )
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
-from fixtures.pageserver.types import parse_layer_file_name
 from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_for_last_record_lsn,
@@ -25,7 +26,6 @@ from fixtures.remote_storage import (
     RemoteStorageKind,
     available_remote_storages,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
     assert_eq,
     assert_ge,
diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py
index 9227836862..7fdabaaec7 100644
--- a/test_runner/regress/test_s3_restore.py
+++ b/test_runner/regress/test_s3_restore.py
@@ -1,6 +1,7 @@
 import time
 from datetime import datetime, timezone
 
+from fixtures.common_types import Lsn
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
@@ -14,7 +15,6 @@ from fixtures.pageserver.utils import (
     wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
-from fixtures.types import Lsn
 from fixtures.utils import run_pg_bench_small
 
 
diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_s3_scrubber.py
index 018c1637d0..8981000c24 100644
--- a/test_runner/regress/test_s3_scrubber.py
+++ b/test_runner/regress/test_s3_scrubber.py
@@ -3,12 +3,12 @@ import shutil
 from typing import Optional
 
 import pytest
+from fixtures.common_types import TenantShardId
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     S3Scrubber,
 )
 from fixtures.remote_storage import S3Storage, s3_storage
-from fixtures.types import TenantShardId
 from fixtures.workload import Workload
 
 
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index d33803250f..bbb1ad0c6d 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -5,6 +5,7 @@ from typing import Dict, List, Optional, Union
 
 import pytest
 import requests
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.compute_reconfigure import ComputeReconfigure
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
@@ -18,7 +19,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty
 from fixtures.remote_storage import s3_storage
-from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import wait_until
 from fixtures.workload import Workload
 from pytest_httpserver import HTTPServer
@@ -632,7 +632,6 @@ def test_sharding_ingest_layer_sizes(
         historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start)
 
         for layer in historic_layers:
-            assert layer.layer_file_size is not None
             if layer.layer_file_size < expect_layer_size // 2:
                 classification = "Small"
                 small_layer_count += 1
@@ -1326,3 +1325,45 @@ def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder):
 
         # Ensure that post-endpoint-restart modifications are ingested happily by pageserver
         wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
+
+
+def test_top_tenants(neon_env_builder: NeonEnvBuilder):
+    """
+    The top_tenants API is used in shard auto-splitting to find candidates.
+    """
+
+    env = neon_env_builder.init_configs()
+    neon_env_builder.start()
+
+    tenants = []
+    n_tenants = 8
+    for i in range(0, n_tenants):
+        tenant_id = TenantId.generate()
+        timeline_id = TimelineId.generate()
+        env.neon_cli.create_tenant(tenant_id, timeline_id)
+
+        # Write a different amount of data to each tenant
+        w = Workload(env, tenant_id, timeline_id)
+        w.init()
+        w.write_rows(i * 1000)
+        w.stop()
+
+        logical_size = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[
+            "current_logical_size"
+        ]
+        tenants.append((tenant_id, timeline_id, logical_size))
+
+        log.info(f"Created {tenant_id}/{timeline_id} with size {logical_size}")
+
+    # Ask for 1 largest tenant
+    top_1 = env.pageserver.http_client().top_tenants("max_logical_size", 1, 8, 0)
+    assert len(top_1["shards"]) == 1
+    assert top_1["shards"][0]["id"] == str(tenants[-1][0])
+    assert top_1["shards"][0]["max_logical_size"] == tenants[-1][2]
+
+    # Apply a lower bound limit
+    top = env.pageserver.http_client().top_tenants(
+        "max_logical_size", 100, 8, where_gt=tenants[3][2]
+    )
+    assert len(top["shards"]) == n_tenants - 4
+    assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:])
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index bdd356388f..3a9a522f3f 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -6,6 +6,7 @@ from datetime import datetime, timezone
 from typing import Any, Dict, List, Union
 
 import pytest
+from fixtures.common_types import TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -25,7 +26,6 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind, s3_storage
-from fixtures.types import TenantId, TenantShardId, TimelineId
 from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until
 from fixtures.workload import Workload
 from mypy_boto3_s3.type_defs import (
@@ -1284,7 +1284,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
 
     # Apply failpoint
     env.storage_controller.configure_failpoints(
-        ("tenant-update-policy-exclusive-lock", "return(31000)")
+        ("tenant-update-policy-exclusive-lock", "return(35000)")
     )
 
     # This will hold the exclusive for enough time to cause an warning
@@ -1306,7 +1306,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
     env.storage_controller.pageserver_api().timeline_create(
         pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id
     )
-    thread_update_tenant_policy.join(timeout=10)
+    thread_update_tenant_policy.join()
 
     env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for")
     env.storage_controller.assert_log_contains(
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index fc099297e1..2cbb036c0d 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -1,29 +1,36 @@
 import json
 from contextlib import closing
+from typing import Any, Dict
 
 import psycopg2.extras
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.types import Lsn
 from fixtures.utils import wait_until
 
 
 def test_tenant_config(neon_env_builder: NeonEnvBuilder):
     """Test per tenant configuration"""
-    # set some non-default global config
-    neon_env_builder.pageserver_config_override = """
-page_cache_size=444;
-wait_lsn_timeout='111 s';
-[tenant_config]
-checkpoint_distance = 10000
-compaction_target_size = 1048576
-evictions_low_residence_duration_metric_threshold = "2 days"
-eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" }
-"""
+
+    def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]):
+        ps_cfg["page_cache_size"] = 444
+        ps_cfg["wait_lsn_timeout"] = "111 s"
+
+        tenant_config = ps_cfg.setdefault("tenant_config", {})
+        tenant_config["checkpoint_distance"] = 10000
+        tenant_config["compaction_target_size"] = 1048576
+        tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days"
+        tenant_config["eviction_policy"] = {
+            "kind": "LayerAccessThreshold",
+            "period": "20s",
+            "threshold": "23 hours",
+        }
+
+    neon_env_builder.pageserver_config_override = set_some_nondefault_global_config
 
     env = neon_env_builder.init_start()
     # we configure eviction but no remote storage, there might be error lines
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 363c3c88ec..3fc44de6fa 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -5,6 +5,7 @@ import shutil
 from threading import Thread
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -26,7 +27,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_state,
 )
 from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import run_pg_bench_small, wait_until
 from requests.exceptions import ReadTimeout
 
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 0ba0108651..12a4730e69 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -7,6 +7,7 @@ from typing import List, Optional
 
 import asyncpg
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -22,7 +23,6 @@ from fixtures.pageserver.utils import (
 from fixtures.remote_storage import (
     RemoteStorageKind,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
 from prometheus_client.samples import Sample
 
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 68d9d9a660..be289e03d6 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -7,6 +7,7 @@ from pathlib import Path
 from typing import Any, Dict, Optional, Tuple
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver
 from fixtures.pageserver.http import PageserverHttpClient
@@ -20,7 +21,6 @@ from fixtures.remote_storage import (
     LocalFsStorage,
     RemoteStorageKind,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
     query_scalar,
     wait_until,
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index e73eae91f0..d3a228dbeb 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -4,6 +4,7 @@ from pathlib import Path
 from typing import List, Tuple
 
 import pytest
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -19,7 +20,6 @@ from fixtures.pageserver.utils import (
     wait_until_tenant_active,
 )
 from fixtures.pg_version import PgVersion
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until
 
 
@@ -502,9 +502,14 @@ def test_get_tenant_size_with_multiple_branches(
 
     gc_horizon = 128 * 1024
 
-    neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
-
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "compaction_period": "0s",
+            "gc_period": "0s",
+            "pitr_interval": "0sec",
+            "gc_horizon": gc_horizon,
+        }
+    )
 
     # FIXME: we have a race condition between GC and delete timeline. GC might fail with this
     # error. Similar to https://github.com/neondatabase/neon/issues/2671
diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py
index 75e5c2c91c..d08ad3cd2e 100644
--- a/test_runner/regress/test_tenant_tasks.py
+++ b/test_runner/regress/test_tenant_tasks.py
@@ -1,3 +1,4 @@
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.pageserver.utils import (
@@ -5,7 +6,6 @@ from fixtures.pageserver.utils import (
     timeline_delete_wait_completed,
     wait_until_tenant_active,
 )
-from fixtures.types import TenantId, TimelineId
 from fixtures.utils import wait_until
 
 
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 2832304dcc..93e9ad3673 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -9,6 +9,7 @@ from typing import List
 
 import pytest
 import requests
+from fixtures.common_types import Lsn, TenantId
 from fixtures.log_helper import log
 from fixtures.metrics import (
     PAGESERVER_GLOBAL_METRICS,
@@ -24,7 +25,6 @@ from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId
 from fixtures.utils import wait_until
 from prometheus_client.samples import Sample
 
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index a1e96928bf..168876b711 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -11,6 +11,7 @@ import os
 from pathlib import Path
 from typing import List, Tuple
 
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -18,7 +19,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     last_flush_lsn_upload,
 )
-from fixtures.pageserver.types import parse_layer_file_name
+from fixtures.pageserver.common_types import parse_layer_file_name
 from fixtures.pageserver.utils import (
     assert_tenant_state,
     wait_for_last_record_lsn,
@@ -28,7 +29,6 @@ from fixtures.remote_storage import (
     LocalFsStorage,
     RemoteStorageKind,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
 
 
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 0eb1327c9e..da37f469b3 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -6,6 +6,7 @@ import threading
 
 import pytest
 import requests
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
@@ -31,7 +32,6 @@ from fixtures.remote_storage import (
     RemoteStorageKind,
     s3_storage,
 )
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, run_pg_bench_small, wait_until
 from urllib3.util.retry import Retry
 
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index b8a88ca6df..f0b2f7d733 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -1,18 +1,22 @@
+import datetime
 import enum
 from concurrent.futures import ThreadPoolExecutor
 from queue import Empty, Queue
 from threading import Barrier
-from typing import List
+from typing import List, Tuple
 
 import pytest
+from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
+    PgBin,
     wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import HistoricLayerInfo
-from fixtures.pageserver.utils import wait_timeline_detail_404
-from fixtures.types import Lsn, TimelineId
+from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
+from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404
+from fixtures.remote_storage import LocalFsStorage
+from fixtures.utils import assert_pageserver_backups_equal
 
 
 def by_end_lsn(info: HistoricLayerInfo) -> Lsn:
@@ -51,20 +55,25 @@ class Branchpoint(str, enum.Enum):
 SHUTDOWN_ALLOWED_ERRORS = [
     ".*initial size calculation failed: downloading failed, possibly for shutdown",
     ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+    ".*logical_size_calculation_task:panic.*: Sequential get failed with Bad state \\(not active\\).*",
+    ".*Task 'initial size calculation' .* panicked.*",
 ]
 
 
 @pytest.mark.parametrize("branchpoint", Branchpoint.all())
 @pytest.mark.parametrize("restart_after", [True, False])
+@pytest.mark.parametrize("write_to_branch_first", [True, False])
 def test_ancestor_detach_branched_from(
-    neon_env_builder: NeonEnvBuilder, branchpoint: Branchpoint, restart_after: bool
+    test_output_dir,
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+    branchpoint: Branchpoint,
+    restart_after: bool,
+    write_to_branch_first: bool,
 ):
     """
     Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached.
     """
-    # TODO: parametrize; currently unimplemented over at pageserver
-    write_to_branch_first = True
-
     env = neon_env_builder.init_start()
 
     env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
@@ -143,6 +152,13 @@ def test_ancestor_detach_branched_from(
     else:
         branch_layers = set()
 
+    # run fullbackup to make sure there are no off by one errors
+    # take this on the parent
+    fullbackup_before = test_output_dir / "fullbackup-before.tar"
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, env.initial_timeline, branch_at, fullbackup_before
+    )
+
     all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
     assert all_reparented == set()
 
@@ -170,12 +186,23 @@ def test_ancestor_detach_branched_from(
     # but if nothing was copied, then there is no nice rule.
     # there could be a hole in LSNs between copied from the "old main" and the first branch layer.
 
+    # take this on the detached, at same lsn
+    fullbackup_after = test_output_dir / "fullbackup-after.tar"
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, timeline_id, branch_at, fullbackup_after
+    )
+
     client.timeline_delete(env.initial_tenant, env.initial_timeline)
     wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
 
+    # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different
+    # as there is always "PREV_LSN: invalid" for "before"
+    skip_files = {"zenith.signal"}
 
-@pytest.mark.parametrize("restart_after", [True, False])
-def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, restart_after: bool):
+    assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files)
+
+
+def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
     """
     The case from RFC:
 
@@ -204,9 +231,6 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res
     We confirm the end result by being able to delete "old main" after deleting "after".
     """
 
-    # TODO: support not yet implemented for these
-    write_to_branch_first = True
-
     env = neon_env_builder.init_start()
 
     env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
@@ -244,42 +268,57 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res
 
     after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None)
 
-    if write_to_branch_first:
-        with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep:
-            assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 8192
-            with ep.cursor() as cur:
-                cur.execute("UPDATE audit SET starts = starts + 1")
-                assert cur.rowcount == 1
-            wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id)
-
-        client.timeline_checkpoint(env.initial_tenant, timeline_id)
-
     all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
     assert all_reparented == {reparented, same_branchpoint}
 
-    if restart_after:
-        env.pageserver.stop()
-        env.pageserver.start()
-
     env.pageserver.quiesce_tenants()
 
     # checking the ancestor after is much faster than waiting for the endpoint not start
     expected_result = [
         ("main", env.initial_timeline, None, 16384, 1),
         ("after", after, env.initial_timeline, 16384, 1),
-        ("new main", timeline_id, None, 8192, 2),
+        ("new main", timeline_id, None, 8192, 1),
         ("same_branchpoint", same_branchpoint, timeline_id, 8192, 1),
         ("reparented", reparented, timeline_id, 0, 1),
     ]
 
-    for _, timeline_id, expected_ancestor, _, _ in expected_result:
-        details = client.timeline_detail(env.initial_tenant, timeline_id)
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+
+    for _, queried_timeline, expected_ancestor, _, _ in expected_result:
+        details = client.timeline_detail(env.initial_tenant, queried_timeline)
         ancestor_timeline_id = details["ancestor_timeline_id"]
         if expected_ancestor is None:
             assert ancestor_timeline_id is None
         else:
             assert TimelineId(ancestor_timeline_id) == expected_ancestor
 
+        index_part = env.pageserver_remote_storage.index_content(
+            env.initial_tenant, queried_timeline
+        )
+        lineage = index_part["lineage"]
+        assert lineage is not None
+
+        assert lineage.get("reparenting_history_overflown", "false") == "false"
+
+        if queried_timeline == timeline_id:
+            original_ancestor = lineage["original_ancestor"]
+            assert original_ancestor is not None
+            assert original_ancestor[0] == str(env.initial_timeline)
+            assert original_ancestor[1] == str(branchpoint_x)
+
+            # this does not contain Z in the end, so fromisoformat accepts it
+            # it is to be in line with the deletion timestamp.. well, almost.
+            when = original_ancestor[2][:26]
+            when_ts = datetime.datetime.fromisoformat(when)
+            assert when_ts < datetime.datetime.now()
+            assert len(lineage.get("reparenting_history", [])) == 0
+        elif expected_ancestor == timeline_id:
+            assert len(lineage.get("original_ancestor", [])) == 0
+            assert lineage["reparenting_history"] == [str(env.initial_timeline)]
+        else:
+            assert len(lineage.get("original_ancestor", [])) == 0
+            assert len(lineage.get("reparenting_history", [])) == 0
+
     for name, _, _, rows, starts in expected_result:
         with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep:
             assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
@@ -293,14 +332,10 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder, res
     wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
 
 
-@pytest.mark.parametrize("restart_after", [True, False])
-def test_detached_receives_flushes_while_being_detached(
-    neon_env_builder: NeonEnvBuilder, restart_after: bool
-):
+def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder):
     """
     Makes sure that the timeline is able to receive writes through-out the detach process.
     """
-    write_to_branch_first = True
 
     env = neon_env_builder.init_start()
 
@@ -330,12 +365,6 @@ def test_detached_receives_flushes_while_being_detached(
     ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant)
     assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
 
-    if write_to_branch_first:
-        rows += insert_rows(256, ep)
-        wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id)
-        client.timeline_checkpoint(env.initial_tenant, timeline_id)
-        log.info("completed {write_to_branch_first=}")
-
     def small_txs(ep, queue: Queue[str], barrier):
         extra_rows = 0
 
@@ -368,11 +397,6 @@ def test_detached_receives_flushes_while_being_detached(
         reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
         assert len(reparented) == 0
 
-        if restart_after:
-            # ep and row production is kept alive on purpose
-            env.pageserver.stop()
-            env.pageserver.start()
-
         env.pageserver.quiesce_tenants()
 
         queue.put("done")
@@ -394,6 +418,173 @@ def test_detached_receives_flushes_while_being_detached(
     env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
 
 
+def test_compaction_induced_by_detaches_in_history(
+    neon_env_builder: NeonEnvBuilder, test_output_dir, pg_bin: PgBin
+):
+    """
+    Assuming the tree of timelines:
+
+    root
+    |- child1
+       |- ...
+          |- wanted_detached_child
+
+    Each detach can add N more L0 per level, this is actually unbounded because
+    compaction can be arbitrarily delayed (or detach happen right before one
+    starts). If "wanted_detached_child" has already made progress and compacted
+    L1s, we want to make sure "compaction in the history" does not leave the
+    timeline broken.
+    """
+
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            # we want to create layers manually so we don't branch on arbitrary
+            # Lsn, but we also do not want to compact L0 -> L1.
+            "compaction_threshold": "99999",
+            "compaction_period": "0s",
+            # shouldn't matter, but just in case
+            "gc_period": "0s",
+        }
+    )
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+    client = env.pageserver.http_client()
+
+    def delta_layers(timeline_id: TimelineId):
+        # shorthand for more readable formatting
+        return client.layer_map_info(env.initial_tenant, timeline_id).delta_layers()
+
+    with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
+        ep.safe_psql("create table integers (i bigint not null);")
+        ep.safe_psql("insert into integers (i) values (42)")
+        branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+
+        client.timeline_checkpoint(env.initial_tenant, env.initial_timeline)
+
+        assert len(delta_layers(env.initial_timeline)) == 2
+
+    more_good_numbers = range(0, 3)
+
+    branches: List[Tuple[str, TimelineId]] = [("main", env.initial_timeline)]
+
+    for num in more_good_numbers:
+        branch_name = f"br-{len(branches)}"
+        branch_timeline_id = env.neon_cli.create_branch(
+            branch_name,
+            ancestor_branch_name=branches[-1][0],
+            tenant_id=env.initial_tenant,
+            ancestor_start_lsn=branch_lsn,
+        )
+        branches.append((branch_name, branch_timeline_id))
+
+        with env.endpoints.create_start(branches[-1][0], tenant_id=env.initial_tenant) as ep:
+            ep.safe_psql(
+                f"insert into integers (i) select i from generate_series({num}, {num + 100}) as s(i)"
+            )
+            branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id)
+            client.timeline_checkpoint(env.initial_tenant, branch_timeline_id)
+
+        assert len(delta_layers(branch_timeline_id)) == 1
+
+    # now fill in the final, most growing timeline
+
+    branch_name, branch_timeline_id = branches[-1]
+    with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep:
+        ep.safe_psql("insert into integers (i) select i from generate_series(50, 500) s(i)")
+
+        last_suffix = None
+        for suffix in range(0, 4):
+            ep.safe_psql(f"create table other_table_{suffix} as select * from integers")
+            wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id)
+            client.timeline_checkpoint(env.initial_tenant, branch_timeline_id)
+            last_suffix = suffix
+
+        assert last_suffix is not None
+
+        assert len(delta_layers(branch_timeline_id)) == 5
+
+        client.patch_tenant_config_client_side(
+            env.initial_tenant, {"compaction_threshold": 5}, None
+        )
+
+        client.timeline_compact(env.initial_tenant, branch_timeline_id)
+
+        # one more layer
+        ep.safe_psql(f"create table other_table_{last_suffix + 1} as select * from integers")
+        wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id)
+
+        # we need to wait here, because the detaches will do implicit tenant restart,
+        # and we could get unexpected layer counts
+        client.timeline_checkpoint(env.initial_tenant, branch_timeline_id, wait_until_uploaded=True)
+
+    assert len([filter(lambda x: x.l0, delta_layers(branch_timeline_id))]) == 1
+
+    skip_main = branches[1:]
+    branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"]
+
+    # take the fullbackup before and after inheriting the new L0s
+    fullbackup_before = test_output_dir / "fullbackup-before.tar"
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before
+    )
+
+    for _, timeline_id in skip_main:
+        reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
+        assert reparented == set(), "we have no earlier branches at any level"
+
+    post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id)))
+    assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total"
+
+    # checkpoint does compaction, which in turn decides to run, because
+    # there is now in total threshold number L0s even if they are not
+    # adjacent in Lsn space:
+    #
+    # inherited  flushed during this checkpoint
+    #       \\\\ /
+    #       1234X5---> lsn
+    #           |
+    #       l1 layers from "fill in the final, most growing timeline"
+    #
+    # branch_lsn is between 4 and first X.
+    client.timeline_checkpoint(env.initial_tenant, branch_timeline_id)
+
+    post_compact_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id)))
+    assert len(post_compact_l0s) == 1, "only the consecutive inherited L0s should be compacted"
+
+    fullbackup_after = test_output_dir / "fullbackup_after.tar"
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after
+    )
+
+    # we don't need to skip any files, because zenith.signal will be identical
+    assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set())
+
+
+def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    client = env.pageserver.http_client()
+
+    with pytest.raises(PageserverApiException, match=".* no ancestors") as info:
+        client.detach_ancestor(env.initial_tenant, env.initial_timeline)
+    assert info.value.status_code == 409
+
+    first_branch = env.neon_cli.create_branch("first_branch")
+    second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch")
+
+    # funnily enough this does not have a prefix
+    with pytest.raises(PageserverApiException, match="too many ancestors") as info:
+        client.detach_ancestor(env.initial_tenant, second_branch)
+    assert info.value.status_code == 400
+
+    client.tenant_delete(env.initial_tenant)
+    wait_tenant_status_404(client, env.initial_tenant, 10, 1)
+
+    with pytest.raises(PageserverApiException) as e:
+        client.detach_ancestor(env.initial_tenant, first_branch)
+    assert e.value.status_code == 404
+
+
 # TODO:
 # - after starting the operation, tenant is deleted
 # - after starting the operation, pageserver is shutdown, restarted
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 628c484fbd..db5297870e 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -10,6 +10,7 @@ from typing import Optional
 import psycopg2.errors
 import psycopg2.extras
 import pytest
+from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     Endpoint,
@@ -31,7 +32,6 @@ from fixtures.pageserver.utils import (
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import TenantId, TimelineId
 from fixtures.utils import get_timeline_dir_size, wait_until
 
 
@@ -415,11 +415,12 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
 
     # Disable background compaction as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
-    neon_env_builder.pageserver_config_override = (
-        "tenant_config={checkpoint_distance=100000, compaction_period='10m'}"
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": "100000",
+            "compaction_period": "10m",
+        }
     )
-
-    env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction")
@@ -462,9 +463,14 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
 
     # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
     # and before checking the expected size on disk, which makes the assertion failed
-    neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}"
-
-    env = neon_env_builder.init_start()
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "checkpoint_distance": "100000",
+            "compaction_period": "0s",
+            "gc_period": "0s",
+            "pitr_interval": "1s",
+        }
+    )
     pageserver_http = env.pageserver.http_client()
 
     new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc")
@@ -650,7 +656,7 @@ def get_physical_size_values(
     client = env.pageserver.http_client()
 
     res.layer_map_file_size_sum = sum(
-        layer.layer_file_size or 0
+        layer.layer_file_size
         for layer in client.layer_map_info(tenant_id, timeline_id).historic_layers
     )
 
diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py
index 708bf0dfeb..137d28b9fa 100644
--- a/test_runner/regress/test_unlogged.py
+++ b/test_runner/regress/test_unlogged.py
@@ -1,4 +1,5 @@
 from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
+from fixtures.pg_version import PgVersion
 
 
 #
@@ -17,7 +18,8 @@ def test_unlogged(neon_simple_env: NeonEnv):
     cur.execute("CREATE UNLOGGED TABLE iut (id int);")
     # create index to test unlogged index relation as well
     cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);")
-    cur.execute("INSERT INTO iut values (42);")
+    cur.execute("ALTER TABLE iut ADD COLUMN seq int GENERATED ALWAYS AS IDENTITY;")
+    cur.execute("INSERT INTO iut (id) values (42);")
 
     # create another compute to fetch inital empty contents from pageserver
     fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged")
@@ -26,7 +28,15 @@ def test_unlogged(neon_simple_env: NeonEnv):
     conn2 = endpoint2.connect()
     cur2 = conn2.cursor()
     # after restart table should be empty but valid
-    cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)")
+    cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut (id) VALUES ($1)")
     cur2.execute("EXECUTE iut_plan (43);")
     cur2.execute("SELECT * FROM iut")
-    assert cur2.fetchall() == [(43,)]
+    results = cur2.fetchall()
+    # Unlogged sequences were introduced in v15. On <= v14, the sequence created
+    # for the GENERATED ALWAYS AS IDENTITY column is logged, and hence it keeps
+    # the old value (2) on restart. While on v15 and above, it's unlogged, so it
+    # gets reset to 1.
+    if env.pg_version <= PgVersion.V14:
+        assert results == [(43, 2)]
+    else:
+        assert results == [(43, 1)]
diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 06f2a8befd..b549db1af6 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -168,15 +168,16 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
     # The VM page in shared buffer cache, and the same page as reconstructed
     # by the pageserver, should be equal.
     #
-    # Ignore the LSN on the page though (first 8 bytes). If the dirty
-    # VM page is flushed from the cache for some reason, it gets WAL-logged,
-    # which changes the LSN on the page.
+    # Ignore page header (24 bytes) of visibility map.
+    # If the dirty VM page is flushed from the cache for some reason,
+    # it gets WAL-logged, which changes the LSN on the page.
+    # Also in neon SMGR we can replace empty heap page with zero (uninitialized) heap page.
     cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
-    vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex()
+    vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex()
     cur.execute(
         "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )"
     )
-    vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex()
+    vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex()
 
     assert vm_page_at_pageserver == vm_page_in_cache
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 967d133e18..ea66eeff63 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -18,6 +18,7 @@ import psycopg2.errors
 import psycopg2.extras
 import pytest
 from fixtures.broker import NeonBroker
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.metrics import parse_metrics
 from fixtures.neon_fixtures import (
@@ -47,7 +48,6 @@ from fixtures.remote_storage import (
 )
 from fixtures.safekeeper.http import SafekeeperHttpClient
 from fixtures.safekeeper.utils import are_walreceivers_absent
-from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import get_dir_size, query_scalar, start_in_background
 
 
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index dce5616ac6..b5d86de574 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -8,10 +8,10 @@ from typing import List, Optional
 import asyncpg
 import pytest
 import toml
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import getLogger
 from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TimelineId
 
 log = getLogger("root.safekeeper_async")
 
diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py
index 7ac6e6332c..6582b34218 100644
--- a/test_runner/regress/test_wal_receiver.py
+++ b/test_runner/regress/test_wal_receiver.py
@@ -1,8 +1,9 @@
 import time
+from typing import Any, Dict
 
+from fixtures.common_types import Lsn, TenantId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
-from fixtures.types import Lsn, TenantId
 
 
 # Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout.
@@ -42,10 +43,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
 # Kills one of the safekeepers and ensures that only the active ones are printed in the state.
 def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
     # Trigger WAL wait timeout faster
-    neon_env_builder.pageserver_config_override = """
-        wait_lsn_timeout = "1s"
-        tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"}
-    """
+    def customize_pageserver_toml(ps_cfg: Dict[str, Any]):
+        ps_cfg["wait_lsn_timeout"] = "1s"
+        tenant_config = ps_cfg.setdefault("tenant_config", {})
+        tenant_config["walreceiver_connect_timeout"] = "2s"
+        tenant_config["lagging_wal_timeout"] = "2s"
+
+    neon_env_builder.pageserver_config_override = customize_pageserver_toml
+
     # Have notable SK ids to ensure we check logs for their presence, not some other random numbers
     neon_env_builder.safekeepers_id_start = 12345
     neon_env_builder.num_safekeepers = 3
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index 083a259d85..01a1d5cf55 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -6,6 +6,7 @@ from typing import List
 
 import pytest
 import zstandard
+from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -19,7 +20,6 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage
-from fixtures.types import Lsn, TenantId, TimelineId
 from mypy_boto3_s3.type_defs import (
     ObjectTypeDef,
 )
diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
index 13159efbe8..ad37807dba 100644
--- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py
+++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
@@ -2,10 +2,10 @@ import time
 
 import psutil
 import pytest
+from fixtures.common_types import TenantId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.types import TenantId
 
 
 def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_present=False):
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index d6f7e2c604..0d30e28f74 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a
+Subproject commit 0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index f0d6b0ef75..74fb144890 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit f0d6b0ef7581bd78011832e23d8420a7d2c8a83a
+Subproject commit 74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 8ef3c33aa0..3c2b9d576c 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 8ef3c33aa01631e17cb24a122776349fcc777b46
+Subproject commit 3c2b9d576c580e0b5b7108001f959b8c5b42e0a2
diff --git a/vendor/revisions.json b/vendor/revisions.json
index c5b55762fa..2f16f334c5 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-  "v16": ["16.2", "8ef3c33aa01631e17cb24a122776349fcc777b46"],
-  "v15": ["15.6", "f0d6b0ef7581bd78011832e23d8420a7d2c8a83a"],
-  "v14": ["14.11", "d6f7e2c604bfc7cbc4c46bcea0a8e800f4bc778a"]
+  "v16": ["16.3", "3c2b9d576c580e0b5b7108001f959b8c5b42e0a2"],
+  "v15": ["15.7", "74fb144890c4f955db1ef50ee1eeb9d8a6c2f69d"],
+  "v14": ["14.12", "0d30e28f74f49fe6a27a6bd45dcfeb1060656b8f"]
 }
diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml
index 41ca16f16b..0f9d56e466 100644
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -244,6 +244,93 @@ files:
         values: [approximate_working_set_size]
         query: |
           select neon.approximate_working_set_size(false) as approximate_working_set_size;
+
+      - metric_name: current_lsn
+        type: gauge
+        help: 'Current LSN of the database'
+        key_labels:
+        values: [lsn]
+        query: |
+          select
+            case
+              when pg_catalog.pg_is_in_recovery()
+              then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
+              else (pg_current_wal_lsn() - '0/0')::FLOAT8
+            end as lsn;
+
+      - metric_name: replication_delay_bytes
+        type: gauge
+        help: 'Bytes between received and replayed LSN'
+        key_labels:
+        values: [replication_delay_bytes]
+        query: |
+          SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes;
+
+      - metric_name: replication_delay_seconds
+        type: gauge
+        help: 'Time since last LSN was replayed'
+        key_labels:
+        values: [replication_delay_seconds]
+        query: |
+          SELECT
+            CASE
+              WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
+              ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
+            END AS replication_delay_seconds;
+
+      - metric_name: checkpoints_req
+        type: gauge
+        help: 'Number of requested checkpoints'
+        key_labels:
+        values: [checkpoints_req]
+        query: |
+          SELECT checkpoints_req FROM pg_stat_bgwriter;
+
+      - metric_name: checkpoints_timed
+        type: gauge
+        help: 'Number of scheduled checkpoints'
+        key_labels:
+        values: [checkpoints_timed]
+        query: |
+          SELECT checkpoints_timed FROM pg_stat_bgwriter;
+
+      # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
+      # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
+
+      # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
+      - metric_name: logical_slot_restart_lsn
+        type: gauge
+        help: 'restart_lsn of logical slots'
+        key_labels:
+          - slot_name
+        values: [restart_lsn]
+        query: |
+          select slot_name, (restart_lsn - '0/0')::FLOAT8 from pg_replication_slots where slot_type = 'logical';
+
+      - metric_name: retained_wal
+        type: gauge
+        help: 'Retained WAL in inactive replication slots'
+        key_labels:
+          - slot_name
+        values: [retained_wal]
+        query: |
+          SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
+          FROM pg_replication_slots
+          WHERE active = false;
+
+      - metric_name: wal_is_lost
+        type: gauge
+        help: 'Whether or not the replication slot\'s wal_status is lost'
+        key_labels:
+          - slot_name
+        values: [wal_status_is_lost]
+        query: |
+          SELECT slot_name,
+          CASE
+            WHEN wal_status = 'lost' THEN 1
+            ELSE 0
+          END AS wal_status_is_lost
+          FROM pg_replication_slots;
   - filename: neon_collector_autoscaling.yml
     content: |
       collector_name: neon_collector_autoscaling
@@ -295,7 +382,6 @@ files:
         values: [approximate_working_set_size]
         query: |
           select neon.approximate_working_set_size(false) as approximate_working_set_size;
-
 build: |
   # Build cgroup-tools
   #
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index b2da33e44a..f364a6c2e0 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -13,13 +13,14 @@ publish = false
 
 ### BEGIN HAKARI SECTION
 [dependencies]
+ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
 aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] }
 aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] }
 aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] }
 aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] }
 aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] }
-aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio", "test-util"] }
+aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", "test-util"] }
 axum = { version = "0.6", features = ["ws"] }
 base64 = { version = "0.21", features = ["alloc"] }
 base64ct = { version = "1", default-features = false, features = ["std"] }
@@ -51,7 +52,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
@@ -85,6 +86,7 @@ zstd-safe = { version = "7", default-features = false, features = ["arrays", "le
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }
 
 [build-dependencies]
+ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }
@@ -102,7 +104,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
 regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }