Merge pull request #7880 from neondatabase/rc/proxy/2024-05-24

Proxy release 2024-05-24
proxy: fix websocket buffering (#7878)
2026-03-06 18:00:37 +00:00 · 2024-05-24 18:00:18 +01:00 · 2024-05-24 17:56:12 +01:00 · 2024-05-24 14:11:51 +01:00 · 2024-05-24 14:07:58 +01:00 · 2024-05-24 11:45:34 +03:00
85 changed files with 3228 additions and 1251 deletions
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -3,13 +3,13 @@ description: 'Create Branch using API'

 inputs:
  api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
    required: true
  project_id:
-    desctiption: 'ID of the Project to create Branch in'
+    description: 'ID of the Project to create Branch in'
    required: true
  api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build
 outputs:
  dsn:
--- a/.github/actions/neon-branch-delete/action.yml
+++ b/.github/actions/neon-branch-delete/action.yml
@@ -3,16 +3,16 @@ description: 'Delete Branch using API'

 inputs:
  api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
    required: true
  project_id:
-    desctiption: 'ID of the Project which should be deleted'
+    description: 'ID of the Project which should be deleted'
    required: true
  branch_id:
-    desctiption: 'ID of the branch to delete'
+    description: 'ID of the branch to delete'
    required: true
  api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build

 runs:
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'

 inputs:
  api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
    required: true
  region_id:
-    desctiption: 'Region ID, if not set the project will be created in the default region'
+    description: 'Region ID, if not set the project will be created in the default region'
    default: aws-us-east-2
  postgres_version:
-    desctiption: 'Postgres version; default is 15'
-    default: 15
+    description: 'Postgres version; default is 15'
+    default: '15'
  api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build
  provisioner:
-    desctiption: 'k8s-pod or k8s-neonvm'
+    description: 'k8s-pod or k8s-neonvm'
    default: 'k8s-pod'
  compute_units:
-    desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
+    description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
    default: '[1, 1]'

 outputs:
--- a/.github/actions/neon-project-delete/action.yml
+++ b/.github/actions/neon-project-delete/action.yml
@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'

 inputs:
  api_key:
-    desctiption: 'Neon API key'
+    description: 'Neon API key'
    required: true
  project_id:
-    desctiption: 'ID of the Project to delete'
+    description: 'ID of the Project to delete'
    required: true
  api_host:
-    desctiption: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build

 runs:
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -548,7 +548,7 @@ jobs:

  report-benchmarks-failures:
    needs: [ benchmarks, create-test-report ]
-    if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
+    if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
    runs-on: ubuntu-latest

    steps:
@@ -723,9 +723,13 @@ jobs:
    uses: ./.github/workflows/trigger-e2e-tests.yml
    secrets: inherit

-  neon-image:
+  neon-image-arch:
    needs: [ check-permissions, build-build-tools-image, tag ]
-    runs-on: [ self-hosted, gen3, large ]
+    strategy:
+      matrix:
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}

    steps:
      - name: Checkout
@@ -747,12 +751,6 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - uses: docker/login-action@v3
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
      - uses: docker/build-push-action@v5
        with:
          context: .
@@ -764,25 +762,52 @@ jobs:
          push: true
          pull: true
          file: Dockerfile
-          cache-from: type=registry,ref=neondatabase/neon:cache
-          cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
+          cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
+          cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
          tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
-            neondatabase/neon:${{needs.tag.outputs.build-tag}}
+            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

      - name: Remove custom docker config directory
        if: always()
        run: |
          rm -rf .docker-custom

-  compute-node-image:
-    needs: [ check-permissions, build-build-tools-image, tag ]
-    runs-on: [ self-hosted, gen3, large ]
+  neon-image:
+    needs: [ neon-image-arch, tag ]
+    runs-on: ubuntu-latest

+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch image
+        run: |
+          docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
+                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
+                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
+
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - name: Push multi-arch image to ECR
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/neon:${{ needs.tag.outputs.build-tag }}
+
+  compute-node-image-arch:
+    needs: [ check-permissions, build-build-tools-image, tag ]
    strategy:
      fail-fast: false
      matrix:
        version: [ v14, v15, v16 ]
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}

    steps:
      - name: Checkout
@@ -829,15 +854,14 @@ jobs:
          push: true
          pull: true
          file: Dockerfile.compute-node
-          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
-          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
+          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
+          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
          tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-            neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
-        if: ${{ matrix.version == 'v16' }}
+        if: matrix.version == 'v16'
        uses: docker/build-push-action@v5
        with:
          target: compute-tools-image
@@ -851,14 +875,57 @@ jobs:
          pull: true
          file: Dockerfile.compute-node
          tags: |
-            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
-            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

      - name: Remove custom docker config directory
        if: always()
        run: |
          rm -rf .docker-custom

+  compute-node-image:
+    needs: [ compute-node-image-arch, tag ]
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        version: [ v14, v15, v16 ]
+
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch compute-node image
+        run: |
+          docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
+                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
+                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
+
+      - name: Create multi-arch compute-tools image
+        if: matrix.version == 'v16'
+        run: |
+          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
+
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+
+      - name: Push multi-arch compute-tools image to ECR
+        if: matrix.version == 'v16'
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
+
  vm-compute-node-image:
    needs: [ check-permissions, tag, compute-node-image ]
    runs-on: [ self-hosted, gen3, large ]
@@ -866,11 +933,8 @@ jobs:
      fail-fast: false
      matrix:
        version: [ v14, v15, v16 ]
-    defaults:
-      run:
-        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.28.1
+      VM_BUILDER_VERSION: v0.29.3

    steps:
      - name: Checkout
@@ -883,26 +947,48 @@ jobs:
          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
          chmod +x vm-builder

+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
      # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
      # it won't have the proper authentication (written at v0.6.0)
      - name: Pulling compute-node image
        run: |
-          docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}

      - name: Build vm image
        run: |
          ./vm-builder \
            -spec=vm-image-spec.yaml \
-            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
+            -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}

      - name: Pushing vm-compute-node image
        run: |
-          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom

  test-images:
    needs: [ check-permissions, tag, neon-image, compute-node-image ]
-    runs-on: [ self-hosted, gen3, small ]
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: [ x64, arm64 ]
+
+    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}

    steps:
      - name: Checkout
@@ -920,7 +1006,7 @@ jobs:
      - name: Verify image versions
        shell: bash # ensure no set -e for better error messages
        run: |
-          pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
+          pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")

          echo "Pageserver version string: $pageserver_version"

@@ -946,78 +1032,48 @@ jobs:

  promote-images:
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-    # Don't add if-condition here.
-    # The job should always be run because we have dependant other jobs that shouldn't be skipped
+    runs-on: ubuntu-latest
+
+    env:
+      VERSIONS: v14 v15 v16

    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

-      - name: Copy vm-compute-node images to Docker Hub
+      - name: Copy vm-compute-node images to ECR
        run: |
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
-          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16
+          for version in ${VERSIONS}; do
+            docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
+                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
+          done

      - name: Add latest tag to images
-        if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
+        if: github.ref_name == 'main'
        run: |
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
+            docker buildx imagetools create -t $repo/neon:latest \
+                                               $repo/neon:${{ needs.tag.outputs.build-tag }}

-      - name: Push images to production ECR
-        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
-        run: |
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest
+            docker buildx imagetools create -t $repo/compute-tools:latest \
+                                               $repo/compute-tools:${{ needs.tag.outputs.build-tag }}

-      - name: Configure Docker Hub login
-        run: |
-          # ECR Credential Helper & Docker Hub don't work together in config, hence reset
-          echo "" > /github/home/.docker/config.json
-          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
+            for version in ${VERSIONS}; do
+              docker buildx imagetools create -t $repo/compute-node-${version}:latest \
+                                                 $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}

-      - name: Push vm-compute-node to Docker Hub
-        run: |
-          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
-          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
-          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
-
-      - name: Push latest tags to Docker Hub
-        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
-        run: |
-          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
-
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
+              docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
+                                                 $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
+            done
+          done

  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -53,7 +53,7 @@ jobs:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
        cat << EOF > body.md
-          ## Release ${RELEASE_DATE}
+          ## Storage & Compute release ${RELEASE_DATE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1471,26 +1471,21 @@ dependencies = [

 [[package]]
 name = "crossbeam-deque"
-version = "0.8.3"
+version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
+checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
 dependencies = [
- "cfg-if",
 "crossbeam-epoch",
 "crossbeam-utils",
 ]

 [[package]]
 name = "crossbeam-epoch"
-version = "0.9.14"
+version = "0.9.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
 dependencies = [
- "autocfg",
- "cfg-if",
 "crossbeam-utils",
- "memoffset 0.8.0",
- "scopeguard",
 ]

 [[package]]
@@ -3961,9 +3956,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"

 [[package]]
 name = "pbkdf2"
-version = "0.12.1"
+version = "0.12.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
+checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
 dependencies = [
 "digest",
 "hmac",
@@ -4386,6 +4381,7 @@ dependencies = [
 name = "proxy"
 version = "0.1.0"
 dependencies = [
+ "ahash",
 "anyhow",
 "async-compression",
 "async-trait",
@@ -4402,6 +4398,7 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
+ "crossbeam-deque",
 "dashmap",
 "env_logger",
 "fallible-iterator",
@@ -7473,6 +7470,7 @@ dependencies = [
 name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
+ "ahash",
 "anyhow",
 "aws-config",
 "aws-runtime",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,6 +41,7 @@ license = "Apache-2.0"

 ## All dependency versions, used in the project
 [workspace.dependencies]
+ahash = "0.8"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
@@ -74,6 +75,7 @@ clap = { version = "4.0", features = ["derive"] }
 comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
+crossbeam-deque = "0.8.5"
 crossbeam-utils = "0.8.5"
 dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -243,12 +243,15 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 COPY patches/pgvector.patch /pgvector.patch

+# By default, the pgvector Makefile uses `-march=native`. We don't want that,
+# because we build the images on different machines than where we run them.
+# Pass OPTFLAGS="" to remove it.
 RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    patch -p1 < /pgvector.patch && \
-    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control

 #########################################################################################
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -1,4 +1,4 @@
-ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+ARG REPOSITORY=neondatabase
 ARG COMPUTE_IMAGE=compute-node-v14
 ARG TAG=latest

--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -8,8 +8,6 @@
 # Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
 # to verify custom image builds (e.g pre-published ones).

-# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
-
 set -eux -o pipefail

 SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,7 +9,6 @@ use std::{
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
-    str::FromStr,
    sync::atomic::AtomicUsize,
    time::{Duration, SystemTime},
 };
@@ -306,7 +305,7 @@ pub struct TenantConfig {
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
    // defer parsing compaction_algorithm, like eviction_policy
-    pub compaction_algorithm: Option<CompactionAlgorithm>,
+    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,
@@ -334,14 +333,28 @@ pub struct TenantConfig {
 /// Unset -> V1
 ///       -> V2
 ///       -> CrossValidation -> V2
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
 pub enum AuxFilePolicy {
    /// V1 aux file policy: store everything in AUX_FILE_KEY
+    #[strum(ascii_case_insensitive)]
    V1,
    /// V2 aux file policy: store in the AUX_FILE keyspace
+    #[strum(ascii_case_insensitive)]
    V2,
    /// Cross validation runs both formats on the write path and does validation
    /// on the read path.
+    #[strum(ascii_case_insensitive)]
    CrossValidation,
 }

@@ -407,23 +420,6 @@ impl AuxFilePolicy {
    }
 }

-impl FromStr for AuxFilePolicy {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let s = s.to_lowercase();
-        if s == "v1" {
-            Ok(Self::V1)
-        } else if s == "v2" {
-            Ok(Self::V2)
-        } else if s == "crossvalidation" || s == "cross_validation" {
-            Ok(Self::CrossValidation)
-        } else {
-            anyhow::bail!("cannot parse {} to aux file policy", s)
-        }
-    }
-}
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(tag = "kind")]
 pub enum EvictionPolicy {
@@ -442,13 +438,28 @@ impl EvictionPolicy {
    }
 }

-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "kind")]
+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
 pub enum CompactionAlgorithm {
    Legacy,
    Tiered,
 }

+#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
+pub struct CompactionAlgorithmSettings {
+    pub kind: CompactionAlgorithm,
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
    #[serde(with = "humantime_serde")]
@@ -1405,6 +1416,7 @@ impl PagestreamBeMessage {
 #[cfg(test)]
 mod tests {
    use serde_json::json;
+    use std::str::FromStr;

    use super::*;

@@ -1667,4 +1679,14 @@ mod tests {
            AuxFilePolicy::V2
        ));
    }
+
+    #[test]
+    fn test_aux_parse() {
+        assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
+        assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
+        assert_eq!(
+            AuxFilePolicy::from_str("cross-validation").unwrap(),
+            AuxFilePolicy::CrossValidation
+        );
+    }
 }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -178,6 +178,13 @@ impl PgConnectionConfig {
    }
 }

+impl fmt::Display for PgConnectionConfig {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // The password is intentionally hidden and not part of this display string.
+        write!(f, "postgresql://{}:{}", self.host, self.port)
+    }
+}
+
 impl fmt::Debug for PgConnectionConfig {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -29,7 +29,6 @@ use http_types::{StatusCode, Url};
 use tokio_util::sync::CancellationToken;
 use tracing::debug;

-use crate::RemoteStorageActivity;
 use crate::{
    error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
    DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
@@ -526,10 +525,6 @@ impl RemoteStorage for AzureBlobStorage {
        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
        Err(TimeTravelError::Unimplemented)
    }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        self.concurrency_limiter.activity()
-    }
 }

 pin_project_lite::pin_project! {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -263,17 +263,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
        done_if_after: SystemTime,
        cancel: &CancellationToken,
    ) -> Result<(), TimeTravelError>;
-
-    /// Query how busy we currently are: may be used by callers which wish to politely
-    /// back off if there are already a lot of operations underway.
-    fn activity(&self) -> RemoteStorageActivity;
-}
-
-pub struct RemoteStorageActivity {
-    pub read_available: usize,
-    pub read_total: usize,
-    pub write_available: usize,
-    pub write_total: usize,
 }

 /// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -455,15 +444,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
            }
        }
    }
-
-    pub fn activity(&self) -> RemoteStorageActivity {
-        match self {
-            Self::LocalFs(s) => s.activity(),
-            Self::AwsS3(s) => s.activity(),
-            Self::AzureBlob(s) => s.activity(),
-            Self::Unreliable(s) => s.activity(),
-        }
-    }
 }

 impl GenericRemoteStorage {
@@ -794,9 +774,6 @@ struct ConcurrencyLimiter {
    // The helps to ensure we don't exceed the thresholds.
    write: Arc<Semaphore>,
    read: Arc<Semaphore>,
-
-    write_total: usize,
-    read_total: usize,
 }

 impl ConcurrencyLimiter {
@@ -825,21 +802,10 @@ impl ConcurrencyLimiter {
        Arc::clone(self.for_kind(kind)).acquire_owned().await
    }

-    fn activity(&self) -> RemoteStorageActivity {
-        RemoteStorageActivity {
-            read_available: self.read.available_permits(),
-            read_total: self.read_total,
-            write_available: self.write.available_permits(),
-            write_total: self.write_total,
-        }
-    }
-
    fn new(limit: usize) -> ConcurrencyLimiter {
        Self {
            read: Arc::new(Semaphore::new(limit)),
            write: Arc::new(Semaphore::new(limit)),
-            read_total: limit,
-            write_total: limit,
        }
    }
 }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use utils::crashsafe::path_with_suffix_extension;

 use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
-    TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 use super::{RemoteStorage, StorageMetadata};
@@ -605,16 +605,6 @@ impl RemoteStorage for LocalFs {
    ) -> Result<(), TimeTravelError> {
        Err(TimeTravelError::Unimplemented)
    }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
-        RemoteStorageActivity {
-            read_available: 16,
-            read_total: 16,
-            write_available: 16,
-            write_total: 16,
-        }
-    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -47,8 +47,8 @@ use utils::backoff;
 use super::StorageMetadata;
 use crate::{
    error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
-    Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
-    TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
+    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;
@@ -975,10 +975,6 @@ impl RemoteStorage for S3Bucket {
        }
        Ok(())
    }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        self.concurrency_limiter.activity()
-    }
 }

 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;

 use crate::{
    Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
-    RemoteStorageActivity, StorageMetadata, TimeTravelError,
+    StorageMetadata, TimeTravelError,
 };

 pub struct UnreliableWrapper {
@@ -213,8 +213,4 @@ impl RemoteStorage for UnreliableWrapper {
            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
            .await
    }
-
-    fn activity(&self) -> RemoteStorageActivity {
-        self.inner.activity()
-    }
 }
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -135,7 +135,8 @@ impl Gate {
        let started_at = std::time::Instant::now();
        let mut do_close = std::pin::pin!(self.do_close());

-        let nag_after = Duration::from_secs(1);
+        // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
+        let nag_after = Duration::from_millis(100);

        let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
            return;
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -380,8 +380,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
    }
    fn file_size(&self) -> u64 {
        match self {
-            MockLayer::Delta(this) => this.file_size(),
-            MockLayer::Image(this) => this.file_size(),
+            MockLayer::Delta(this) => this.file_size,
+            MockLayer::Image(this) => this.file_size,
        }
    }
    fn short_id(&self) -> String {
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;

 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
 use utils::lsn::Lsn;
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
            #[derive(serde::Serialize)]
            struct Output<'a> {
-                layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
+                layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
                disk_consistent_lsn: Lsn,
                timeline_metadata: &'a TimelineMetadata,
            }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -534,7 +534,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    });
                }
                EvictionLayer::Secondary(layer) => {
-                    let file_size = layer.metadata.file_size();
+                    let file_size = layer.metadata.file_size;

                    js.spawn(async move {
                        layer
@@ -641,7 +641,7 @@ impl EvictionLayer {
    pub(crate) fn get_file_size(&self) -> u64 {
        match self {
            Self::Attached(l) => l.layer_desc().file_size,
-            Self::Secondary(sl) => sl.metadata.file_size(),
+            Self::Secondary(sl) => sl.metadata.file_size,
        }
    }
 }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -612,6 +612,80 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"

+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+
+    put:
+      description: |
+        Detach a timeline from its ancestor and reparent all of the ancestor's timelines with lower `ancestor_lsn`.
+        Current implementation might not be retryable across failure cases, but will be enhanced in future.
+        Detaching should be expected to be an expensive operation. Timeouts should be retried.
+      responses:
+        "200":
+          description: |
+            The timeline has been detached from its ancestor (now or earlier), and at least the returned timelines have been reparented.
+            If any timelines were deleted after reparenting, they might not be on this list.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/AncestorDetached"
+
+        "400":
+          description: |
+            Number of early checks meaning the timeline cannot be detached now:
+              - the ancestor of timeline has an ancestor: not supported, see RFC
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
+        "404":
+          description: Tenant or timeline not found.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+
+        "409":
+          description: |
+            The timeline can never be detached:
+              - timeline has no ancestor, implying that the timeline has never had an ancestor
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+
+        "500":
+          description: |
+            Transient error, for example, pageserver shutdown happened while
+            processing the request but we were unable to distinguish that. Must
+            be retried.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
+        "503":
+          description: |
+            Temporarily unavailable, please retry. Possible reasons:
+              - another timeline detach for the same tenant is underway, please retry later
+              - detected shutdown error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
+
  /v1/tenant/:
    get:
      description: Get tenants list
@@ -1077,6 +1151,19 @@ components:
          format: int64
          description: How many bytes of layer content were in the latest layer heatmap

+    AncestorDetached:
+      type: object
+      required:
+        - reparented_timelines
+      properties:
+        reparented_timelines:
+          type: array
+          description: Set of reparented timeline ids
+          items:
+            type: string
+            format: hex
+            description: TimelineId
+

    Error:
      type: object
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -16,6 +16,7 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
+use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::IngestAuxFilesRequest;
 use pageserver_api::models::ListAuxFilesRequest;
 use pageserver_api::models::LocationConfig;
@@ -74,6 +75,7 @@ use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
+use crate::tenant::GetTimelineError;
 use crate::tenant::SpawnMode;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::{config::PageServerConf, tenant::mgr};
@@ -279,6 +281,13 @@ impl From<GetTenantError> for ApiError {
    }
 }

+impl From<GetTimelineError> for ApiError {
+    fn from(gte: GetTimelineError) -> Self {
+        // Rationale: tenant is activated only after eligible timelines activate
+        ApiError::NotFound(gte.into())
+    }
+}
+
 impl From<GetActiveTenantError> for ApiError {
    fn from(e: GetActiveTenantError) -> ApiError {
        match e {
@@ -386,7 +395,7 @@ async fn build_timeline_info_common(
        let guard = timeline.last_received_wal.lock().unwrap();
        if let Some(info) = guard.as_ref() {
            (
-                Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
+                Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
                Some(info.last_received_msg_lsn),
                Some(info.last_received_msg_ts),
            )
@@ -643,9 +652,7 @@ async fn timeline_preserve_initdb_handler(
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;

-        let timeline = tenant
-            .get_timeline(timeline_id, false)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
+        let timeline = tenant.get_timeline(timeline_id, false)?;

        timeline
            .preserve_initdb_archive()
@@ -687,9 +694,7 @@ async fn timeline_detail_handler(

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-        let timeline = tenant
-            .get_timeline(timeline_id, false)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
+        let timeline = tenant.get_timeline(timeline_id, false)?;

        let timeline_info = build_timeline_info(
            &timeline,
@@ -1901,14 +1906,11 @@ async fn timeline_detach_ancestor_handler(
        let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
        let ctx = &ctx;

-        let timeline = tenant
-            .get_timeline(timeline_id, true)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
+        let timeline = tenant.get_timeline(timeline_id, true)?;

        let (_guard, prepared) = timeline
            .prepare_to_detach_from_ancestor(&tenant, options, ctx)
-            .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            .await?;

        let res = state
            .tenant_manager
@@ -2042,9 +2044,7 @@ async fn active_timeline_of_active_tenant(

    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-    tenant
-        .get_timeline(timeline_id, true)
-        .map_err(|e| ApiError::NotFound(e.into()))
+    Ok(tenant.get_timeline(timeline_id, true)?)
 }

 async fn always_panic_handler(
@@ -2308,6 +2308,31 @@ async fn post_tracing_event_handler(
    json_response(StatusCode::OK, ())
 }

+async fn force_aux_policy_switch_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
+    let policy: AuxFilePolicy = json_request(&mut r).await?;
+
+    let state = get_state(&r);
+
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+    timeline
+        .do_switch_aux_policy(policy)
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn put_io_engine_handler(
    mut r: Request<Body>,
    _cancel: CancellationToken,
@@ -2385,19 +2410,9 @@ async fn list_aux_files(
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;

-    let process = || async move {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        let files = timeline.list_aux_files(body.lsn, &ctx).await?;
-        Ok::<_, anyhow::Error>(files)
-    };
-
-    match process().await {
-        Ok(st) => json_response(StatusCode::OK, st),
-        Err(err) => json_response(
-            StatusCode::INTERNAL_SERVER_ERROR,
-            ApiError::InternalServerError(err).to_string(),
-        ),
-    }
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let files = timeline.list_aux_files(body.lsn, &ctx).await?;
+    json_response(StatusCode::OK, files)
 }

 async fn ingest_aux_files(
@@ -2415,24 +2430,22 @@ async fn ingest_aux_files(
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;

-    let process = || async move {
-        let mut modification = timeline.begin_modification(Lsn(
-            timeline.get_last_record_lsn().0 + 8
-        ) /* advance LSN by 8 */);
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        for (fname, content) in body.aux_files {
-            modification
-                .put_file(&fname, content.as_bytes(), &ctx)
-                .await?;
-        }
-        modification.commit(&ctx).await?;
-        Ok::<_, anyhow::Error>(())
-    };
-
-    match process().await {
-        Ok(st) => json_response(StatusCode::OK, st),
-        Err(err) => Err(ApiError::InternalServerError(err)),
+    let mut modification = timeline.begin_modification(
+        Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
+    );
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    for (fname, content) in body.aux_files {
+        modification
+            .put_file(&fname, content.as_bytes(), &ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;
    }
+    modification
+        .commit(&ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
 }

 /// Report on the largest tenants on this pageserver, for the storage controller to identify
@@ -2815,6 +2828,10 @@ pub fn make_router(
            |r| api_handler(r, timeline_collect_keyspace),
        )
        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
+            |r| api_handler(r, force_aux_policy_switch_handler),
+        )
        .get("/v1/utilization", |r| api_handler(r, get_utilization))
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -260,6 +260,8 @@ async fn page_service_conn_main(
    socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
    let socket = std::pin::pin!(socket);

+    fail::fail_point!("ps::connection-start::pre-login");
+
    // XXX: pgbackend.run() should take the connection_ctx,
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
@@ -603,6 +605,7 @@ impl PageServerHandler {
            };

            trace!("query: {copy_data_bytes:?}");
+            fail::fail_point!("ps::handle-pagerequest-message");

            // Trace request if needed
            if let Some(t) = tracer.as_mut() {
@@ -617,6 +620,7 @@ impl PageServerHandler {

            let (response, span) = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::exists");
                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
@@ -626,6 +630,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::Nblocks(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::nblocks");
                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
@@ -635,6 +640,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetPage(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::getpage");
                    // shard_id is filled in by the handler
                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
                    (
@@ -645,6 +651,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::DbSize(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::dbsize");
                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
                    (
                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
@@ -654,6 +661,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetSlruSegment(req) => {
+                    fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
@@ -1505,6 +1513,7 @@ where
        _pgb: &mut PostgresBackend<IO>,
        _sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
+        fail::fail_point!("ps::connection-start::startup-packet");
        Ok(())
    }

@@ -1519,6 +1528,8 @@ where
            Err(QueryError::SimulatedConnectionError)
        });

+        fail::fail_point!("ps::connection-start::process-query");
+
        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
        let parts = query_string.split_whitespace().collect::<Vec<_>>();
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1480,11 +1480,24 @@ impl<'a> DatadirModification<'a> {
            // Allowed switch path:
            // * no aux files -> v1/v2/cross-validation
            // * cross-validation->v2
+
+            let current_policy = if current_policy.is_none() {
+                // This path will only be hit once per tenant: we will decide the final policy in this code block.
+                // The next call to `put_file` will always have `last_aux_file_policy != None`.
+                let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+                let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
+                if aux_files_key_v1.is_empty() {
+                    None
+                } else {
+                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
+                    Some(AuxFilePolicy::V1)
+                }
+            } else {
+                current_policy
+            };
+
            if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
-                self.tline.last_aux_file_policy.store(Some(switch_policy));
-                self.tline
-                    .remote_client
-                    .schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?;
+                self.tline.do_switch_aux_policy(switch_policy)?;
                info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
                switch_policy
            } else {
@@ -1775,6 +1788,12 @@ impl<'a> DatadirModification<'a> {
        self.tline.get(key, lsn, ctx).await
    }

+    /// Only used during unit tests, force putting a key into the modification.
+    #[cfg(test)]
+    pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
+        self.put(key, val);
+    }
+
    fn put(&mut self, key: Key, val: Value) {
        let values = self.pending_updates.entry(key).or_default();
        // Replace the previous value if it exists at the same lsn
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3964,18 +3964,20 @@ mod tests {

    use super::*;
    use crate::keyspace::KeySpaceAccum;
+    use crate::pgdatadir_mapping::AuxFilesDirectory;
    use crate::repository::{Key, Value};
    use crate::tenant::harness::*;
    use crate::tenant::timeline::CompactFlags;
    use crate::DEFAULT_PG_VERSION;
    use bytes::{Bytes, BytesMut};
    use hex_literal::hex;
-    use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
+    use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
    use pageserver_api::keyspace::KeySpace;
-    use pageserver_api::models::CompactionAlgorithm;
+    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
    use rand::{thread_rng, Rng};
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
+    use utils::bin_ser::BeSer;

    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -5167,7 +5169,9 @@ mod tests {
        compaction_algorithm: CompactionAlgorithm,
    ) -> anyhow::Result<()> {
        let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
+            kind: compaction_algorithm,
+        };
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5524,7 +5528,9 @@ mod tests {
        compaction_algorithm: CompactionAlgorithm,
    ) -> anyhow::Result<()> {
        let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
+            kind: compaction_algorithm,
+        };
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
@@ -5997,6 +6003,130 @@ mod tests {
        );
    }

+    #[tokio::test]
+    async fn aux_file_policy_force_switch() {
+        let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap();
+        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
+        let (tenant, ctx) = harness.load().await;
+
+        let mut lsn = Lsn(0x08);
+
+        let tline: Arc<Timeline> = tenant
+            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            None,
+            "no aux file is written so it should be unset"
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test1", b"first", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap();
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V2),
+            "dirty index_part.json reflected state is yet to be updated"
+        );
+
+        // lose all data from v1
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(files.get("pg_logical/mappings/test1"), None);
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test2", b"second", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        // read data ingested in v2
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test2"),
+            Some(&bytes::Bytes::from_static(b"second"))
+        );
+        // lose all data from v1
+        assert_eq!(files.get("pg_logical/mappings/test1"), None);
+    }
+
+    #[tokio::test]
+    async fn aux_file_policy_auto_detect() {
+        let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
+        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to v2 mode
+        let (tenant, ctx) = harness.load().await;
+
+        let mut lsn = Lsn(0x08);
+
+        let tline: Arc<Timeline> = tenant
+            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            None,
+            "no aux file is written so it should be unset"
+        );
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+                files: vec![(
+                    "test_file".to_string(),
+                    Bytes::copy_from_slice(b"test_file"),
+                )]
+                .into_iter()
+                .collect(),
+            })
+            .unwrap();
+            modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        {
+            lsn += 8;
+            let mut modification = tline.begin_modification(lsn);
+            modification
+                .put_file("pg_logical/mappings/test1", b"first", &ctx)
+                .await
+                .unwrap();
+            modification.commit(&ctx).await.unwrap();
+        }
+
+        assert_eq!(
+            tline.last_aux_file_policy.load(),
+            Some(AuxFilePolicy::V1),
+            "keep using v1 because there are aux files writting with v1"
+        );
+
+        // we can still read the auxfile v1
+        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
+        assert_eq!(
+            files.get("pg_logical/mappings/test1"),
+            Some(&bytes::Bytes::from_static(b"first"))
+        );
+        assert_eq!(
+            files.get("test_file"),
+            Some(&bytes::Bytes::from_static(b"test_file"))
+        );
+    }
+
    #[tokio::test]
    async fn test_metadata_image_creation() -> anyhow::Result<()> {
        let harness = TenantHarness::create("test_metadata_image_creation")?;
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -238,10 +238,13 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                        io_buf,
                        Err(Error::new(
                            ErrorKind::Other,
-                            format!("blob too large ({} bytes)", len),
+                            format!("blob too large ({len} bytes)"),
                        )),
                    );
                }
+                if len > 0x0fff_ffff {
+                    tracing::warn!("writing blob above future limit ({len} bytes)");
+                }
                let mut len_buf = (len as u32).to_be_bytes();
                len_buf[0] |= 0x80;
                io_buf.extend_from_slice(&len_buf[..]);
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,6 +11,7 @@
 use anyhow::bail;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::CompactionAlgorithm;
+use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
@@ -320,7 +321,7 @@ pub struct TenantConf {
    pub compaction_period: Duration,
    // Level0 delta layer threshold for compaction.
    pub compaction_threshold: usize,
-    pub compaction_algorithm: CompactionAlgorithm,
+    pub compaction_algorithm: CompactionAlgorithmSettings,
    // Determines how much history is retained, to allow
    // branching and read replicas at an older point in time.
    // The unit is #of bytes of WAL.
@@ -406,7 +407,7 @@ pub struct TenantConfOpt {

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
-    pub compaction_algorithm: Option<CompactionAlgorithm>,
+    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
@@ -497,7 +498,9 @@ impl TenantConfOpt {
                .unwrap_or(global_conf.compaction_threshold),
            compaction_algorithm: self
                .compaction_algorithm
-                .unwrap_or(global_conf.compaction_algorithm),
+                .as_ref()
+                .unwrap_or(&global_conf.compaction_algorithm)
+                .clone(),
            gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
            gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
            image_creation_threshold: self
@@ -550,7 +553,9 @@ impl Default for TenantConf {
            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
                .expect("cannot parse default compaction period"),
            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
-            compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
+            compaction_algorithm: CompactionAlgorithmSettings {
+                kind: DEFAULT_COMPACTION_ALGORITHM,
+            },
            gc_horizon: DEFAULT_GC_HORIZON,
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                .expect("cannot parse default gc period"),
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1192,7 +1192,7 @@ impl RemoteTimelineClient {
                    &self.storage_impl,
                    uploaded.local_path(),
                    &remote_path,
-                    uploaded.metadata().file_size(),
+                    uploaded.metadata().file_size,
                    cancel,
                )
                .await
@@ -1573,7 +1573,7 @@ impl RemoteTimelineClient {
                        &self.storage_impl,
                        local_path,
                        &remote_path,
-                        layer_metadata.file_size(),
+                        layer_metadata.file_size,
                        &self.cancel,
                    )
                    .measure_remote_op(
@@ -1768,7 +1768,7 @@ impl RemoteTimelineClient {
            UploadOp::UploadLayer(_, m) => (
                RemoteOpFileKind::Layer,
                RemoteOpKind::Upload,
-                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
            ),
            UploadOp::UploadMetadata(_, _) => (
                RemoteOpFileKind::Index,
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -84,7 +84,7 @@ pub async fn download_layer_file<'a>(
    )
    .await?;

-    let expected = layer_metadata.file_size();
+    let expected = layer_metadata.file_size;
    if expected != bytes_amount {
        return Err(DownloadError::Other(anyhow!(
            "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -17,46 +17,6 @@ use pageserver_api::shard::ShardIndex;

 use utils::lsn::Lsn;

-/// Metadata gathered for each of the layer files.
-///
-/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
-/// might have less or more metadata depending if upgrading or rolling back an upgrade.
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
-//#[cfg_attr(test, derive(Default))]
-pub struct LayerFileMetadata {
-    file_size: u64,
-
-    pub(crate) generation: Generation,
-
-    pub(crate) shard: ShardIndex,
-}
-
-impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
-    fn from(other: &IndexLayerMetadata) -> Self {
-        LayerFileMetadata {
-            file_size: other.file_size,
-            generation: other.generation,
-            shard: other.shard,
-        }
-    }
-}
-
-impl LayerFileMetadata {
-    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
-        LayerFileMetadata {
-            file_size,
-            generation,
-            shard,
-        }
-    }
-
-    pub fn file_size(&self) -> u64 {
-        self.file_size
-    }
-}
-
-// TODO seems like another part of the remote storage file format
-// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
 /// In-memory representation of an `index_part.json` file
 ///
 /// Contains the data about all files in the timeline, present remotely and its metadata.
@@ -77,7 +37,7 @@ pub struct IndexPart {
    ///
    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
    /// that latest version stores.
-    pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,
+    pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,

    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
    // It's duplicated for convenience when reading the serialized structure, but is
@@ -127,10 +87,7 @@ impl IndexPart {
        lineage: Lineage,
        last_aux_file_policy: Option<AuxFilePolicy>,
    ) -> Self {
-        let layer_metadata = layers_and_metadata
-            .iter()
-            .map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
-            .collect();
+        let layer_metadata = layers_and_metadata.clone();

        Self {
            version: Self::LATEST_VERSION,
@@ -194,9 +151,12 @@ impl From<&UploadQueueInitialized> for IndexPart {
    }
 }

-/// Serialized form of [`LayerFileMetadata`].
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
-pub struct IndexLayerMetadata {
+/// Metadata gathered for each of the layer files.
+///
+/// Fields have to be `Option`s because remote [`IndexPart`]s can be from different versions, which
+/// might have less or more metadata depending on whether we are upgrading or rolling back an upgrade.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+pub struct LayerFileMetadata {
    pub file_size: u64,

    #[serde(default = "Generation::none")]
@@ -208,12 +168,12 @@ pub struct IndexLayerMetadata {
    pub shard: ShardIndex,
 }

-impl From<&LayerFileMetadata> for IndexLayerMetadata {
-    fn from(other: &LayerFileMetadata) -> Self {
-        IndexLayerMetadata {
-            file_size: other.file_size,
-            generation: other.generation,
-            shard: other.shard,
+impl LayerFileMetadata {
+    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
+        LayerFileMetadata {
+            file_size,
+            generation,
+            shard,
        }
    }
 }
@@ -307,12 +267,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -349,12 +309,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -392,12 +352,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 2,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -480,12 +440,12 @@ mod tests {
        let expected = IndexPart {
            version: 4,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -522,12 +482,12 @@ mod tests {
        let expected = IndexPart {
            version: 5,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata {
                    file_size: 23289856,
                    generation: Generation::new(1),
                    shard: ShardIndex::unsharded(),
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
                    file_size: 1015808,
                    generation: Generation::new(1),
                    shard: ShardIndex::unsharded(),
@@ -569,12 +529,12 @@ mod tests {
        let expected = IndexPart {
            version: 6,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -45,10 +45,10 @@ use crate::tenant::{

 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
-use futures::{Future, StreamExt};
+use futures::Future;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};
+use remote_storage::{DownloadError, Etag, GenericRemoteStorage};

 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
@@ -67,12 +67,6 @@ use super::{
 /// download, if the uploader populated it.
 const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);

-/// Range of concurrency we may use when downloading layers within a timeline.  This is independent
-/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
-/// `PageServerConf::secondary_download_concurrency`
-const MAX_LAYER_CONCURRENCY: usize = 16;
-const MIN_LAYER_CONCURRENCY: usize = 1;
-
 pub(super) async fn downloader_task(
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
@@ -81,19 +75,18 @@ pub(super) async fn downloader_task(
    cancel: CancellationToken,
    root_ctx: RequestContext,
 ) {
-    // How many tenants' secondary download operations we will run concurrently
-    let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;
+    let concurrency = tenant_manager.get_conf().secondary_download_concurrency;

    let generator = SecondaryDownloader {
        tenant_manager,
        remote_storage,
        root_ctx,
    };
-    let mut scheduler = Scheduler::new(generator, tenant_concurrency);
+    let mut scheduler = Scheduler::new(generator, concurrency);

    scheduler
        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("secondary_downloads"))
+        .instrument(info_span!("secondary_download_scheduler"))
        .await
 }

@@ -414,7 +407,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                    tracing::warn!("Insufficient space while downloading.  Will retry later.");
                }
                Err(UpdateError::Cancelled) => {
-                    tracing::debug!("Shut down while downloading");
+                    tracing::info!("Shut down while downloading");
                },
                Err(UpdateError::Deserialize(e)) => {
                    tracing::error!("Corrupt content while downloading tenant: {e}");
@@ -716,7 +709,7 @@ impl<'a> TenantDownloader<'a> {
                let mut layer_byte_count: u64 = timeline_state
                    .on_disk_layers
                    .values()
-                    .map(|l| l.metadata.file_size())
+                    .map(|l| l.metadata.file_size)
                    .sum();

                // Remove on-disk layers that are no longer present in heatmap
@@ -727,7 +720,7 @@ impl<'a> TenantDownloader<'a> {
                        .get(layer_file_name)
                        .unwrap()
                        .metadata
-                        .file_size();
+                        .file_size;

                    let local_path = local_layer_path(
                        self.conf,
@@ -848,8 +841,6 @@ impl<'a> TenantDownloader<'a> {

        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());

-        let mut download_futs = Vec::new();
-
        // Download heatmap layers that are not present on local disk, or update their
        // access time if they are already present.
        for layer in timeline.layers {
@@ -886,9 +877,7 @@ impl<'a> TenantDownloader<'a> {
                    }
                }

-                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
-                    || on_disk.access_time != layer.access_time
-                {
+                if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
                    // We already have this layer on disk.  Update its access time.
                    tracing::debug!(
                        "Access time updated for layer {}: {} -> {}",
@@ -924,31 +913,14 @@ impl<'a> TenantDownloader<'a> {
                }
            }

-            download_futs.push(self.download_layer(
-                tenant_shard_id,
-                &timeline.timeline_id,
-                layer,
-                ctx,
-            ));
-        }
-
-        // Break up layer downloads into chunks, so that for each chunk we can re-check how much
-        // concurrency to use based on activity level of remote storage.
-        while !download_futs.is_empty() {
-            let chunk =
-                download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
-
-            let concurrency = Self::layer_concurrency(self.remote_storage.activity());
-
-            let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
-            let mut result_stream = std::pin::pin!(result_stream);
-            while let Some(result) = result_stream.next().await {
-                match result {
-                    Err(e) => return Err(e),
-                    Ok(None) => {
-                        // No error, but we didn't download the layer.  Don't mark it touched
-                    }
-                    Ok(Some(layer)) => touched.push(layer),
+            match self
+                .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
+                .await?
+            {
+                Some(layer) => touched.push(layer),
+                None => {
+                    // Not an error but we didn't download it: remote layer is missing.  Don't add it to the list of
+                    // things to consider touched.
                }
            }
        }
@@ -979,7 +951,7 @@ impl<'a> TenantDownloader<'a> {
                            tenant_shard_id,
                            &timeline.timeline_id,
                            t.name,
-                            LayerFileMetadata::from(&t.metadata),
+                            t.metadata.clone(),
                            t.access_time,
                            local_path,
                        ));
@@ -1013,13 +985,18 @@ impl<'a> TenantDownloader<'a> {
        );

        // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
+        tracing::info!(
+            "Starting download of layer {}, size {}",
+            layer.name,
+            layer.metadata.file_size
+        );
        let downloaded_bytes = match download_layer_file(
            self.conf,
            self.remote_storage,
            *tenant_shard_id,
            *timeline_id,
            &layer.name,
-            &LayerFileMetadata::from(&layer.metadata),
+            &layer.metadata,
            &local_path,
            &self.secondary_state.cancel,
            ctx,
@@ -1078,19 +1055,6 @@ impl<'a> TenantDownloader<'a> {

        Ok(Some(layer))
    }
-
-    /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage
-    fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
-        // When less than 75% of units are available, use minimum concurrency.  Else, do a linear mapping
-        // of our concurrency range to the units available within the remaining 25%.
-        let clamp_at = (activity.read_total * 3) / 4;
-        if activity.read_available > clamp_at {
-            (MAX_LAYER_CONCURRENCY * (activity.read_available - clamp_at))
-                / (activity.read_total - clamp_at)
-        } else {
-            MIN_LAYER_CONCURRENCY
-        }
-    }
 }

 /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -1180,7 +1144,7 @@ async fn init_timeline_state(
                                    tenant_shard_id,
                                    &heatmap.timeline_id,
                                    name,
-                                    LayerFileMetadata::from(&remote_meta.metadata),
+                                    remote_meta.metadata.clone(),
                                    remote_meta.access_time,
                                    file_path,
                                ),
@@ -1214,58 +1178,3 @@ async fn init_timeline_state(

    detail
 }
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn layer_concurrency() {
-        // Totally idle
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 16,
-                read_total: 16,
-                write_available: 16,
-                write_total: 16
-            }),
-            MAX_LAYER_CONCURRENCY
-        );
-
-        // Totally busy
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 0,
-                read_total: 16,
-
-                write_available: 16,
-                write_total: 16
-            }),
-            MIN_LAYER_CONCURRENCY
-        );
-
-        // Edge of the range at which we interpolate
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 12,
-                read_total: 16,
-
-                write_available: 16,
-                write_total: 16
-            }),
-            MIN_LAYER_CONCURRENCY
-        );
-
-        // Midpoint of the range in which we interpolate
-        assert_eq!(
-            TenantDownloader::layer_concurrency(RemoteStorageActivity {
-                read_available: 14,
-                read_total: 16,
-
-                write_available: 16,
-                write_total: 16
-            }),
-            MAX_LAYER_CONCURRENCY / 2
-        );
-    }
-}
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,6 +1,6 @@
 use std::time::SystemTime;

-use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};
+use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};

 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -38,7 +38,7 @@ pub(crate) struct HeatMapTimeline {
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
    pub(super) name: LayerName,
-    pub(super) metadata: IndexLayerMetadata,
+    pub(super) metadata: LayerFileMetadata,

    #[serde_as(as = "TimestampSeconds<i64>")]
    pub(super) access_time: SystemTime,
@@ -49,7 +49,7 @@ pub(crate) struct HeatMapLayer {
 impl HeatMapLayer {
    pub(crate) fn new(
        name: LayerName,
-        metadata: IndexLayerMetadata,
+        metadata: LayerFileMetadata,
        access_time: SystemTime,
    ) -> Self {
        Self {
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(

    scheduler
        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("heatmap_uploader"))
+        .instrument(info_span!("heatmap_upload_scheduler"))
        .await
 }

--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -179,6 +179,13 @@ where
            // Schedule some work, if concurrency limit permits it
            self.spawn_pending();

+            // This message is printed every scheduling iteration as proof of liveness when looking at logs
+            tracing::info!(
+                "Status: {} tasks running, {} pending",
+                self.running.len(),
+                self.pending.len()
+            );
+
            // Between scheduling iterations, we will:
            //  - Drain any complete tasks and spawn pending tasks
            //  - Handle incoming administrative commands
@@ -258,7 +265,11 @@ where

        self.tasks.spawn(fut);

-        self.running.insert(tenant_shard_id, in_progress);
+        let replaced = self.running.insert(tenant_shard_id, in_progress);
+        debug_assert!(replaced.is_none());
+        if replaced.is_some() {
+            tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
+        }
    }

    /// For all pending tenants that are elegible for execution, spawn their task.
@@ -268,7 +279,9 @@ where
        while !self.pending.is_empty() && self.running.len() < self.concurrency {
            // unwrap: loop condition includes !is_empty()
            let pending = self.pending.pop_front().unwrap();
-            self.do_spawn(pending);
+            if !self.running.contains_key(pending.get_tenant_shard_id()) {
+                self.do_spawn(pending);
+            }
        }
    }

@@ -321,7 +334,8 @@ where

        let tenant_shard_id = job.get_tenant_shard_id();
        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            tracing::info!("Command already running, waiting for it");
+            tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                           "Command already running, waiting for it");
            barrier
        } else {
            let running = self.spawn_now(job);
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -47,7 +47,7 @@ use hex;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::File;
@@ -473,7 +473,7 @@ impl ImageLayerInner {
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        let reads = self
-            .plan_reads(keyspace, ctx)
+            .plan_reads(keyspace, None, ctx)
            .await
            .map_err(GetVectoredError::Other)?;

@@ -485,9 +485,15 @@ impl ImageLayerInner {
        Ok(())
    }

+    /// Traverse the layer's index to build read operations on the overlap of the input keyspace
+    /// and the keys in this layer.
+    ///
+    /// If shard_identity is provided, it will be used to filter keys down to those stored on
+    /// this shard.
    async fn plan_reads(
        &self,
        keyspace: KeySpace,
+        shard_identity: Option<&ShardIdentity>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<VectoredRead>> {
        let mut planner = VectoredReadPlanner::new(
@@ -507,7 +513,6 @@ impl ImageLayerInner {

        for range in keyspace.ranges.iter() {
            let mut range_end_handled = false;
-
            let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
            range.start.write_to_byte_slice(&mut search_key);

@@ -520,12 +525,22 @@ impl ImageLayerInner {
                let key = Key::from_slice(&raw_key[..KEY_SIZE]);
                assert!(key >= range.start);

+                let flag = if let Some(shard_identity) = shard_identity {
+                    if shard_identity.is_key_disposable(&key) {
+                        BlobFlag::Ignore
+                    } else {
+                        BlobFlag::None
+                    }
+                } else {
+                    BlobFlag::None
+                };
+
                if key >= range.end {
                    planner.handle_range_end(offset);
                    range_end_handled = true;
                    break;
                } else {
-                    planner.handle(key, self.lsn, offset, BlobFlag::None);
+                    planner.handle(key, self.lsn, offset, flag);
                }
            }

@@ -538,6 +553,50 @@ impl ImageLayerInner {
        Ok(planner.finish())
    }

+    /// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
+    /// then execute vectored GET operations, passing the results of all read keys into the writer.
+    pub(super) async fn filter(
+        &self,
+        shard_identity: &ShardIdentity,
+        writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        // Fragment the range into the regions owned by this ShardIdentity
+        let plan = self
+            .plan_reads(
+                KeySpace {
+                    // If asked for the total key space, plan_reads will give us all the keys in the layer
+                    ranges: vec![Key::MIN..Key::MAX],
+                },
+                Some(shard_identity),
+                ctx,
+            )
+            .await?;
+
+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        let mut key_count = 0;
+        for read in plan.into_iter() {
+            let buf_size = read.size();
+
+            let buf = BytesMut::with_capacity(buf_size);
+            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
+
+            let frozen_buf = blobs_buf.buf.freeze();
+
+            for meta in blobs_buf.blobs.iter() {
+                let img_buf = frozen_buf.slice(meta.start..meta.end);
+
+                key_count += 1;
+                writer
+                    .put_image(meta.meta.key, img_buf, ctx)
+                    .await
+                    .context(format!("Storing key {}", meta.meta.key))?;
+            }
+        }
+
+        Ok(key_count)
+    }
+
    async fn do_reads_and_update_state(
        &self,
        reads: Vec<VectoredRead>,
@@ -650,7 +709,7 @@ impl ImageLayerWriterInner {
                lsn,
            },
        );
-        info!("new image layer {path}");
+        trace!("creating image layer {}", path);
        let mut file = {
            VirtualFile::open_with_options(
                &path,
@@ -770,7 +829,7 @@ impl ImageLayerWriterInner {
        // FIXME: why not carry the virtualfile here, it supports renaming?
        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        trace!("created image layer {}", layer.local_path());
+        info!("created image layer {}", layer.local_path());

        Ok(layer)
    }
@@ -855,3 +914,136 @@ impl Drop for ImageLayerWriter {
        }
    }
 }
+
+#[cfg(test)]
+mod test {
+    use bytes::Bytes;
+    use pageserver_api::{
+        key::Key,
+        shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
+    };
+    use utils::{id::TimelineId, lsn::Lsn};
+
+    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
+
+    use super::ImageLayerWriter;
+
+    #[tokio::test]
+    async fn image_layer_rewrite() {
+        let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
+        let (tenant, ctx) = harness.load().await;
+
+        // The LSN at which we will create an image layer to filter
+        let lsn = Lsn(0xdeadbeef0000);
+
+        let timeline_id = TimelineId::generate();
+        let timeline = tenant
+            .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
+        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
+        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let range = input_start..input_end;
+
+        // Build an image layer to filter
+        let resident = {
+            let mut writer = ImageLayerWriter::new(
+                harness.conf,
+                timeline_id,
+                harness.tenant_shard_id,
+                &range,
+                lsn,
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+            let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
+            let mut key = range.start;
+            while key < range.end {
+                writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
+
+                key = key.next();
+            }
+            writer.finish(&timeline, &ctx).await.unwrap()
+        };
+        let original_size = resident.metadata().file_size;
+
+        // Filter for various shards: this exercises cases like values at start of key range, end of key
+        // range, middle of key range.
+        for shard_number in 0..4 {
+            let mut filtered_writer = ImageLayerWriter::new(
+                harness.conf,
+                timeline_id,
+                harness.tenant_shard_id,
+                &range,
+                lsn,
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+            // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
+            // to exercise filter()
+            let shard_identity = ShardIdentity::new(
+                ShardNumber(shard_number),
+                ShardCount::new(4),
+                ShardStripeSize(0x8000),
+            )
+            .unwrap();
+
+            let wrote_keys = resident
+                .filter(&shard_identity, &mut filtered_writer, &ctx)
+                .await
+                .unwrap();
+            let replacement = if wrote_keys > 0 {
+                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
+            } else {
+                None
+            };
+
+            // This exact size and those below will need updating as/when the layer encoding changes, but
+            // should be deterministic for a given version of the format, as we used no randomness generating the input.
+            assert_eq!(original_size, 1597440);
+
+            match shard_number {
+                0 => {
+                    // We should have written out just one stripe for our shard identity
+                    assert_eq!(wrote_keys, 0x8000);
+                    let replacement = replacement.unwrap();
+
+                    // We should have dropped some of the data
+                    assert!(replacement.metadata().file_size < original_size);
+                    assert!(replacement.metadata().file_size > 0);
+
+                    // Assert that we dropped ~3/4 of the data.
+                    assert_eq!(replacement.metadata().file_size, 417792);
+                }
+                1 => {
+                    // Shard 1 has no keys in our input range
+                    assert_eq!(wrote_keys, 0x0);
+                    assert!(replacement.is_none());
+                }
+                2 => {
+                    // Shard 2 has one stripe in the input range
+                    assert_eq!(wrote_keys, 0x8000);
+                    let replacement = replacement.unwrap();
+                    assert!(replacement.metadata().file_size < original_size);
+                    assert!(replacement.metadata().file_size > 0);
+                    assert_eq!(replacement.metadata().file_size, 417792);
+                }
+                3 => {
+                    // Shard 3 has two stripes in the input range
+                    assert_eq!(wrote_keys, 0x10000);
+                    let replacement = replacement.unwrap();
+                    assert!(replacement.metadata().file_size < original_size);
+                    assert!(replacement.metadata().file_size > 0);
+                    assert_eq!(replacement.metadata().file_size, 811008);
+                }
+                _ => unreachable!(),
+            }
+        }
+    }
+}
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::{
    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
 };
-use pageserver_api::shard::{ShardIndex, TenantShardId};
+use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
@@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime};
 use tracing::Instrument;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
-use utils::sync::heavier_once_cell;
+use utils::sync::{gate, heavier_once_cell};

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

 use super::delta_layer::{self, DeltaEntry};
-use super::image_layer;
+use super::image_layer::{self};
 use super::{
-    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
-    ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
+    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
+    PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };

 use utils::generation::Generation;
@@ -161,7 +161,7 @@ impl Layer {
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size(),
+            metadata.file_size,
        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
@@ -194,7 +194,7 @@ impl Layer {
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size(),
+            metadata.file_size,
        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
@@ -227,7 +227,7 @@ impl Layer {

        timeline
            .metrics
-            .resident_physical_size_add(metadata.file_size());
+            .resident_physical_size_add(metadata.file_size);

        ResidentLayer { downloaded, owner }
    }
@@ -1333,7 +1333,7 @@ impl LayerInner {

        is_good_to_continue(&rx.borrow_and_update())?;

-        let Ok(_gate) = timeline.gate.enter() else {
+        let Ok(gate) = timeline.gate.enter() else {
            return Err(EvictionCancelled::TimelineGone);
        };

@@ -1421,7 +1421,7 @@ impl LayerInner {
        Self::spawn_blocking(move || {
            let _span = span.entered();

-            let res = self.evict_blocking(&timeline, &permit);
+            let res = self.evict_blocking(&timeline, &gate, &permit);

            let waiters = self.inner.initializer_count();

@@ -1447,6 +1447,7 @@ impl LayerInner {
    fn evict_blocking(
        &self,
        timeline: &Timeline,
+        _gate: &gate::GateGuard,
        _permit: &heavier_once_cell::InitPermit,
    ) -> Result<(), EvictionCancelled> {
        // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
@@ -1801,16 +1802,15 @@ impl ResidentLayer {
        use LayerKind::*;

        let owner = &self.owner.0;
-
        match self.downloaded.get(owner, ctx).await? {
            Delta(ref d) => {
+                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
+                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
+                // while it's being held.
                owner
                    .access_stats
                    .record_access(LayerAccessKind::KeyIter, ctx);

-                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
-                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
-                // while it's being held.
                delta_layer::DeltaLayerInner::load_keys(d, ctx)
                    .await
                    .with_context(|| format!("Layer index is corrupted for {self}"))
@@ -1819,6 +1819,23 @@ impl ResidentLayer {
        }
    }

+    /// Read all the keys in this layer which match the ShardIdentity, and write them all to
+    /// the provided writer.  Return the number of keys written.
+    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
+    pub(crate) async fn filter<'a>(
+        &'a self,
+        shard_identity: &ShardIdentity,
+        writer: &mut ImageLayerWriter,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<usize> {
+        use LayerKind::*;
+
+        match self.downloaded.get(&self.owner.0, ctx).await? {
+            Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")),
+            Image(i) => i.filter(shard_identity, writer, ctx).await,
+        }
+    }
+
    /// Returns the amount of keys and values written to the writer.
    pub(crate) async fn copy_delta_prefix(
        &self,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,9 +23,9 @@ use pageserver_api::{
    },
    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
    models::{
-        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
-        DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
-        LsnLease, TimelineState,
+        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
+        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
+        InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
    },
    reltag::BlockNumber,
    shard::{ShardIdentity, ShardNumber, TenantShardId},
@@ -41,6 +41,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
+    fs_ext,
    sync::gate::{Gate, GateGuard},
    vec_map::VecMap,
 };
@@ -60,6 +61,7 @@ use std::{
    ops::ControlFlow,
 };

+use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
 use crate::{
    aux_file::AuxFileSizeEstimator,
    tenant::{
@@ -88,9 +90,6 @@ use crate::{
    metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
-use crate::{
-    pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata,
-};
 use crate::{
    pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
    virtual_file::{MaybeFatalIo, VirtualFile},
@@ -1424,7 +1423,7 @@ impl Timeline {
        let layer_map = guard.layer_map();
        let mut size = 0;
        for l in layer_map.iter_historic_layers() {
-            size += l.file_size();
+            size += l.file_size;
        }
        size
    }
@@ -1700,7 +1699,7 @@ impl Timeline {
            return Ok(());
        }

-        match self.get_compaction_algorithm() {
+        match self.get_compaction_algorithm_settings().kind {
            CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
            CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
        }
@@ -2096,12 +2095,14 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

-    fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
+    fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
        let tenant_conf = &self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .compaction_algorithm
-            .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
+            .as_ref()
+            .unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm)
+            .clone()
    }

    fn get_eviction_policy(&self) -> EvictionPolicy {
@@ -2452,8 +2453,6 @@ impl Timeline {
        let span = tracing::Span::current();

        // Copy to move into the task we're about to spawn
-        let generation = self.generation;
-        let shard = self.get_shard_index();
        let this = self.myself.upgrade().expect("&self method holds the arc");

        let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
@@ -2467,11 +2466,14 @@ impl Timeline {

                for discovered in discovered {
                    let (name, kind) = match discovered {
-                        Discovered::Layer(layer_file_name, local_path, file_size) => {
-                            discovered_layers.push((layer_file_name, local_path, file_size));
+                        Discovered::Layer(layer_file_name, local_metadata) => {
+                            discovered_layers.push((layer_file_name, local_metadata));
                            continue;
                        }
-                        Discovered::IgnoredBackup => {
+                        Discovered::IgnoredBackup(path) => {
+                            std::fs::remove_file(path)
+                                .or_else(fs_ext::ignore_not_found)
+                                .fatal_err("Removing .old file");
                            continue;
                        }
                        Discovered::Unknown(file_name) => {
@@ -2497,13 +2499,8 @@ impl Timeline {
                    );
                }

-                let decided = init::reconcile(
-                    discovered_layers,
-                    index_part.as_ref(),
-                    disk_consistent_lsn,
-                    generation,
-                    shard,
-                );
+                let decided =
+                    init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);

                let mut loaded_layers = Vec::new();
                let mut needs_cleanup = Vec::new();
@@ -2511,21 +2508,6 @@ impl Timeline {

                for (name, decision) in decided {
                    let decision = match decision {
-                        Ok(UseRemote { local, remote }) => {
-                            // Remote is authoritative, but we may still choose to retain
-                            // the local file if the contents appear to match
-                            if local.metadata.file_size() == remote.file_size() {
-                                // Use the local file, but take the remote metadata so that we pick up
-                                // the correct generation.
-                                UseLocal(LocalLayerFileMetadata {
-                                    metadata: remote,
-                                    local_path: local.local_path,
-                                })
-                            } else {
-                                init::cleanup_local_file_for_remote(&local, &remote)?;
-                                UseRemote { local, remote }
-                            }
-                        }
                        Ok(decision) => decision,
                        Err(DismissedLayer::Future { local }) => {
                            if let Some(local) = local {
@@ -2543,6 +2525,11 @@ impl Timeline {
                            // this file never existed remotely, we will have to do rework
                            continue;
                        }
+                        Err(DismissedLayer::BadMetadata(local)) => {
+                            init::cleanup_local_file_for_remote(&local)?;
+                            // this file never existed remotely, we will have to do rework
+                            continue;
+                        }
                    };

                    match &name {
@@ -2553,14 +2540,12 @@ impl Timeline {
                    tracing::debug!(layer=%name, ?decision, "applied");

                    let layer = match decision {
-                        UseLocal(local) => {
-                            total_physical_size += local.metadata.file_size();
-                            Layer::for_resident(conf, &this, local.local_path, name, local.metadata)
+                        Resident { local, remote } => {
+                            total_physical_size += local.file_size;
+                            Layer::for_resident(conf, &this, local.local_path, name, remote)
                                .drop_eviction_guard()
                        }
-                        Evicted(remote) | UseRemote { remote, .. } => {
-                            Layer::for_evicted(conf, &this, name, remote)
-                        }
+                        Evicted(remote) => Layer::for_evicted(conf, &this, name, remote),
                    };

                    loaded_layers.push(layer);
@@ -3069,7 +3054,7 @@ impl Timeline {

            HeatMapLayer::new(
                layer.layer_desc().layer_name(),
-                (&layer.metadata()).into(),
+                layer.metadata(),
                last_activity_ts,
            )
        });
@@ -4345,7 +4330,7 @@ impl Timeline {
        let delta_file_accessed = reconstruct_state.get_delta_layers_visited();

        let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
-        info!(
+        debug!(
            "generate image layers for metadata keys: trigger_generation={trigger_generation}, \
                delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
                total_key_retrieved={total_key_retrieved}"
@@ -4606,6 +4591,14 @@ impl Timeline {
    ) -> Result<Vec<TimelineId>, anyhow::Error> {
        detach_ancestor::complete(self, tenant, prepared, ctx).await
    }
+
+    /// Switch aux file policy and schedule upload to the index part.
+    pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> {
+        self.last_aux_file_policy.store(Some(policy));
+        self.remote_client
+            .schedule_index_upload_for_aux_file_policy_update(Some(policy))?;
+        Ok(())
+    }
 }

 /// Top-level failure to compact.
@@ -4715,11 +4708,16 @@ impl Timeline {

    async fn rewrite_layers(
        self: &Arc<Self>,
-        replace_layers: Vec<(Layer, ResidentLayer)>,
-        drop_layers: Vec<Layer>,
+        mut replace_layers: Vec<(Layer, ResidentLayer)>,
+        mut drop_layers: Vec<Layer>,
    ) -> anyhow::Result<()> {
        let mut guard = self.layers.write().await;

+        // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
+        // to avoid double-removing, and avoid rewriting something that was removed.
+        replace_layers.retain(|(l, _)| guard.contains(l));
+        drop_layers.retain(|l| guard.contains(l));
+
        guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics);

        let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
@@ -5594,26 +5592,6 @@ fn is_send() {
    _assert_send::<TimelineWriter<'_>>();
 }

-/// Add a suffix to a layer file's name: .{num}.old
-/// Uses the first available num (starts at 0)
-fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> {
-    let filename = path
-        .file_name()
-        .ok_or_else(|| anyhow!("Path {path} don't have a file name"))?;
-    let mut new_path = path.to_owned();
-
-    for i in 0u32.. {
-        new_path.set_file_name(format!("{filename}.{i}.old"));
-        if !new_path.exists() {
-            std::fs::rename(path, &new_path)
-                .with_context(|| format!("rename {path:?} to {new_path:?}"))?;
-            return Ok(());
-        }
-    }
-
-    bail!("couldn't find an unused backup number for {:?}", path)
-}
-
 #[cfg(test)]
 mod tests {
    use utils::{id::TimelineId, lsn::Lsn};
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -9,7 +9,10 @@ use std::ops::{Deref, Range};
 use std::sync::Arc;

 use super::layer_manager::LayerManager;
-use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};
+use super::{
+    CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
+    RecordedDuration, Timeline,
+};

 use anyhow::{anyhow, Context};
 use enumset::EnumSet;
@@ -22,14 +25,13 @@ use tracing::{debug, info, info_span, trace, warn, Instrument};
 use utils::id::TimelineId;

 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
+use crate::page_cache;
 use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
-use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
+use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
 use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
-use crate::tenant::PageReconstructError;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
-use crate::{page_cache, ZERO_PAGE};

 use crate::keyspace::KeySpace;
 use crate::repository::Key;
@@ -174,13 +176,24 @@ impl Timeline {
    async fn compact_shard_ancestors(
        self: &Arc<Self>,
        rewrite_max: usize,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut drop_layers = Vec::new();
-        let layers_to_rewrite: Vec<Layer> = Vec::new();
+        let mut layers_to_rewrite: Vec<Layer> = Vec::new();

-        // We will use the PITR cutoff as a condition for rewriting layers.
-        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;
+        // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
+        // layer is behind this Lsn, it indicates that the layer is being retained beyond the
+        // pitr_interval, for example because a branchpoint references it.
+        //
+        // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
+        // are rewriting layers.
+        let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
+
+        tracing::info!(
+            "latest_gc_cutoff: {}, pitr cutoff {}",
+            *latest_gc_cutoff,
+            self.gc_info.read().unwrap().cutoffs.pitr
+        );

        let layers = self.layers.read().await;
        for layer_desc in layers.layer_map().iter_historic_layers() {
@@ -239,9 +252,9 @@ impl Timeline {

            // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
            // without incurring the I/O cost of a rewrite.
-            if layer_desc.get_lsn_range().end >= pitr_cutoff {
-                debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
-                    layer_desc.get_lsn_range().end, pitr_cutoff);
+            if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
+                debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
+                    layer_desc.get_lsn_range().end, *latest_gc_cutoff);
                continue;
            }

@@ -251,13 +264,10 @@ impl Timeline {
                continue;
            }

-            // Only rewrite layers if they would have different remote paths: either they belong to this
-            // shard but an old generation, or they belonged to another shard.  This also implicitly
-            // guarantees that the layer is persistent in remote storage (as only remote persistent
-            // layers are carried across shard splits, any local-only layer would be in the current generation)
-            if layer.metadata().generation == self.generation
-                && layer.metadata().shard.shard_count == self.shard_identity.count
-            {
+            // Only rewrite layers if their generations differ.  This guarantees:
+            //  - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
+            //  - that the layer is persistent in remote storage, as we only see old-generation layers via loading from remote storage
+            if layer.metadata().generation == self.generation {
                debug!(%layer, "Skipping rewrite, is not from old generation");
                continue;
            }
@@ -270,18 +280,69 @@ impl Timeline {
            }

            // Fall through: all our conditions for doing a rewrite passed.
-            // TODO: implement rewriting
-            tracing::debug!(%layer, "Would rewrite layer");
+            layers_to_rewrite.push(layer);
        }

-        // Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
+        // Drop read lock on layer map before we start doing time-consuming I/O
        drop(layers);

-        // TODO: collect layers to rewrite
-        let replace_layers = Vec::new();
+        let mut replace_image_layers = Vec::new();
+
+        for layer in layers_to_rewrite {
+            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
+            let mut image_layer_writer = ImageLayerWriter::new(
+                self.conf,
+                self.timeline_id,
+                self.tenant_shard_id,
+                &layer.layer_desc().key_range,
+                layer.layer_desc().image_layer_lsn(),
+                ctx,
+            )
+            .await?;
+
+            // Safety of layer rewrites:
+            // - We are writing to a different local file path than we are reading from, so the old Layer
+            //   cannot interfere with the new one.
+            // - In the page cache, contents for a particular VirtualFile are stored with a file_id that
+            //   is different for two layers with the same name (in `ImageLayerInner::new` we always
+            //   acquire a fresh id from [`crate::page_cache::next_file_id`]).  So readers do not risk
+            //   reading the index from one layer file, and then data blocks from the rewritten layer file.
+            // - Any readers that have a reference to the old layer will keep it alive until they are done
+            //   with it. If they are trying to promote from remote storage, that will fail, but this is the same
+            //   as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
+            // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
+            //    - GC, which at worst witnesses us "undelete" a layer that they just deleted.
+            //    - ingestion, which only inserts layers, therefore cannot collide with us.
+            let resident = layer.download_and_keep_resident().await?;
+
+            let keys_written = resident
+                .filter(&self.shard_identity, &mut image_layer_writer, ctx)
+                .await?;
+
+            if keys_written > 0 {
+                let new_layer = image_layer_writer.finish(self, ctx).await?;
+                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
+                    layer.metadata().file_size,
+                    new_layer.metadata().file_size);
+
+                replace_image_layers.push((layer, new_layer));
+            } else {
+                // Drop the old layer.  Usually we would already have noticed that the layer has
+                // no data for us with the ShardedRange check above, but handle that case here too.
+                drop_layers.push(layer);
+            }
+        }
+
+        // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
+        // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
+        // to remote index) and be removed. This is inefficient but safe.
+        fail::fail_point!("compact-shard-ancestors-localonly");

        // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
-        self.rewrite_layers(replace_layers, drop_layers).await?;
+        self.rewrite_layers(replace_image_layers, drop_layers)
+            .await?;
+
+        fail::fail_point!("compact-shard-ancestors-enqueued");

        // We wait for all uploads to complete before finishing this compaction stage.  This is not
        // necessary for correctness, but it simplifies testing, and avoids proceeding with another
@@ -289,6 +350,8 @@ impl Timeline {
        // load.
        self.remote_client.wait_completion().await?;

+        fail::fail_point!("compact-shard-ancestors-persistent");
+
        Ok(())
    }

@@ -1150,10 +1213,10 @@ impl TimelineAdaptor {
        lsn: Lsn,
        key_range: &Range<Key>,
        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), CreateImageLayersError> {
        let timer = self.timeline.metrics.create_images_time_histo.start_timer();

-        let mut image_layer_writer = ImageLayerWriter::new(
+        let image_layer_writer = ImageLayerWriter::new(
            self.timeline.conf,
            self.timeline.timeline_id,
            self.timeline.tenant_shard_id,
@@ -1164,47 +1227,34 @@ impl TimelineAdaptor {
        .await?;

        fail_point!("image-layer-writer-fail-before-finish", |_| {
-            Err(PageReconstructError::Other(anyhow::anyhow!(
+            Err(CreateImageLayersError::Other(anyhow::anyhow!(
                "failpoint image-layer-writer-fail-before-finish"
            )))
        });
-        let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
-        for range in &keyspace_ranges {
-            let mut key = range.start;
-            while key < range.end {
-                let img = match self.timeline.get(key, lsn, ctx).await {
-                    Ok(img) => img,
-                    Err(err) => {
-                        // If we fail to reconstruct a VM or FSM page, we can zero the
-                        // page without losing any actual user data. That seems better
-                        // than failing repeatedly and getting stuck.
-                        //
-                        // We had a bug at one point, where we truncated the FSM and VM
-                        // in the pageserver, but the Postgres didn't know about that
-                        // and continued to generate incremental WAL records for pages
-                        // that didn't exist in the pageserver. Trying to replay those
-                        // WAL records failed to find the previous image of the page.
-                        // This special case allows us to recover from that situation.
-                        // See https://github.com/neondatabase/neon/issues/2601.
-                        //
-                        // Unfortunately we cannot do this for the main fork, or for
-                        // any metadata keys, keys, as that would lead to actual data
-                        // loss.
-                        if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
-                            warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
-                            ZERO_PAGE.clone()
-                        } else {
-                            return Err(err);
-                        }
-                    }
-                };
-                image_layer_writer.put_image(key, img, ctx).await?;
-                key = key.next();
-            }
-        }
-        let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;

-        self.new_images.push(image_layer);
+        let keyspace = KeySpace {
+            ranges: self.get_keyspace(key_range, lsn, ctx).await?,
+        };
+        // TODO set proper (stateful) start for create_image_layer_for_rel_blocks; for now always Key::MIN.
+        let start = Key::MIN;
+        let ImageLayerCreationOutcome {
+            image,
+            next_start_key: _,
+        } = self
+            .timeline
+            .create_image_layer_for_rel_blocks(
+                &keyspace,
+                image_layer_writer,
+                lsn,
+                ctx,
+                key_range.clone(),
+                start,
+            )
+            .await?;
+
+        if let Some(image_layer) = image {
+            self.new_images.push(image_layer);
+        }

        timer.stop_and_record();

--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -12,7 +12,7 @@ use crate::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
-use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};
+use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};

 #[derive(Debug, thiserror::Error)]
 pub(crate) enum Error {
@@ -41,6 +41,27 @@ pub(crate) enum Error {
    Unexpected(#[source] anyhow::Error),
 }

+impl From<Error> for ApiError {
+    fn from(value: Error) -> Self {
+        match value {
+            e @ Error::NoAncestor => ApiError::Conflict(e.to_string()),
+            // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
+            e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)),
+            Error::ShuttingDown => ApiError::ShuttingDown,
+            Error::OtherTimelineDetachOngoing(_) => {
+                ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
+            }
+            // All of these contain shutdown errors, in fact, it's the most common
+            e @ Error::FlushAncestor(_)
+            | e @ Error::RewrittenDeltaDownloadFailed(_)
+            | e @ Error::CopyDeltaPrefix(_)
+            | e @ Error::UploadRewritten(_)
+            | e @ Error::CopyFailed(_)
+            | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
+        }
+    }
+}
+
 pub(crate) struct PreparedTimelineDetach {
    layers: Vec<Layer>,
 }
@@ -75,6 +96,11 @@ pub(super) async fn prepare(
        .as_ref()
        .map(|tl| (tl.clone(), detached.ancestor_lsn))
    else {
+        // TODO: check if we have already been detached; for this we need to read the stored data
+        // on remote client, for that we need a follow-up which makes uploads cheaper and maintains
+        // a projection of the committed data.
+        //
+        // the error is wrong per openapi
        return Err(NoAncestor);
    };

@@ -84,7 +110,7 @@ pub(super) async fn prepare(

    if ancestor.ancestor_timeline.is_some() {
        // non-technical requirement; we could flatten N ancestors just as easily but we chose
-        // not to
+        // not to, at least initially
        return Err(TooManyAncestors);
    }

--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -7,19 +7,20 @@ use crate::{
            index::{IndexPart, LayerFileMetadata},
        },
        storage_layer::LayerName,
-        Generation,
    },
 };
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::shard::ShardIndex;
-use std::{collections::HashMap, str::FromStr};
+use std::{
+    collections::{hash_map, HashMap},
+    str::FromStr,
+};
 use utils::lsn::Lsn;

 /// Identified files in the timeline directory.
 pub(super) enum Discovered {
    /// The only one we care about
-    Layer(LayerName, Utf8PathBuf, u64),
+    Layer(LayerName, LocalLayerFileMetadata),
    /// Old ephmeral files from previous launches, should be removed
    Ephemeral(String),
    /// Old temporary timeline files, unsure what these really are, should be removed
@@ -27,7 +28,7 @@ pub(super) enum Discovered {
    /// Temporary on-demand download files, should be removed
    TemporaryDownload(String),
    /// Backup file from previously future layers
-    IgnoredBackup,
+    IgnoredBackup(Utf8PathBuf),
    /// Unrecognized, warn about these
    Unknown(String),
 }
@@ -43,12 +44,15 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
        let discovered = match LayerName::from_str(&file_name) {
            Ok(file_name) => {
                let file_size = direntry.metadata()?.len();
-                Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
+                Discovered::Layer(
+                    file_name,
+                    LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size),
+                )
            }
            Err(_) => {
                if file_name.ends_with(".old") {
                    // ignore these
-                    Discovered::IgnoredBackup
+                    Discovered::IgnoredBackup(direntry.path().to_owned())
                } else if remote_timeline_client::is_temp_download_file(direntry.path()) {
                    Discovered::TemporaryDownload(file_name)
                } else if is_ephemeral_file(&file_name) {
@@ -71,37 +75,32 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
 /// this structure extends it with metadata describing the layer's presence in local storage.
 #[derive(Clone, Debug)]
 pub(super) struct LocalLayerFileMetadata {
-    pub(super) metadata: LayerFileMetadata,
+    pub(super) file_size: u64,
    pub(super) local_path: Utf8PathBuf,
 }

 impl LocalLayerFileMetadata {
-    pub fn new(
-        local_path: Utf8PathBuf,
-        file_size: u64,
-        generation: Generation,
-        shard: ShardIndex,
-    ) -> Self {
+    pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self {
        Self {
            local_path,
-            metadata: LayerFileMetadata::new(file_size, generation, shard),
+            file_size,
        }
    }
 }

-/// Decision on what to do with a layer file after considering its local and remote metadata.
+/// For a layer that is present in remote metadata, this type describes how to handle
+/// it during startup: it is either Resident (and we have some metadata about a local file),
+/// or it is Evicted (and we only have remote metadata).
 #[derive(Clone, Debug)]
 pub(super) enum Decision {
    /// The layer is not present locally.
    Evicted(LayerFileMetadata),
-    /// The layer is present locally, but local metadata does not match remote; we must
-    /// delete it and treat it as evicted.
-    UseRemote {
+    /// The layer is present locally, and metadata matches: we may hook up this layer to the
+    /// existing file in local storage.
+    Resident {
        local: LocalLayerFileMetadata,
        remote: LayerFileMetadata,
    },
-    /// The layer is present locally, and metadata matches.
-    UseLocal(LocalLayerFileMetadata),
 }

 /// A layer needs to be left out of the layer map.
@@ -117,77 +116,81 @@ pub(super) enum DismissedLayer {
    /// In order to make crash safe updates to layer map, we must dismiss layers which are only
    /// found locally or not yet included in the remote `index_part.json`.
    LocalOnly(LocalLayerFileMetadata),
+
+    /// The layer exists in remote storage but the local layer's metadata (e.g. file size)
+    /// does not match it
+    BadMetadata(LocalLayerFileMetadata),
 }

 /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
 pub(super) fn reconcile(
-    discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
+    local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
    index_part: Option<&IndexPart>,
    disk_consistent_lsn: Lsn,
-    generation: Generation,
-    shard: ShardIndex,
 ) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
-    use Decision::*;
+    let Some(index_part) = index_part else {
+        // If we have no remote metadata, no local layer files are considered valid to load
+        return local_layers
+            .into_iter()
+            .map(|(layer_name, local_metadata)| {
+                (layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
+            })
+            .collect();
+    };

-    // name => (local_metadata, remote_metadata)
-    type Collected =
-        HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;
+    let mut result = Vec::new();

-    let mut discovered = discovered
-        .into_iter()
-        .map(|(layer_name, local_path, file_size)| {
-            (
-                layer_name,
-                // The generation and shard here will be corrected to match IndexPart in the merge below, unless
-                // it is not in IndexPart, in which case using our current generation makes sense
-                // because it will be uploaded in this generation.
-                (
-                    Some(LocalLayerFileMetadata::new(
-                        local_path, file_size, generation, shard,
-                    )),
-                    None,
-                ),
-            )
-        })
-        .collect::<Collected>();
+    let mut remote_layers = HashMap::new();

-    // merge any index_part information, when available
+    // Construct Decisions for layers that are found locally, if they're in remote metadata.  Otherwise
+    // construct DismissedLayers to get rid of them.
+    for (layer_name, local_metadata) in local_layers {
+        let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
+            result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
+            continue;
+        };
+
+        if remote_metadata.file_size != local_metadata.file_size {
+            result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata))));
+            continue;
+        }
+
+        remote_layers.insert(
+            layer_name,
+            Decision::Resident {
+                local: local_metadata,
+                remote: remote_metadata.clone(),
+            },
+        );
+    }
+
+    // Construct Decision for layers that were not found locally
    index_part
-        .as_ref()
-        .map(|ip| ip.layer_metadata.iter())
-        .into_iter()
-        .flatten()
-        .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
+        .layer_metadata
+        .iter()
        .for_each(|(name, metadata)| {
-            if let Some(existing) = discovered.get_mut(name) {
-                existing.1 = Some(metadata);
-            } else {
-                discovered.insert(name.to_owned(), (None, Some(metadata)));
+            if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) {
+                entry.insert(Decision::Evicted(metadata.clone()));
            }
        });

-    discovered
-        .into_iter()
-        .map(|(name, (local, remote))| {
-            let decision = if name.is_in_future(disk_consistent_lsn) {
-                Err(DismissedLayer::Future { local })
-            } else {
-                match (local, remote) {
-                    (Some(local), Some(remote)) if local.metadata != remote => {
-                        Ok(UseRemote { local, remote })
-                    }
-                    (Some(x), Some(_)) => Ok(UseLocal(x)),
-                    (None, Some(x)) => Ok(Evicted(x)),
-                    (Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
-                    (None, None) => {
-                        unreachable!("there must not be any non-local non-remote files")
-                    }
-                }
-            };
+    // For layers that were found in authoritative remote metadata, apply a final check that they are within
+    // the disk_consistent_lsn.
+    result.extend(remote_layers.into_iter().map(|(name, decision)| {
+        if name.is_in_future(disk_consistent_lsn) {
+            match decision {
+                Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })),
+                Decision::Resident {
+                    local,
+                    remote: _remote,
+                } => (name, Err(DismissedLayer::Future { local: Some(local) })),
+            }
+        } else {
+            (name, Ok(decision))
+        }
+    }));

-            (name, decision)
-        })
-        .collect::<Vec<_>>()
+    result
 }

 pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
@@ -196,25 +199,15 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
    std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
 }

-pub(super) fn cleanup_local_file_for_remote(
-    local: &LocalLayerFileMetadata,
-    remote: &LayerFileMetadata,
-) -> anyhow::Result<()> {
-    let local_size = local.metadata.file_size();
-    let remote_size = remote.file_size();
+pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> {
+    let local_size = local.file_size;
    let path = &local.local_path;
-
    let file_name = path.file_name().expect("must be file path");
-    tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
-    if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
-        assert!(
-            path.exists(),
-            "we would leave the local_layer without a file if this does not hold: {path}",
-        );
-        Err(err)
-    } else {
-        Ok(())
-    }
+    tracing::warn!(
+        "removing local file {file_name:?} because it has unexpected length {local_size};"
+    );
+
+    std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}"))
 }

 pub(super) fn cleanup_future_layer(
@@ -236,8 +229,8 @@ pub(super) fn cleanup_local_only_file(
 ) -> anyhow::Result<()> {
    let kind = name.kind();
    tracing::info!(
-        "found local-only {kind} layer {name}, metadata {:?}",
-        local.metadata
+        "found local-only {kind} layer {name} size {}",
+        local.file_size
    );
    std::fs::remove_file(&local.local_path)?;
    Ok(())
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -212,13 +212,34 @@ impl LayerManager {
        &mut self,
        rewrite_layers: &[(Layer, ResidentLayer)],
        drop_layers: &[Layer],
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
+        for (old_layer, new_layer) in rewrite_layers {
+            debug_assert_eq!(
+                old_layer.layer_desc().key_range,
+                new_layer.layer_desc().key_range
+            );
+            debug_assert_eq!(
+                old_layer.layer_desc().lsn_range,
+                new_layer.layer_desc().lsn_range
+            );

-        // TODO: implement rewrites (currently this code path only used for drops)
-        assert!(rewrite_layers.is_empty());
+            // Safety: we may never rewrite the same file in-place.  Callers are responsible
+            // for ensuring that they only rewrite layers after something changes the path,
+            // such as an increment in the generation number.
+            assert_ne!(old_layer.local_path(), new_layer.local_path());

+            Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr);
+
+            Self::insert_historic_layer(
+                new_layer.as_ref().clone(),
+                &mut updates,
+                &mut self.layer_fmgr,
+            );
+
+            metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
+        }
        for l in drop_layers {
            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
        }
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -213,10 +213,7 @@ impl UploadQueue {

        let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
        for (layer_name, layer_metadata) in &index_part.layer_metadata {
-            files.insert(
-                layer_name.to_owned(),
-                LayerFileMetadata::from(layer_metadata),
-            );
+            files.insert(layer_name.to_owned(), layer_metadata.clone());
        }

        info!(
@@ -322,9 +319,7 @@ impl std::fmt::Display for UploadOp {
                write!(
                    f,
                    "UploadLayer({}, size={:?}, gen={:?})",
-                    layer,
-                    metadata.file_size(),
-                    metadata.generation
+                    layer, metadata.file_size, metadata.generation
                )
            }
            UploadOp::UploadMetadata(_, lsn) => {
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -49,9 +49,8 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;

-int         neon_protocol_version = 1;
+int         neon_protocol_version = 2;

-static int	n_reconnect_attempts = 0;
 static int	max_reconnect_attempts = 60;
 static int	stripe_size;

@@ -95,18 +94,44 @@ static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;

+typedef enum PSConnectionState {
+	PS_Disconnected,			/* no connection yet */
+	PS_Connecting_Startup,		/* connection starting up */
+	PS_Connecting_PageStream,	/* negotiating pagestream */ 
+	PS_Connected,				/* connected, pagestream established */
+} PSConnectionState;
+
 /* This backend's per-shard connections */
 typedef struct
 {
-	PGconn	   *conn;
+	TimestampTz		last_connect_time; /* read-only debug value */
+	TimestampTz		last_reconnect_time;
+	uint32			delay_us;
+	int				n_reconnect_attempts;

 	/*---
-	 * WaitEventSet containing:
-	 * - WL_SOCKET_READABLE on 'conn'
-	 * - WL_LATCH_SET on MyLatch, and
-	 * - WL_EXIT_ON_PM_DEATH.
+	 * Pageserver connection state, i.e.
+	 *	disconnected: conn == NULL, wes_read == NULL, wes_write == NULL;
+	 *	conn_startup: connection initiated, waiting for it to be established
+	 *	conn_ps:      PageStream query sent, waiting for confirmation
+	 *	connected:    PageStream established
 	 */
-	WaitEventSet *wes;
+	PSConnectionState state;
+	PGconn		   *conn;
+	/*---
+	 * WaitEventSet containing:
+	 *	- WL_SOCKET_READABLE on 'conn'
+	 *	- WL_LATCH_SET on MyLatch, and
+	 *	- WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet   *wes_read;
+	/*---
+	 * WaitEventSet containing:
+	 *	- WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE on 'conn'
+	 *	- WL_LATCH_SET on MyLatch, and
+	 *	- WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet   *wes_write;
 } PageServer;

 static PageServer page_servers[MAX_SHARDS];
@@ -303,119 +328,269 @@ get_shard_number(BufferTag *tag)
 	return hash % n_shards;
 }

+static inline void
+CLEANUP_AND_DISCONNECT(PageServer *shard) 
+{
+	if (shard->wes_read)
+	{
+		FreeWaitEventSet(shard->wes_read);
+		shard->wes_read = NULL;
+	}
+	if (shard->wes_write)
+	{
+		FreeWaitEventSet(shard->wes_write);
+		shard->wes_write = NULL;
+	}
+	if (shard->conn)
+	{
+		PQfinish(shard->conn);
+		shard->conn = NULL;
+	}
+
+	shard->state = PS_Disconnected;
+}
+
+/*
+ * Connect to a pageserver, or continue to try to connect if we're yet to
+ * complete the connection (e.g. due to receiving an earlier cancellation
+ * during connection start).
+ * Returns true if successfully connected; false if the connection failed.
+ * 
+ * Throws errors in unrecoverable situations, or when this backend's query
+ * is canceled.
+ */
 static bool
 pageserver_connect(shardno_t shard_no, int elevel)
 {
-	char	   *query;
-	int			ret;
-	const char *keywords[3];
-	const char *values[3];
-	int			n;
-	PGconn	   *conn;
-	WaitEventSet *wes;
+	PageServer *shard = &page_servers[shard_no];
 	char		connstr[MAX_PAGESERVER_CONNSTRING_SIZE];

-	static TimestampTz last_connect_time = 0;
-	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
-	TimestampTz now;
-	uint64_t	us_since_last_connect;
-	bool	broke_from_loop = false;
-
-	Assert(page_servers[shard_no].conn == NULL);
-
 	/*
 	 * Get the connection string for this shard. If the shard map has been
 	 * updated since we last looked, this will also disconnect any existing
 	 * pageserver connections as a side effect.
+	 * Note that connstr is used both during connection start, and when we
+	 * log the successful connection.
 	 */
 	load_shard_map(shard_no, connstr, NULL);

-	now = GetCurrentTimestamp();
-	us_since_last_connect = now - last_connect_time;
-	if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
+	switch (shard->state)
 	{
-		pg_usleep(delay_us);
-		delay_us *= 2;
-	}
-	else
+	case PS_Disconnected:
 	{
-		delay_us = MIN_RECONNECT_INTERVAL_USEC;
-	}
+		const char *keywords[3];
+		const char *values[3];
+		int			n_pgsql_params;
+		TimestampTz	now;
+		int64		us_since_last_attempt;

-	/*
-	 * Connect using the connection string we got from the
-	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
-	 * variable was set, use that as the password.
-	 *
-	 * The connection options are parsed in the order they're given, so when
-	 * we set the password before the connection string, the connection string
-	 * can override the password from the env variable. Seems useful, although
-	 * we don't currently use that capability anywhere.
-	 */
-	n = 0;
-	if (neon_auth_token)
-	{
-		keywords[n] = "password";
-		values[n] = neon_auth_token;
-		n++;
+		/* Make sure we start with a clean slate */
+		CLEANUP_AND_DISCONNECT(shard);
+
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected");
+
+		now = GetCurrentTimestamp();
+		us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
+		shard->last_reconnect_time = now;
+
+		/*
+		 * If we did other tasks between reconnect attempts, then we won't
+		 * need to wait as long as a full delay.
+		 */
+		if (us_since_last_attempt < shard->delay_us)
+		{
+			pg_usleep(shard->delay_us - us_since_last_attempt);
+		}
+
+		/* update the delay metric */
+		shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC);
+
+		/*
+		 * Connect using the connection string we got from the
+		 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
+		 * variable was set, use that as the password.
+		 *
+		 * The connection options are parsed in the order they're given, and a
+		 * later entry wins. NOTE(review): "password" is added after the expanded
+		 * "dbname" below, so the env token overrides any password embedded in
+		 * the connection string, not the other way around -- confirm intended.
+		 */
+		keywords[0] = "dbname";
+		values[0] = connstr;
+		n_pgsql_params = 1;
+
+		if (neon_auth_token)
+		{
+			keywords[1] = "password";
+			values[1] = neon_auth_token;
+			n_pgsql_params++;
+		}
+
+		keywords[n_pgsql_params] = NULL;
+		values[n_pgsql_params] = NULL;
+
+		shard->conn = PQconnectStartParams(keywords, values, 1);
+		if (!shard->conn)
+		{
+			neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
+			return false;
+		}
+
+		shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
+		AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
+						  MyLatch, NULL);
+		AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+						  NULL, NULL);
+		AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL);
+
+		shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3);
+		AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET,
+						  MyLatch, NULL);
+		AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+						  NULL, NULL);
+		AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE,
+						  PQsocket(shard->conn),
+						  NULL, NULL);
+
+		shard->state = PS_Connecting_Startup;
+		/* fallthrough */
 	}
-	keywords[n] = "dbname";
-	values[n] = connstr;
-	n++;
-	keywords[n] = NULL;
-	values[n] = NULL;
-	n++;
-	conn = PQconnectdbParams(keywords, values, 1);
-	last_connect_time = GetCurrentTimestamp();
-
-	if (PQstatus(conn) == CONNECTION_BAD)
+	case PS_Connecting_Startup:
 	{
-		char	   *msg = pchomp(PQerrorMessage(conn));
+		char	   *pagestream_query;
+		int			ps_send_query_ret;
+		bool		connected = false;

-		PQfinish(conn);
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup");

-		ereport(elevel,
-				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-				 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
-				 errdetail_internal("%s", msg)));
-		pfree(msg);
-		return false;
-	}
-	switch (neon_protocol_version)
-	{
+		do
+		{
+			WaitEvent	event;
+			int			poll_result = PQconnectPoll(shard->conn);
+
+			switch (poll_result)
+			{
+			default: /* unknown/unused states are handled as a failed connection */
+			case PGRES_POLLING_FAILED:
+				{
+					char	   *pqerr = PQerrorMessage(shard->conn);
+					char	   *msg = NULL;
+					neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED");
+
+					if (pqerr)
+						msg = pchomp(pqerr);
+
+					CLEANUP_AND_DISCONNECT(shard);
+
+					if (msg)
+					{
+						neon_shard_log(shard_no, elevel,
+									   "could not connect to pageserver: %s",
+									   msg);
+						pfree(msg);
+					}
+					else
+						neon_shard_log(shard_no, elevel,
+									   "could not connect to pageserver");
+
+					return false;
+				}
+			case PGRES_POLLING_READING:
+				/* Sleep until there's something to do */
+				(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
+										PG_WAIT_EXTENSION);
+				ResetLatch(MyLatch);
+
+				/* query cancellation, backend shutdown */
+				CHECK_FOR_INTERRUPTS();
+
+				/* PQconnectPoll() handles the socket polling state updates */
+
+				break;
+			case PGRES_POLLING_WRITING:
+				/* Sleep until there's something to do */
+				(void) WaitEventSetWait(shard->wes_write, -1L, &event, 1,
+										PG_WAIT_EXTENSION);
+				ResetLatch(MyLatch);
+
+				/* query cancellation, backend shutdown */
+				CHECK_FOR_INTERRUPTS();
+
+				/* PQconnectPoll() handles the socket polling state updates */
+
+				break;
+			case PGRES_POLLING_OK:
+				neon_shard_log(shard_no, DEBUG5, "POLLING_OK");
+				connected = true;
+				break;
+			}
+		}
+		while (!connected);
+
+		/* No more polling needed; connection succeeded */
+		shard->last_connect_time = GetCurrentTimestamp();
+
+		switch (neon_protocol_version)
+		{
 		case 2:
-			query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
+			pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
 			break;
 		case 1:
-			query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
+			pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
 			break;
 		default:
 			elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
-	}
-	ret = PQsendQuery(conn, query);
-	pfree(query);
-	if (ret != 1)
-	{
-		PQfinish(conn);
-		neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
-		return false;
-	}
+		}

-	wes = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
-					  MyLatch, NULL);
-	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-					  NULL, NULL);
-	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);
+		if (PQstatus(shard->conn) == CONNECTION_BAD)
+		{
+			char	   *msg = pchomp(PQerrorMessage(shard->conn));

-	PG_TRY();
+			CLEANUP_AND_DISCONNECT(shard);
+
+			ereport(elevel,
+					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+						errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
+						errdetail_internal("%s", msg)));
+			pfree(msg);
+			return false;
+		}
+
+		ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query);
+		pfree(pagestream_query);
+		if (ps_send_query_ret != 1)
+		{
+			CLEANUP_AND_DISCONNECT(shard);
+
+			neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
+			return false;
+		}
+
+		shard->state = PS_Connecting_PageStream;
+		/* fallthrough */
+	}
+	case PS_Connecting_PageStream:
 	{
-		while (PQisBusy(conn))
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream");
+
+		if (PQstatus(shard->conn) == CONNECTION_BAD)
+		{
+			char	   *msg = pchomp(PQerrorMessage(shard->conn));
+			CLEANUP_AND_DISCONNECT(shard);
+			ereport(elevel,
+					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+						errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
+						errdetail_internal("%s", msg)));
+			pfree(msg);
+			return false;
+		}
+
+		while (PQisBusy(shard->conn))
 		{
 			WaitEvent	event;

 			/* Sleep until there's something to do */
-			(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+			(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
 			ResetLatch(MyLatch);

 			CHECK_FOR_INTERRUPTS();
@@ -423,40 +598,37 @@ pageserver_connect(shardno_t shard_no, int elevel)
 			/* Data available in socket? */
 			if (event.events & WL_SOCKET_READABLE)
 			{
-				if (!PQconsumeInput(conn))
+				if (!PQconsumeInput(shard->conn))
 				{
-					char	   *msg = pchomp(PQerrorMessage(conn));
-
-					PQfinish(conn);
-					FreeWaitEventSet(wes);
+					char	   *msg = pchomp(PQerrorMessage(shard->conn));

+					CLEANUP_AND_DISCONNECT(shard);
 					neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
 								   msg);
-					/* Returning from inside PG_TRY is bad, so we break/return later */
-					broke_from_loop = true;
-					break;
+					pfree(msg);
+					return false;
 				}
 			}
 		}
-	}
-	PG_CATCH();
-	{
-		PQfinish(conn);
-		FreeWaitEventSet(wes);
-		PG_RE_THROW();
-	}
-	PG_END_TRY();

-	if (broke_from_loop)
-	{
-		return false;
+		shard->state = PS_Connected;
+		/* fallthrough */
 	}
+	case PS_Connected:
+		/*
+		 * We successfully connected. Future connections to this PageServer
+		 * will do fast retries again, with exponential backoff.
+		 */
+		shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;

-	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
-	page_servers[shard_no].conn = conn;
-	page_servers[shard_no].wes = wes;
-
-	return true;
+		neon_shard_log(shard_no, DEBUG5, "Connection state: Connected");
+		neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
+		return true;
+	default:
+		neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state);
+	}
+	/* This shouldn't be hit */
+	Assert(false);
 }

 /*
@@ -476,7 +648,7 @@ retry:
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -502,7 +674,8 @@ retry:

 /*
 * Reset prefetch and drop connection to the shard.
- * It also drops connection to all other shards involved in prefetch.
+ * It also drops connection to all other shards involved in prefetch, through
+ * prefetch_on_ps_disconnect().
 */
 static void
 pageserver_disconnect(shardno_t shard_no)
@@ -512,9 +685,6 @@ pageserver_disconnect(shardno_t shard_no)
 	 * whole prefetch queue, even for other pageservers. It should not
 	 * cause big problems, because connection loss is supposed to be a
 	 * rare event.
-	 *
-	 * Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
-	 * because prefetch request may be registered before connection is established.
 	 */
 	prefetch_on_ps_disconnect();

@@ -527,37 +697,36 @@ pageserver_disconnect(shardno_t shard_no)
 static void
 pageserver_disconnect_shard(shardno_t shard_no)
 {
+	PageServer *shard = &page_servers[shard_no];
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
 	 * what state the connection is in. For example, if we sent the request
 	 * but didn't receive a response yet, we might receive the response some
 	 * time later after we have already sent a new unrelated request. Close
 	 * the connection to avoid getting confused.
+	 * Similarly, even when we're in PS_Disconnected, we may have junk to
+	 * clean up: It is possible that we encountered an error allocating any
+	 * of the wait event sets or the psql connection, or failed when we tried
+	 * to attach wait events to the WaitEventSets.
 	 */
-	if (page_servers[shard_no].conn)
-	{
-		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
-		PQfinish(page_servers[shard_no].conn);
-		page_servers[shard_no].conn = NULL;
-	}
-	if (page_servers[shard_no].wes != NULL)
-	{
-		FreeWaitEventSet(page_servers[shard_no].wes);
-		page_servers[shard_no].wes = NULL;
-	}
+	CLEANUP_AND_DISCONNECT(shard);
+
+	shard->state = PS_Disconnected;
 }

 static bool
 pageserver_send(shardno_t shard_no, NeonRequest *request)
 {
 	StringInfoData req_buff;
-	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+	PageServer *shard = &page_servers[shard_no];
+	PGconn	   *pageserver_conn;

 	/* If the connection was lost for some reason, reconnect */
-	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
 	{
 		neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
 		pageserver_disconnect(shard_no);
+		pageserver_conn = NULL;
 	}

 	req_buff = nm_pack_request(request);
@@ -571,17 +740,19 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
 	 * connection in case of failure.
 	 */
-	if (!page_servers[shard_no].conn)
+	if (shard->state != PS_Connected)
 	{
-		while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
 			HandleMainLoopInterrupts();
-			n_reconnect_attempts += 1;
+			shard->n_reconnect_attempts += 1;
 		}
-		n_reconnect_attempts = 0;
+		shard->n_reconnect_attempts = 0;
+	} else {
+		Assert(shard->conn != NULL);
 	}

-	pageserver_conn = page_servers[shard_no].conn;
+	pageserver_conn = shard->conn;

 	/*
 	 * Send request.
@@ -590,13 +761,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * should use async mode and check for interrupts while waiting. In
 	 * practice, our requests are small enough to always fit in the output and
 	 * TCP buffer.
+	 *
+	 * Note that this also will fail when the connection is in the
+	 * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
+	 * point, but in the grand scheme of things it's only a small issue.
 	 */
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

 		pageserver_disconnect(shard_no);
-		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -611,6 +786,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
+
 	return true;
 }

@@ -619,58 +795,68 @@ pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
-	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+	PageServer *shard = &page_servers[shard_no];
+	PGconn	   *pageserver_conn = shard->conn;
+	/* read response */
+	int			rc;

-	if (!pageserver_conn)
-		return NULL;
-
-	PG_TRY();
+	if (shard->state != PS_Connected)
 	{
-		/* read response */
-		int			rc;
+		neon_shard_log(shard_no, LOG,
+					   "pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x",
+					   shard->state);
+		return NULL;
+	}

-		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
-		if (rc >= 0)
+	Assert(pageserver_conn);
+
+	rc = call_PQgetCopyData(shard_no, &resp_buff.data);
+	if (rc >= 0)
+	{
+		/* call_PQgetCopyData handles rc == 0 */
+		Assert(rc > 0);
+
+		PG_TRY();
 		{
 			resp_buff.len = rc;
 			resp_buff.cursor = 0;
 			resp = nm_unpack_response(&resp_buff);
 			PQfreemem(resp_buff.data);
-
-			if (message_level_is_interesting(PageStoreTrace))
-			{
-				char	   *msg = nm_to_string((NeonMessage *) resp);
-
-				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
-				pfree(msg);
-			}
 		}
-		else if (rc == -1)
+		PG_CATCH();
 		{
-			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
+			neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
 			pageserver_disconnect(shard_no);
-			resp = NULL;
+			PG_RE_THROW();
 		}
-		else if (rc == -2)
-		{
-			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		PG_END_TRY();

-			pageserver_disconnect(shard_no);
-			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
-		}
-		else
+		if (message_level_is_interesting(PageStoreTrace))
 		{
-			pageserver_disconnect(shard_no);
-			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
+			char	   *msg = nm_to_string((NeonMessage *) resp);
+
+			neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
+			pfree(msg);
 		}
 	}
-	PG_CATCH();
+	else if (rc == -1)
 	{
-		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
 		pageserver_disconnect(shard_no);
-		PG_RE_THROW();
+		resp = NULL;
+	}
+	else if (rc == -2)
+	{
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
+	}
+	else
+	{
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
 	}
-	PG_END_TRY();

 	return (NeonResponse *) resp;
 }
@@ -681,7 +867,7 @@ pageserver_flush(shardno_t shard_no)
 {
 	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

-	if (!pageserver_conn)
+	if (page_servers[shard_no].state != PS_Connected)
 	{
 		neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
 	}
@@ -697,6 +883,7 @@ pageserver_flush(shardno_t shard_no)
 			return false;
 		}
 	}
+
 	return true;
 }

@@ -860,7 +1047,7 @@ pg_init_libpagestore(void)
 							"Version of compute<->page server protocol",
 							NULL,
 							&neon_protocol_version,
-							1, /* default to old protocol for now */
+							2, /* use protocol version 2 */
 							1, /* min */
 							2, /* max */
 							PGC_SU_BACKEND,
@@ -891,5 +1078,7 @@ pg_init_libpagestore(void)
 		dbsize_hook = neon_dbsize;
 	}

+	memset(page_servers, 0, sizeof(page_servers));
+
 	lfc_init();
 }
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -94,6 +94,10 @@ static char *hexdump_page(char *page);

 const int	SmgrTrace = DEBUG5;

+#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
+	neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
+				   ##__VA_ARGS__)
+
 page_server_api *page_server;

 /* unlogged relation build states */
@@ -526,6 +530,8 @@ prefetch_flush_requests(void)
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
+ * NOTE: callers should make sure they can handle query cancellations in this
+ * function's call path.
 */
 static bool
 prefetch_wait_for(uint64 ring_index)
@@ -561,6 +567,8 @@ prefetch_wait_for(uint64 ring_index)
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
+ *
+ * NOTE: this does IO, and can get canceled out-of-line.
 */
 static bool
 prefetch_read(PrefetchRequest *slot)
@@ -572,6 +580,14 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_receive);

+	if (slot->status != PRFS_REQUESTED ||
+		slot->response != NULL ||
+		slot->my_ring_index != MyPState->ring_receive)
+		neon_shard_log(slot->shard_no, ERROR,
+					   "Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu",
+					   slot->status, (size_t) (void *) slot->response,
+					   slot->my_ring_index, MyPState->ring_receive);
+
 	old = MemoryContextSwitchTo(MyPState->errctx);
 	response = (NeonResponse *) page_server->receive(slot->shard_no);
 	MemoryContextSwitchTo(old);
@@ -589,6 +605,11 @@ prefetch_read(PrefetchRequest *slot)
 	}
 	else
 	{
+		neon_shard_log(slot->shard_no, WARNING,
+					   "No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
+					   slot->my_ring_index,
+					   RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
+					   slot->buftag.forkNum, slot->buftag.blockNum);
 		return false;
 	}
 }
@@ -603,6 +624,7 @@ void
 prefetch_on_ps_disconnect(void)
 {
 	MyPState->ring_flush = MyPState->ring_unused;
+
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
@@ -625,6 +647,7 @@ prefetch_on_ps_disconnect(void)
 		slot->status = PRFS_TAG_REMAINS;
 		MyPState->n_requests_inflight -= 1;
 		MyPState->ring_receive += 1;
+
 		prefetch_set_unused(ring_index);
 	}
 }
@@ -691,6 +714,8 @@ static void
 prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
 {
 	bool		found;
+	uint64		mySlotNo = slot->my_ring_index;
+
 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
 		/* lsn and not_modified_since are filled in below */
@@ -699,6 +724,8 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 		.blkno = slot->buftag.blockNum,
 	};

+	Assert(mySlotNo == MyPState->ring_unused);
+
 	if (force_request_lsns)
 		slot->request_lsns = *force_request_lsns;
 	else
@@ -711,7 +738,11 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);

-	while (!page_server->send(slot->shard_no, (NeonRequest *) &request));
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
+	{
+		Assert(mySlotNo == MyPState->ring_unused);
+		/* loop */
+	}

 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
@@ -722,7 +753,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns

 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
-
 	prfh_insert(MyPState->prf_hash, slot, &found);
 	Assert(!found);
 }
@@ -894,6 +924,10 @@ Retry:
 	return ring_index;
 }

+/*
+ * Note: this function can get canceled and use a long jump to the next catch
+ * context. Take care.
+ */
 static NeonResponse *
 page_server_request(void const *req)
 {
@@ -925,19 +959,38 @@ page_server_request(void const *req)
 	 * Current sharding model assumes that all metadata is present only at shard 0.
 	 * We still need to call get_shard_no() to check if shard map is up-to-date.
 	 */
-	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest ||
+		((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
 	{
 		shard_no = 0;
 	}

 	do
 	{
-		while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
-		consume_prefetch_responses();
-		resp = page_server->receive(shard_no);
-	} while (resp == NULL);
-	return resp;
+		PG_TRY();
+		{
+			while (!page_server->send(shard_no, (NeonRequest *) req)
+				   || !page_server->flush(shard_no))
+			{
+				/* do nothing */
+			}
+			consume_prefetch_responses();
+			resp = page_server->receive(shard_no);
+		}
+		PG_CATCH();
+		{
+			/*
+			 * Cancellation in this code needs to be handled better at some
+			 * point, but this currently seems fine for now.
+			 */
+			page_server->disconnect(shard_no);
+			PG_RE_THROW();
+		}
+		PG_END_TRY();

+	} while (resp == NULL);
+
+	return resp;
 }


@@ -1349,6 +1402,10 @@ PageIsEmptyHeapPage(char *buffer)
 	return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
 }

+/*
+ * A page is being evicted from the shared buffer cache. Update the
+ * last-written LSN of the page, and WAL-log it if needed.
+ */
 static void
 #if PG_MAJORVERSION_NUM < 16
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
@@ -1357,12 +1414,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 #endif
 {
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
-
-	if (ShutdownRequestPending)
-		return;
-	/* Don't log any pages if we're not allowed to do so. */
-	if (!XLogInsertAllowed())
-		return;
+	bool		log_page;

 	/*
 	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
@@ -1371,9 +1423,21 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 	 * correctness, the non-logged updates are not critical. But we want to
 	 * have a reasonably up-to-date VM and FSM in the page server.
 	 */
-	if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
+	log_page = false;
+	if (force)
+	{
+		Assert(XLogInsertAllowed());
+		log_page = true;
+	}
+	else if (XLogInsertAllowed() &&
+			 !ShutdownRequestPending &&
+			 (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
+	{
+		log_page = true;
+	}
+
+	if (log_page)
 	{
-		/* FSM is never WAL-logged and we don't care. */
 		XLogRecPtr	recptr;

 		recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
@@ -1386,7 +1450,8 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
 	}
-	else if (lsn == InvalidXLogRecPtr)
+
+	if (lsn == InvalidXLogRecPtr)
 	{
 		/*
 		 * When PostgreSQL extends a relation, it calls smgrextend() with an
@@ -1422,19 +1487,31 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
 		}
-		else
+		else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
 		{
-			ereport(PANIC,
+			/*
+			 * It's a bad sign if there is a page with zero LSN in the buffer
+			 * cache in a standby, too. However, PANICing seems like a cure
+			 * worse than the disease, as the damage has likely already been
+			 * done in the primary. So in a standby, make this an assertion,
+			 * and in a release build just LOG the error and soldier on. We
+			 * update the last-written LSN of the page with a conservative
+			 * value in that case, which is the last replayed LSN.
+			 */
+			ereport(RecoveryInProgress() ? LOG : PANIC,
 					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
 							blocknum,
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
+			Assert(false);
+
+			lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
 		}
 	}
 	else
 	{
 		ereport(SmgrTrace,
-				(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
+				(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
 						blocknum,
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1527,8 +1604,92 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)

 	if (RecoveryInProgress())
 	{
-		/* Request the page at the last replayed LSN. */
-		result.request_lsn = GetXLogReplayRecPtr(NULL);
+		/*---
+		 * In broad strokes, a replica always requests the page at the current
+		 * replay LSN. But looking closer, what exactly is the replay LSN? Is
+		 * it the last replayed record, or the record being replayed? And does
+		 * the startup process performing the replay need to do something
+		 * differently than backends running queries? Let's take a closer look
+		 * at the different scenarios:
+		 *
+		 * 1. Startup process reads a page, last_written_lsn is old.
+		 *
+		 * Read the old version of the page. We will apply the WAL record on
+		 * it to bring it up-to-date.
+		 *
+		 * We could read the new version, with the changes from this WAL
+		 * record already applied, to offload the work of replaying the record
+		 * to the pageserver. The pageserver might not have received the WAL
+		 * record yet, though, so a read of the old page version and applying
+		 * the record ourselves is likely faster. Also, the redo function
+		 * might be surprised if the changes have already been applied. That's
+		 * normal during crash recovery, but not in hot standby.
+		 *
+		 * 2. Startup process reads a page, last_written_lsn == record we're
+		 *    replaying.
+		 *
+		 * Can this happen? There are a few theoretical cases when it might:
+		 *
+		 * A) The redo function reads the same page twice. We had already read
+		 *    and applied the changes once, and now we're reading it for the
+		 *    second time.  That would be a rather silly thing for a redo
+		 *    function to do, and I'm not aware of any that would do it.
+		 *
+		 * B) The redo function modifies multiple pages, and it already
+		 *    applied the changes to one of the pages, released the lock on
+		 *    it, and is now reading a second page.  Furthermore, the first
+		 *    page was already evicted from the buffer cache, and also from
+		 *    the last-written LSN cache, so that the per-relation or global
+		 *    last-written LSN was already updated. All the WAL redo functions
+		 *    hold the locks on pages that they modify, until all the changes
+		 *    have been applied (?), which would make that impossible.
+		 *    However, we skip the locking, if the page isn't currently in the
+		 *    page cache (see neon_redo_read_buffer_filter below).
+		 *
+		 * Even if one of the above cases were possible in theory, they
+		 * would also require the pages being modified by the redo function to
+		 * be immediately evicted from the page cache.
+		 *
+		 * So this probably does not happen in practice. But if it does, we
+		 * request the new version, including the changes from the record
+		 * being replayed. That seems like the correct behavior in any case.
+		 *
+		 * 3. Backend process reads a page with old last-written LSN
+		 *
+		 * Nothing special here. Read the old version.
+		 *
+		 * 4. Backend process reads a page with last_written_lsn == record being replayed
+		 *
+		 * This can happen, if the redo function has started to run, and saw
+		 * that the page isn't present in the page cache (see
+		 * neon_redo_read_buffer_filter below).  Normally, in a normal
+		 * Postgres server, the redo function would hold a lock on the page,
+		 * so we would get blocked waiting for the redo function to release the
+		 * lock. To emulate that, wait for the WAL replay of the record to
+		 * finish.
+		 */
+		/* Request the page at the end of the last fully replayed LSN. */
+		XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
+
+		if (last_written_lsn > replay_lsn)
+		{
+			/* GetCurrentReplayRecPtr was introduced in v15 */
+#if PG_VERSION_NUM >= 150000
+			Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
+#endif
+
+			/*
+			 * Cases 2 and 4. If this is a backend (case 4), the
+			 * neon_read_at_lsn() call later will wait for the WAL record to be
+			 * fully replayed.
+			 */
+			result.request_lsn = last_written_lsn;
+		}
+		else
+		{
+			/* cases 1 and 3 */
+			result.request_lsn = replay_lsn;
+		}
 		result.not_modified_since = last_written_lsn;
 		result.effective_request_lsn = result.request_lsn;
 		Assert(last_written_lsn <= result.request_lsn);
@@ -1797,7 +1958,9 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
+										T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
 	}
 	pfree(resp);
 	return exists;
@@ -2249,7 +2412,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	/*
 	 * Try to find prefetched page in the list of received pages.
 	 */
-  Retry:
+Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);

 	if (entry != NULL)
@@ -2335,7 +2498,9 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+										"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
+										T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
 	}

 	/* buffer was used, clean up for later reuse */
@@ -2606,7 +2771,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
+										T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
 	}
 	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);

@@ -2659,7 +2826,9 @@ neon_dbsize(Oid dbNode)
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
+										T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
 	}

 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
@@ -2998,7 +3167,9 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 			break;

 		default:
-			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
+										T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
 	}
 	pfree(resp);

@@ -3216,7 +3387,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	BufferTag	tag;
 	uint32		hash;
 	LWLock	   *partitionLock;
-	Buffer		buffer;
+	int			buf_id;
 	bool		no_redo_needed;

 	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
@@ -3254,20 +3425,20 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	else
 	{
 		/* Try to find the relevant buffer */
-		buffer = BufTableLookup(&tag, hash);
+		buf_id = BufTableLookup(&tag, hash);

-		no_redo_needed = buffer < 0;
+		no_redo_needed = buf_id < 0;
 	}
-	/* In both cases st lwlsn past this WAL record */
-	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);

 	/*
 	 * we don't have the buffer in memory, update lwLsn past this record, also
 	 * evict page from file cache
 	 */
 	if (no_redo_needed)
+	{
+		SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
 		lfc_evict(rinfo, forknum, blkno);
-
+	}

 	LWLockRelease(partitionLock);

--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -9,6 +9,7 @@ default = []
 testing = []

 [dependencies]
+ahash.workspace = true
 anyhow.workspace = true
 async-compression.workspace = true
 async-trait.workspace = true
@@ -24,6 +25,7 @@ camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
+crossbeam-deque.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
 framed-websockets.workspace = true
@@ -52,7 +54,6 @@ opentelemetry.workspace = true
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
-pbkdf2 = { workspace = true, features = ["simple", "std"] }
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
 pq_proto.workspace = true
@@ -106,6 +107,7 @@ workspace_hack.workspace = true
 camino-tempfile.workspace = true
 fallible-iterator.workspace = true
 tokio-tungstenite.workspace = true
+pbkdf2 = { workspace = true, features = ["simple", "std"] }
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -365,7 +365,10 @@ async fn authenticate_with_secret(
    config: &'static AuthenticationConfig,
 ) -> auth::Result<ComputeCredentials> {
    if let Some(password) = unauthenticated_password {
-        let auth_outcome = validate_password_and_exchange(&password, secret).await?;
+        let ep = EndpointIdInt::from(&info.endpoint);
+
+        let auth_outcome =
+            validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
        let keys = match auth_outcome {
            crate::sasl::Outcome::Success(key) => key,
            crate::sasl::Outcome::Failure(reason) => {
@@ -386,7 +389,7 @@ async fn authenticate_with_secret(
    // Currently, we use it for websocket connections (latency).
    if allow_cleartext {
        ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
-        return hacks::authenticate_cleartext(ctx, info, client, secret).await;
+        return hacks::authenticate_cleartext(ctx, info, client, secret, config).await;
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
@@ -554,7 +557,7 @@ mod tests {
        context::RequestMonitoring,
        proxy::NeonOptions,
        rate_limiter::{EndpointRateLimiter, RateBucketInfo},
-        scram::ServerSecret,
+        scram::{threadpool::ThreadPool, ServerSecret},
        stream::{PqStream, Stream},
    };

@@ -596,6 +599,7 @@ mod tests {
    }

    static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
+        thread_pool: ThreadPool::new(1),
        scram_protocol_timeout: std::time::Duration::from_secs(5),
        rate_limiter_enabled: true,
        rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -3,8 +3,10 @@ use super::{
 };
 use crate::{
    auth::{self, AuthFlow},
+    config::AuthenticationConfig,
    console::AuthSecret,
    context::RequestMonitoring,
+    intern::EndpointIdInt,
    sasl,
    stream::{self, Stream},
 };
@@ -20,6 +22,7 @@ pub async fn authenticate_cleartext(
    info: ComputeUserInfo,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    secret: AuthSecret,
+    config: &'static AuthenticationConfig,
 ) -> auth::Result<ComputeCredentials> {
    warn!("cleartext auth flow override is enabled, proceeding");
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -27,8 +30,14 @@ pub async fn authenticate_cleartext(
    // pause the timer while we communicate with the client
    let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);

+    let ep = EndpointIdInt::from(&info.endpoint);
+
    let auth_flow = AuthFlow::new(client)
-        .begin(auth::CleartextPassword(secret))
+        .begin(auth::CleartextPassword {
+            secret,
+            endpoint: ep,
+            pool: config.thread_pool.clone(),
+        })
        .await?;
    drop(paused);
    // cleartext auth is only allowed to the ws/http protocol.
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -5,12 +5,14 @@ use crate::{
    config::TlsServerEndPoint,
    console::AuthSecret,
    context::RequestMonitoring,
-    sasl, scram,
+    intern::EndpointIdInt,
+    sasl,
+    scram::{self, threadpool::ThreadPool},
    stream::{PqStream, Stream},
 };
 use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
 use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
-use std::io;
+use std::{io, sync::Arc};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;

@@ -53,7 +55,11 @@ impl AuthMethod for PasswordHack {

 /// Use clear-text password auth called `password` in docs
 /// <https://www.postgresql.org/docs/current/auth-password.html>
-pub struct CleartextPassword(pub AuthSecret);
+pub struct CleartextPassword {
+    pub pool: Arc<ThreadPool>,
+    pub endpoint: EndpointIdInt,
+    pub secret: AuthSecret,
+}

 impl AuthMethod for CleartextPassword {
    #[inline(always)]
@@ -126,7 +132,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
            .strip_suffix(&[0])
            .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;

-        let outcome = validate_password_and_exchange(password, self.state.0).await?;
+        let outcome = validate_password_and_exchange(
+            &self.state.pool,
+            self.state.endpoint,
+            password,
+            self.state.secret,
+        )
+        .await?;

        if let sasl::Outcome::Success(_) = &outcome {
            self.stream.write_message_noflush(&Be::AuthenticationOk)?;
@@ -181,6 +193,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
 }

 pub(crate) async fn validate_password_and_exchange(
+    pool: &ThreadPool,
+    endpoint: EndpointIdInt,
    password: &[u8],
    secret: AuthSecret,
 ) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
@@ -194,7 +208,7 @@ pub(crate) async fn validate_password_and_exchange(
        }
        // perform scram authentication as both client and server to validate the keys
        AuthSecret::Scram(scram_secret) => {
-            let outcome = crate::scram::exchange(&scram_secret, password).await?;
+            let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;

            let client_key = match outcome {
                sasl::Outcome::Success(client_key) => client_key,
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -9,6 +9,7 @@ use futures::future::Either;
 use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
 use proxy::context::RequestMonitoring;
+use proxy::metrics::{Metrics, ThreadPoolMetrics};
 use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled};
 use rustls::pki_types::PrivateKeyDer;
 use tokio::net::TcpListener;
@@ -65,6 +66,8 @@ async fn main() -> anyhow::Result<()> {
    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

+    Metrics::install(Arc::new(ThreadPoolMetrics::new(0)));
+
    let args = cli().get_matches();
    let destination: String = args.get_one::<String>("dest").unwrap().parse()?;

--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -27,6 +27,7 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient;
 use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use proxy::redis::elasticache;
 use proxy::redis::notifications;
+use proxy::scram::threadpool::ThreadPool;
 use proxy::serverless::cancel_set::CancelSet;
 use proxy::serverless::GlobalConnPoolOptions;
 use proxy::usage_metrics;
@@ -132,6 +133,9 @@ struct ProxyCliArgs {
    /// timeout for scram authentication protocol
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    scram_protocol_timeout: tokio::time::Duration,
+    /// size of the threadpool for password hashing
+    #[clap(long, default_value_t = 4)]
+    scram_thread_pool_size: u8,
    /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    require_client_ip: bool,
@@ -352,7 +356,7 @@ async fn main() -> anyhow::Result<()> {

    let cancel_map = CancelMap::default();

-    let redis_publisher = match &regional_redis_client {
+    let redis_publisher = match &redis_notifications_client {
        Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
            redis_publisher.clone(),
            args.region.clone(),
@@ -489,6 +493,9 @@ async fn main() -> anyhow::Result<()> {

 /// ProxyConfig is created at proxy startup, and lives forever.
 fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
+    let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
+    Metrics::install(thread_pool.metrics.clone());
+
    let tls_config = match (&args.tls_key, &args.tls_cert) {
        (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
            key_path,
@@ -624,6 +631,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
    };
    let authentication_config = AuthenticationConfig {
+        thread_pool,
        scram_protocol_timeout: args.scram_protocol_timeout,
        rate_limiter_enabled: args.auth_rate_limit_enabled,
        rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -2,6 +2,7 @@ use crate::{
    auth::{self, backend::AuthRateLimiter},
    console::locks::ApiLocks,
    rate_limiter::RateBucketInfo,
+    scram::threadpool::ThreadPool,
    serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
    Host,
 };
@@ -61,6 +62,7 @@ pub struct HttpConfig {
 }

 pub struct AuthenticationConfig {
+    pub thread_pool: Arc<ThreadPool>,
    pub scram_protocol_timeout: tokio::time::Duration,
    pub rate_limiter_enabled: bool,
    pub rate_limiter: AuthRateLimiter,
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -355,7 +355,7 @@ async fn upload_parquet(
        "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
    ))?;
    let cancel = CancellationToken::new();
-    backoff::retry(
+    let maybe_err = backoff::retry(
        || async {
            let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
            storage
@@ -372,7 +372,12 @@ async fn upload_parquet(
    .await
    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
    .and_then(|x| x)
-    .context("request_data_upload")?;
+    .context("request_data_upload")
+    .err();
+
+    if let Some(err) = maybe_err {
+        tracing::warn!(%id, %err, "failed to upload request data");
+    }

    Ok(buffer.writer())
 }
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,11 +1,11 @@
-use std::sync::OnceLock;
+use std::sync::{Arc, OnceLock};

 use lasso::ThreadedRodeo;
 use measured::{
-    label::StaticLabelSet,
+    label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet},
    metric::{histogram::Thresholds, name::MetricName},
-    Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
-    MetricGroup,
+    Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
+    LabelGroup, MetricGroup,
 };
 use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};

@@ -14,26 +14,36 @@ use tokio::time::{self, Instant};
 use crate::console::messages::ColdStartInfo;

 #[derive(MetricGroup)]
+#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
 pub struct Metrics {
    #[metric(namespace = "proxy")]
+    #[metric(init = ProxyMetrics::new(thread_pool))]
    pub proxy: ProxyMetrics,

    #[metric(namespace = "wake_compute_lock")]
    pub wake_compute_lock: ApiLockMetrics,
 }

+static SELF: OnceLock<Metrics> = OnceLock::new();
 impl Metrics {
+    pub fn install(thread_pool: Arc<ThreadPoolMetrics>) {
+        SELF.set(Metrics::new(thread_pool))
+            .ok()
+            .expect("proxy metrics must not be installed more than once");
+    }
+
    pub fn get() -> &'static Self {
-        static SELF: OnceLock<Metrics> = OnceLock::new();
-        SELF.get_or_init(|| Metrics {
-            proxy: ProxyMetrics::default(),
-            wake_compute_lock: ApiLockMetrics::new(),
-        })
+        #[cfg(test)]
+        return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0))));
+
+        #[cfg(not(test))]
+        SELF.get()
+            .expect("proxy metrics must be installed by the main() function")
    }
 }

 #[derive(MetricGroup)]
-#[metric(new())]
+#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
 pub struct ProxyMetrics {
    #[metric(flatten)]
    pub db_connections: CounterPairVec<NumDbConnectionsGauge>,
@@ -129,6 +139,10 @@ pub struct ProxyMetrics {

    #[metric(namespace = "connect_compute_lock")]
    pub connect_compute_lock: ApiLockMetrics,
+
+    #[metric(namespace = "scram_pool")]
+    #[metric(init = thread_pool)]
+    pub scram_pool: Arc<ThreadPoolMetrics>,
 }

 #[derive(MetricGroup)]
@@ -146,12 +160,6 @@ pub struct ApiLockMetrics {
    pub semaphore_acquire_seconds: Histogram<16>,
 }

-impl Default for ProxyMetrics {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 impl Default for ApiLockMetrics {
    fn default() -> Self {
        Self::new()
@@ -553,3 +561,52 @@ pub enum RedisEventsCount {
    PasswordUpdate,
    AllowedIpsUpdate,
 }
+
+pub struct ThreadPoolWorkers(usize);
+pub struct ThreadPoolWorkerId(pub usize);
+
+impl LabelValue for ThreadPoolWorkerId {
+    fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
+        v.write_int(self.0 as i64)
+    }
+}
+
+impl LabelGroup for ThreadPoolWorkerId {
+    fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
+        v.write_value(LabelName::from_str("worker"), self);
+    }
+}
+
+impl LabelSet for ThreadPoolWorkers {
+    type Value<'a> = ThreadPoolWorkerId;
+
+    fn dynamic_cardinality(&self) -> Option<usize> {
+        Some(self.0)
+    }
+
+    fn encode(&self, value: Self::Value<'_>) -> Option<usize> {
+        (value.0 < self.0).then_some(value.0)
+    }
+
+    fn decode(&self, value: usize) -> Self::Value<'_> {
+        ThreadPoolWorkerId(value)
+    }
+}
+
+impl FixedCardinalitySet for ThreadPoolWorkers {
+    fn cardinality(&self) -> usize {
+        self.0
+    }
+}
+
+#[derive(MetricGroup)]
+#[metric(new(workers: usize))]
+pub struct ThreadPoolMetrics {
+    pub injector_queue_depth: Gauge,
+    #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))]
+    pub worker_queue_depth: GaugeVec<ThreadPoolWorkers>,
+    #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
+    pub worker_task_turns_total: CounterVec<ThreadPoolWorkers>,
+    #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
+    pub worker_task_skips_total: CounterVec<ThreadPoolWorkers>,
+}
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -6,11 +6,14 @@
 //! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/backend/libpq/auth-scram.c>
 //! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/interfaces/libpq/fe-auth-scram.c>

+mod countmin;
 mod exchange;
 mod key;
 mod messages;
+mod pbkdf2;
 mod secret;
 mod signature;
+pub mod threadpool;

 pub use exchange::{exchange, Exchange};
 pub use key::ScramKey;
@@ -56,9 +59,13 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {

 #[cfg(test)]
 mod tests {
-    use crate::sasl::{Mechanism, Step};
+    use crate::{
+        intern::EndpointIdInt,
+        sasl::{Mechanism, Step},
+        EndpointId,
+    };

-    use super::{Exchange, ServerSecret};
+    use super::{threadpool::ThreadPool, Exchange, ServerSecret};

    #[test]
    fn snapshot() {
@@ -112,8 +119,13 @@ mod tests {
    }

    async fn run_round_trip_test(server_password: &str, client_password: &str) {
+        let pool = ThreadPool::new(1);
+
+        let ep = EndpointId::from("foo");
+        let ep = EndpointIdInt::from(ep);
+
        let scram_secret = ServerSecret::build(server_password).await.unwrap();
-        let outcome = super::exchange(&scram_secret, client_password.as_bytes())
+        let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes())
            .await
            .unwrap();

--- a/proxy/src/scram/countmin.rs
+++ b/proxy/src/scram/countmin.rs
@@ -0,0 +1,186 @@
+use std::hash::Hash;
+
+/// estimator of hash jobs per second.
+/// <https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch>
+pub struct CountMinSketch {
+    // one for each depth
+    hashers: Vec<ahash::RandomState>,
+    width: usize,
+    depth: usize,
+    // buckets, width*depth
+    buckets: Vec<u32>,
+}
+
+impl CountMinSketch {
+    /// Given parameters (ε, δ),
+    ///   set width = ceil(e/ε)
+    ///   set depth = ceil(ln(1/δ))
+    ///
+    /// guarantees:
+    /// actual <= estimate
+    /// estimate <= actual + ε * N with probability 1 - δ
+    /// where N is the cardinality of the stream
+    pub fn with_params(epsilon: f64, delta: f64) -> Self {
+        CountMinSketch::new(
+            (std::f64::consts::E / epsilon).ceil() as usize,
+            (1.0_f64 / delta).ln().ceil() as usize,
+        )
+    }
+
+    /// Builds a zeroed sketch with `depth` rows of `width` counters each.
+    ///
+    /// Both dimensions are raised to at least 1. `with_params` can compute 0 for
+    /// either (e.g. `delta >= 1.0`); a zero width would make `inc_and_return`
+    /// divide by zero and a zero depth would make it return `u32::MAX` for every key.
+    fn new(width: usize, depth: usize) -> Self {
+        let width = width.max(1);
+        let depth = depth.max(1);
+
+        Self {
+            #[cfg(test)]
+            hashers: (0..depth)
+                .map(|i| {
+                    // digits of pi for good randomness
+                    ahash::RandomState::with_seeds(
+                        314159265358979323,
+                        84626433832795028,
+                        84197169399375105,
+                        82097494459230781 + i as u64,
+                    )
+                })
+                .collect(),
+            #[cfg(not(test))]
+            hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(),
+            width,
+            depth,
+            buckets: vec![0; width * depth],
+        }
+    }
+
+    /// Adds `x` to the counter that `t` hashes to in every row (saturating at
+    /// `u32::MAX`) and returns the smallest of those counters after the update.
+    ///
+    /// Passing `x = 0` reads the current estimate for `t` without changing it.
+    pub fn inc_and_return<T: Hash>(&mut self, t: &T, x: u32) -> u32 {
+        let mut min = u32::MAX;
+        for row in 0..self.depth {
+            let col = (self.hashers[row].hash_one(t) as usize) % self.width;
+
+            let row = &mut self.buckets[row * self.width..][..self.width];
+            row[col] = row[col].saturating_add(x);
+            min = std::cmp::min(min, row[col]);
+        }
+        min
+    }
+
+    /// Sets every counter back to zero.
+    pub fn reset(&mut self) {
+        self.buckets.clear();
+        self.buckets.resize(self.width * self.depth, 0);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};
+
+    use super::CountMinSketch;
+
+    fn eval_precision(n: usize, p: f64, q: f64) -> usize {
+        // fixed value of phi for consistent test
+        let mut rng = StdRng::seed_from_u64(16180339887498948482);
+
+        #[allow(non_snake_case)]
+        let mut N = 0;
+
+        let mut ids = vec![];
+
+        for _ in 0..n {
+            // amount added by each insert operation
+            let n = rng.gen_range(1..100);
+            // number of insert operations
+            let m = rng.gen_range(1..4096);
+
+            let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid();
+            ids.push((id, n, m));
+
+            // N = sum(actual)
+            N += n * m;
+        }
+
+        // q% of counts will be within p of the actual value
+        let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
+
+        dbg!(sketch.buckets.len());
+
+        // insert a bunch of entries in a random order
+        let mut ids2 = ids.clone();
+        while !ids2.is_empty() {
+            ids2.shuffle(&mut rng);
+
+            let mut i = 0;
+            while i < ids2.len() {
+                sketch.inc_and_return(&ids2[i].0, ids2[i].1);
+                ids2[i].2 -= 1;
+                if ids2[i].2 == 0 {
+                    ids2.remove(i);
+                } else {
+                    i += 1;
+                }
+            }
+        }
+
+        let mut within_p = 0;
+        for (id, n, m) in ids {
+            let actual = n * m;
+            let estimate = sketch.inc_and_return(&id, 0);
+
+            // This estimate has the guarantee that actual <= estimate
+            assert!(actual <= estimate);
+
+            // This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ.
+            // ε = p / N, δ = 1 - q;
+            // therefore, estimate <= actual + p with probability q.
+            if estimate as f64 <= actual as f64 + p {
+                within_p += 1;
+            }
+        }
+        within_p
+    }
+
+    #[test]
+    fn precision() {
+        assert_eq!(eval_precision(100, 100.0, 0.99), 100);
+        assert_eq!(eval_precision(1000, 100.0, 0.99), 1000);
+        assert_eq!(eval_precision(100, 4096.0, 0.99), 100);
+        assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000);
+
+        // seems to be more precise than the literature indicates?
+        // probably numbers are too small to truly represent the probabilities.
+        assert_eq!(eval_precision(100, 4096.0, 0.90), 100);
+        assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000);
+        assert_eq!(eval_precision(100, 4096.0, 0.1), 98);
+        assert_eq!(eval_precision(1000, 4096.0, 0.1), 991);
+    }
+
+    // returns memory usage in bytes, and the time complexity per insert.
+    fn eval_cost(p: f64, q: f64) -> (usize, usize) {
+        #[allow(non_snake_case)]
+        // N = sum(actual)
+        // Let's assume 1021 samples, all of 4096
+        let N = 1021 * 4096;
+        let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
+
+        let memory = std::mem::size_of::<u32>() * sketch.buckets.len();
+        let time = sketch.depth;
+        (memory, time)
+    }
+
+    #[test]
+    fn memory_usage() {
+        assert_eq!(eval_cost(100.0, 0.99), (2273580, 5));
+        assert_eq!(eval_cost(4096.0, 0.99), (55520, 5));
+        assert_eq!(eval_cost(4096.0, 0.90), (33312, 3));
+        assert_eq!(eval_cost(4096.0, 0.1), (11104, 1));
+    }
+}
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -4,15 +4,17 @@ use std::convert::Infallible;

 use hmac::{Hmac, Mac};
 use sha2::Sha256;
-use tokio::task::yield_now;

 use super::messages::{
    ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
 };
+use super::pbkdf2::Pbkdf2;
 use super::secret::ServerSecret;
 use super::signature::SignatureBuilder;
+use super::threadpool::ThreadPool;
 use super::ScramKey;
 use crate::config;
+use crate::intern::EndpointIdInt;
 use crate::sasl::{self, ChannelBinding, Error as SaslError};

 /// The only channel binding mode we currently support.
@@ -74,37 +76,18 @@ impl<'a> Exchange<'a> {
    }
 }

-// copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
-async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
-    let hmac = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
-    let mut prev = hmac
-        .clone()
-        .chain_update(salt)
-        .chain_update(1u32.to_be_bytes())
-        .finalize()
-        .into_bytes();
-
-    let mut hi = prev;
-
-    for i in 1..iterations {
-        prev = hmac.clone().chain_update(prev).finalize().into_bytes();
-
-        for (hi, prev) in hi.iter_mut().zip(prev) {
-            *hi ^= prev;
-        }
-        // yield every ~250us
-        // hopefully reduces tail latencies
-        if i % 1024 == 0 {
-            yield_now().await
-        }
-    }
-
-    hi.into()
-}
-
 // copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
-async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
-    let salted_password = pbkdf2(password, salt, iterations).await;
+async fn derive_client_key(
+    pool: &ThreadPool,
+    endpoint: EndpointIdInt,
+    password: &[u8],
+    salt: &[u8],
+    iterations: u32,
+) -> ScramKey {
+    let salted_password = pool
+        .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations))
+        .await
+        .expect("job should not be cancelled");

    let make_key = |name| {
        let key = Hmac::<Sha256>::new_from_slice(&salted_password)
@@ -119,11 +102,13 @@ async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> Scr
 }

 pub async fn exchange(
+    pool: &ThreadPool,
+    endpoint: EndpointIdInt,
    secret: &ServerSecret,
    password: &[u8],
 ) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
    let salt = base64::decode(&secret.salt_base64)?;
-    let client_key = derive_client_key(password, &salt, secret.iterations).await;
+    let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;

    if secret.is_password_invalid(&client_key).into() {
        Ok(sasl::Outcome::Failure("password doesn't match"))
--- a/proxy/src/scram/pbkdf2.rs
+++ b/proxy/src/scram/pbkdf2.rs
@@ -0,0 +1,121 @@
+use hmac::{
+    digest::{consts::U32, generic_array::GenericArray},
+    Hmac, Mac,
+};
+use sha2::Sha256;
+
+/// Maximum number of HMAC rounds performed by one call to `Pbkdf2::turn`.
+const ROUNDS_PER_TURN: u32 = 4096;
+
+/// A PBKDF2-HMAC-SHA256 computation (single 32-byte output block) that is
+/// advanced in bounded steps instead of running to completion in one call.
+pub struct Pbkdf2 {
+    /// HMAC keyed with the password; cloned afresh for every round
+    hmac: Hmac<Sha256>,
+    /// output of the most recent round
+    prev: GenericArray<u8, U32>,
+    /// XOR of the outputs of all rounds performed so far
+    hi: GenericArray<u8, U32>,
+    /// rounds still left to perform
+    iterations: u32,
+}
+
+// inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
+impl Pbkdf2 {
+    /// Starts a new computation by performing the first round immediately.
+    ///
+    /// `str` is the password (used as the HMAC key) and `salt` the salt. An
+    /// `iterations` value of 0 is treated like 1: the first round is always
+    /// performed and the count of remaining rounds saturates at zero.
+    pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
+        let hmac =
+            Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
+
+        let prev = hmac
+            .clone()
+            .chain_update(salt)
+            .chain_update(1u32.to_be_bytes())
+            .finalize()
+            .into_bytes();
+
+        Self {
+            hmac,
+            // one consumed for the hash above.
+            // saturate so that `iterations == 0` cannot underflow
+            // (panic in debug builds, ~u32::MAX rounds in release builds).
+            iterations: iterations.saturating_sub(1),
+            hi: prev,
+            prev,
+        }
+    }
+
+    /// Number of rounds the next call to `turn` will perform.
+    pub fn cost(&self) -> u32 {
+        self.iterations.min(ROUNDS_PER_TURN)
+    }
+
+    /// Performs up to `ROUNDS_PER_TURN` further rounds.
+    ///
+    /// Returns `Poll::Ready` with the 32-byte result once no rounds remain,
+    /// otherwise `Poll::Pending`, in which case `turn` must be called again.
+    pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> {
+        let Self {
+            hmac,
+            prev,
+            hi,
+            iterations,
+        } = self;
+
+        // only do 4096 iterations per turn before sharing the thread for fairness
+        let n = (*iterations).min(ROUNDS_PER_TURN);
+        for _ in 0..n {
+            *prev = hmac.clone().chain_update(*prev).finalize().into_bytes();
+
+            for (hi, prev) in hi.iter_mut().zip(*prev) {
+                *hi ^= prev;
+            }
+        }
+
+        *iterations -= n;
+        if *iterations == 0 {
+            std::task::Poll::Ready((*hi).into())
+        } else {
+            std::task::Poll::Pending
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Pbkdf2;
+    use pbkdf2::pbkdf2_hmac_array;
+    use sha2::Sha256;
+
+    #[test]
+    fn works() {
+        let salt = b"sodium chloride";
+        let pass = b"Ne0n_!5_50_C007";
+
+        let mut job = Pbkdf2::start(pass, salt, 600000);
+        let hash = loop {
+            let std::task::Poll::Ready(hash) = job.turn() else {
+                continue;
+            };
+            break hash;
+        };
+
+        let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 600000);
+        assert_eq!(hash, expected)
+    }
+
+    #[test]
+    fn zero_iterations_is_treated_as_one() {
+        let salt = b"sodium chloride";
+        let pass = b"Ne0n_!5_50_C007";
+
+        // must neither panic nor wrap around to ~u32::MAX rounds
+        let hash = Pbkdf2::start(pass, salt, 0).turn();
+        let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 1);
+        assert_eq!(hash, std::task::Poll::Ready(expected));
+    }
+}
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -0,0 +1,360 @@
+//! Custom threadpool implementation for password hashing.
+//!
+//! Requirements:
+//! 1. Fairness per endpoint.
+//! 2. Yield support for high iteration counts.
+
+use std::sync::{
+    atomic::{AtomicU64, Ordering},
+    Arc,
+};
+
+use crossbeam_deque::{Injector, Stealer, Worker};
+use itertools::Itertools;
+use parking_lot::{Condvar, Mutex};
+use rand::Rng;
+use rand::{rngs::SmallRng, SeedableRng};
+use tokio::sync::oneshot;
+
+use crate::{
+    intern::EndpointIdInt,
+    metrics::{ThreadPoolMetrics, ThreadPoolWorkerId},
+    scram::countmin::CountMinSketch,
+};
+
+use super::pbkdf2::Pbkdf2;
+
+/// Pool of dedicated OS threads that run [`Pbkdf2`] jobs.
+pub struct ThreadPool {
+    /// shared queue that `spawn_job` pushes new jobs onto
+    queue: Injector<JobSpec>,
+    /// one stealer per worker, indexed by worker id
+    stealers: Vec<Stealer<JobSpec>>,
+    /// per-worker condvar and park state, indexed by worker id
+    parkers: Vec<(Condvar, Mutex<ThreadState>)>,
+    /// bitpacked representation.
+    /// lower 8 bits = number of sleeping threads
+    /// next 8 bits = number of idle threads (searching for work)
+    counters: AtomicU64,
+
+    pub metrics: Arc<ThreadPoolMetrics>,
+}
+
+/// Whether a worker is blocked on its condvar (`Parked`) or running (`Active`).
+#[derive(PartialEq)]
+enum ThreadState {
+    Parked,
+    Active,
+}
+
+impl ThreadPool {
+    /// Creates the pool and spawns `n_workers` worker threads.
+    ///
+    /// The threads are detached and each holds an `Arc` to the pool. With
+    /// `n_workers == 0` no thread is spawned, so queued jobs never run.
+    pub fn new(n_workers: u8) -> Arc<Self> {
+        let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec();
+        let stealers = workers.iter().map(|w| w.stealer()).collect_vec();
+
+        let parkers = (0..n_workers)
+            .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active)))
+            .collect_vec();
+
+        let pool = Arc::new(Self {
+            queue: Injector::new(),
+            stealers,
+            parkers,
+            // threads start searching for work
+            counters: AtomicU64::new((n_workers as u64) << 8),
+            metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
+        });
+
+        for (i, worker) in workers.into_iter().enumerate() {
+            let pool = Arc::clone(&pool);
+            std::thread::spawn(move || thread_rt(pool, worker, i));
+        }
+
+        pool
+    }
+
+    /// Queues `pbkdf2` on behalf of `endpoint` and returns a receiver for the result.
+    ///
+    /// If the receiver is dropped, the job is discarded the next time a worker
+    /// picks it up. A sleeping worker is woken when the queue was already
+    /// non-empty or no worker is currently searching for work.
+    pub fn spawn_job(
+        &self,
+        endpoint: EndpointIdInt,
+        pbkdf2: Pbkdf2,
+    ) -> oneshot::Receiver<[u8; 32]> {
+        let (tx, rx) = oneshot::channel();
+
+        let queue_was_empty = self.queue.is_empty();
+
+        self.metrics.injector_queue_depth.inc();
+        self.queue.push(JobSpec {
+            response: tx,
+            pbkdf2,
+            endpoint,
+        });
+
+        // inspired from <https://github.com/rayon-rs/rayon/blob/3e3962cb8f7b50773bcc360b48a7a674a53a2c77/rayon-core/src/sleep/mod.rs#L242>
+        let counts = self.counters.load(Ordering::SeqCst);
+        let num_awake_but_idle = (counts >> 8) & 0xff;
+        let num_sleepers = counts & 0xff;
+
+        // If the queue is non-empty, then we always wake up a worker
+        // -- clearly the existing idle jobs aren't enough. Otherwise,
+        // check to see if we have enough idle workers.
+        if !queue_was_empty || num_awake_but_idle == 0 {
+            let num_to_wake = Ord::min(1, num_sleepers);
+            self.wake_any_threads(num_to_wake);
+        }
+
+        rx
+    }
+
+    /// Wakes up to `num_to_wake` parked workers, scanning them in index order.
+    #[cold]
+    fn wake_any_threads(&self, mut num_to_wake: u64) {
+        if num_to_wake > 0 {
+            for i in 0..self.parkers.len() {
+                if self.wake_specific_thread(i) {
+                    num_to_wake -= 1;
+                    if num_to_wake == 0 {
+                        return;
+                    }
+                }
+            }
+        }
+    }
+
+    /// Wakes worker `index` if it is parked and takes it off the sleeper count.
+    /// Returns whether a wake-up was issued.
+    fn wake_specific_thread(&self, index: usize) -> bool {
+        let (condvar, lock) = &self.parkers[index];
+
+        let mut state = lock.lock();
+        if *state == ThreadState::Parked {
+            condvar.notify_one();
+
+            // When the thread went to sleep, it will have incremented
+            // this value. When we wake it, its our job to decrement
+            // it. We could have the thread do it, but that would
+            // introduce a delay between when the thread was
+            // *notified* and when this counter was decremented. That
+            // might mislead people with new work into thinking that
+            // there are sleeping threads that they should try to
+            // wake, when in fact there is nothing left for them to
+            // do.
+            self.counters.fetch_sub(1, Ordering::SeqCst);
+            *state = ThreadState::Active;
+
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Looks for work on behalf of worker `skip`: first in the shared queue,
+    /// then in the other workers' queues, starting from a random victim.
+    ///
+    /// The thread counts as idle while searching. It is still counted as idle
+    /// when `None` is returned; the caller accounts for that when it parks.
+    fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker<JobSpec>) -> Option<JobSpec> {
+        // announce thread as idle
+        self.counters.fetch_add(256, Ordering::SeqCst);
+
+        // try steal from the global queue
+        loop {
+            match self.queue.steal_batch_and_pop(worker) {
+                crossbeam_deque::Steal::Success(job) => {
+                    self.metrics
+                        .injector_queue_depth
+                        .set(self.queue.len() as i64);
+                    // no longer idle
+                    self.counters.fetch_sub(256, Ordering::SeqCst);
+                    return Some(job);
+                }
+                crossbeam_deque::Steal::Retry => continue,
+                crossbeam_deque::Steal::Empty => break,
+            }
+        }
+
+        // try steal from our neighbours
+        loop {
+            let mut retry = false;
+            let start = rng.gen_range(0..self.stealers.len());
+            let job = (start..self.stealers.len())
+                .chain(0..start)
+                .filter(|i| *i != skip)
+                .find_map(
+                    |victim| match self.stealers[victim].steal_batch_and_pop(worker) {
+                        crossbeam_deque::Steal::Success(job) => Some(job),
+                        crossbeam_deque::Steal::Empty => None,
+                        crossbeam_deque::Steal::Retry => {
+                            retry = true;
+                            None
+                        }
+                    },
+                );
+            if job.is_some() {
+                // no longer idle
+                self.counters.fetch_sub(256, Ordering::SeqCst);
+                return job;
+            }
+            if !retry {
+                return None;
+            }
+        }
+    }
+}
+
+/// Main loop of worker thread `index`; it loops forever.
+///
+/// The worker parks until woken, then runs jobs from its local queue (stealing
+/// when that is empty) one turn at a time, and parks again once no work is
+/// found anywhere.
+fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
+    /// interval when we should steal from the global queue
+    /// so that tail latencies are managed appropriately
+    const STEAL_INTERVAL: usize = 61;
+
+    /// How often to reset the sketch values
+    const SKETCH_RESET_INTERVAL: usize = 1021;
+
+    let mut rng = SmallRng::from_entropy();
+
+    // used to determine whether we should temporarily skip tasks for fairness.
+    // 99% of estimates will overcount by no more than 4096 samples
+    let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01);
+
+    let (condvar, lock) = &pool.parkers[index];
+
+    'wait: loop {
+        // wait for notification of work
+        {
+            let mut lock = lock.lock();
+
+            // queue is empty
+            pool.metrics
+                .worker_queue_depth
+                .set(ThreadPoolWorkerId(index), 0);
+
+            // subtract 1 from idle count, add 1 to sleeping count.
+            pool.counters.fetch_sub(255, Ordering::SeqCst);
+            *lock = ThreadState::Parked;
+
+            // Re-check the injector now that this thread is published as sleeping.
+            // `spawn_job` skips the wake-up while it still sees idle (searching)
+            // threads, so a job pushed after our last failed steal (or before
+            // this thread parked for the first time) would otherwise stay queued
+            // until some later `spawn_job` call wakes a worker.
+            if pool.queue.is_empty() {
+                condvar.wait(&mut lock);
+            } else {
+                // undo the sleeper registration, exactly as a waker would have done
+                pool.counters.fetch_sub(1, Ordering::SeqCst);
+                *lock = ThreadState::Active;
+            }
+        }
+
+        for i in 0.. {
+            let mut job = match worker
+                .pop()
+                .or_else(|| pool.steal(&mut rng, index, &worker))
+            {
+                Some(job) => job,
+                None => continue 'wait,
+            };
+
+            pool.metrics
+                .worker_queue_depth
+                .set(ThreadPoolWorkerId(index), worker.len() as i64);
+
+            // receiver is closed, cancel the task
+            if !job.response.is_closed() {
+                let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost());
+
+                const P: f64 = 2000.0;
+                // probability decreases as rate increases.
+                // lower probability, higher chance of being skipped
+                //
+                // estimates (rate in terms of 4096 rounds):
+                // rate = 0    => probability = 100%
+                // rate = 10   => probability = 71.3%
+                // rate = 50   => probability = 62.1%
+                // rate = 500  => probability = 52.3%
+                // rate = 1021 => probability = 49.8%
+                //
+                // My expectation is that the pool queue will only begin backing up at ~1000rps
+                // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above
+                // are in requests per second.
+                let probability = P.ln() / (P + rate as f64).ln();
+                if pool.queue.len() > 32 || rng.gen_bool(probability) {
+                    pool.metrics
+                        .worker_task_turns_total
+                        .inc(ThreadPoolWorkerId(index));
+
+                    match job.pbkdf2.turn() {
+                        std::task::Poll::Ready(result) => {
+                            let _ = job.response.send(result);
+                        }
+                        std::task::Poll::Pending => worker.push(job),
+                    }
+                } else {
+                    pool.metrics
+                        .worker_task_skips_total
+                        .inc(ThreadPoolWorkerId(index));
+
+                    // skip for now
+                    worker.push(job)
+                }
+            }
+
+            // if we get stuck with a few long lived jobs in the queue
+            // it's better to try and steal from the queue too for fairness
+            if i % STEAL_INTERVAL == 0 {
+                let _ = pool.queue.steal_batch(&worker);
+            }
+
+            if i % SKETCH_RESET_INTERVAL == 0 {
+                sketch.reset();
+            }
+        }
+    }
+}
+
+/// A queued hashing job together with the channel its result is sent on.
+struct JobSpec {
+    response: oneshot::Sender<[u8; 32]>,
+    pbkdf2: Pbkdf2,
+    endpoint: EndpointIdInt,
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::EndpointId;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn hash_is_correct() {
+        let pool = ThreadPool::new(1);
+
+        let ep = EndpointId::from("foo");
+        let ep = EndpointIdInt::from(ep);
+
+        let salt = [0x55; 32];
+        let actual = pool
+            .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096))
+            .await
+            .unwrap();
+
+        let expected = [
+            10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,
+            178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140,
+        ];
+        assert_eq!(actual, expected)
+    }
+}
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -15,6 +15,7 @@ use crate::{
    },
    context::RequestMonitoring,
    error::{ErrorKind, ReportableError, UserFacingError},
+    intern::EndpointIdInt,
    proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
    rate_limiter::EndpointRateLimiter,
    Host,
@@ -66,8 +67,14 @@ impl PoolingBackend {
                return Err(AuthError::auth_failed(&*user_info.user));
            }
        };
-        let auth_outcome =
-            crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
+        let ep = EndpointIdInt::from(&conn_info.user_info.endpoint);
+        let auth_outcome = crate::auth::validate_password_and_exchange(
+            &config.thread_pool,
+            ep,
+            &conn_info.password,
+            secret,
+        )
+        .await?;
        let res = match auth_outcome {
            crate::sasl::Outcome::Success(key) => {
                info!("user successfully authenticated");
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -51,9 +51,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
    ) -> Poll<io::Result<usize>> {
        let this = self.project();
        let mut stream = this.stream;
-        this.send.put(buf);

        ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
+
+        this.send.put(buf);
        match stream.as_mut().start_send(Frame::binary(this.send.split())) {
            Ok(()) => Poll::Ready(Ok(buf.len())),
            Err(e) => Poll::Ready(Err(io_error(e))),
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -2,7 +2,7 @@ use std::collections::{HashMap, HashSet};

 use anyhow::Context;
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
-use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver_api::shard::ShardIndex;
 use tracing::{error, info, warn};
 use utils::generation::Generation;
@@ -208,7 +208,7 @@ impl TenantObjectListing {
        &mut self,
        timeline_id: TimelineId,
        layer_file: &LayerName,
-        metadata: &IndexLayerMetadata,
+        metadata: &LayerFileMetadata,
    ) -> bool {
        let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
            return false;
--- a/s3_scrubber/src/tenant_snapshot.rs
+++ b/s3_scrubber/src/tenant_snapshot.rs
@@ -11,7 +11,7 @@ use async_stream::stream;
 use aws_sdk_s3::Client;
 use camino::Utf8PathBuf;
 use futures::{StreamExt, TryStreamExt};
-use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use pageserver_api::shard::TenantShardId;
@@ -49,8 +49,8 @@ impl SnapshotDownloader {
        &self,
        ttid: TenantShardTimelineId,
        layer_name: LayerName,
-        layer_metadata: IndexLayerMetadata,
-    ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> {
+        layer_metadata: LayerFileMetadata,
+    ) -> anyhow::Result<(LayerName, LayerFileMetadata)> {
        // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format.  They use
        // different layer names (remote-style has the generation suffix)
        let local_path = self.output_path.join(format!(
@@ -110,7 +110,7 @@ impl SnapshotDownloader {
    async fn download_layers(
        &self,
        ttid: TenantShardTimelineId,
-        layers: Vec<(LayerName, IndexLayerMetadata)>,
+        layers: Vec<(LayerName, LayerFileMetadata)>,
    ) -> anyhow::Result<()> {
        let layer_count = layers.len();
        tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count);
@@ -161,10 +161,7 @@ impl SnapshotDownloader {
        ttid: TenantShardTimelineId,
        index_part: Box<IndexPart>,
        index_part_generation: Generation,
-        ancestor_layers: &mut HashMap<
-            TenantShardTimelineId,
-            HashMap<LayerName, IndexLayerMetadata>,
-        >,
+        ancestor_layers: &mut HashMap<TenantShardTimelineId, HashMap<LayerName, LayerFileMetadata>>,
    ) -> anyhow::Result<()> {
        let index_bytes = serde_json::to_string(&index_part).unwrap();

@@ -234,7 +231,7 @@ impl SnapshotDownloader {
        // happen if this tenant has been split at some point)
        let mut ancestor_layers: HashMap<
            TenantShardTimelineId,
-            HashMap<LayerName, IndexLayerMetadata>,
+            HashMap<LayerName, LayerFileMetadata>,
        > = Default::default();

        for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -37,17 +37,11 @@ use crate::{
 #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
 pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
    info!("started");
-    let mut cancellation_rx = match tli.get_cancellation_rx() {
-        Ok(rx) => rx,
-        Err(_) => {
-            info!("timeline canceled during task start");
-            return;
-        }
-    };

+    let cancel = tli.cancel.clone();
    select! {
        _ = recovery_main_loop(tli, conf) => { unreachable!() }
-        _ = cancellation_rx.changed() => {
+        _ = cancel.cancelled() => {
            info!("stopped");
        }
    }
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -827,10 +827,10 @@ where

    /// Persist control file if there is something to save and enough time
    /// passed after the last save.
-    pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<()> {
+    pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<bool> {
        const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
        if self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
-            return Ok(());
+            return Ok(false);
        }
        let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
            || self.state.inmem.backup_lsn > self.state.backup_lsn
@@ -840,7 +840,7 @@ where
            self.state.flush().await?;
            trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
        }
-        Ok(())
+        Ok(need_persist)
    }

    /// Handle request to append WAL.
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -6,6 +6,7 @@ use camino::Utf8PathBuf;
 use postgres_ffi::XLogSegNo;
 use serde::{Deserialize, Serialize};
 use tokio::fs;
+use tokio_util::sync::CancellationToken;

 use std::cmp::max;
 use std::ops::{Deref, DerefMut};
@@ -103,11 +104,16 @@ pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>;
 pub struct WriteGuardSharedState<'a> {
    tli: Arc<Timeline>,
    guard: RwLockWriteGuard<'a, SharedState>,
+    skip_update: bool,
 }

 impl<'a> WriteGuardSharedState<'a> {
    fn new(tli: Arc<Timeline>, guard: RwLockWriteGuard<'a, SharedState>) -> Self {
-        WriteGuardSharedState { tli, guard }
+        WriteGuardSharedState {
+            tli,
+            guard,
+            skip_update: false,
+        }
    }
 }

@@ -148,10 +154,12 @@ impl<'a> Drop for WriteGuardSharedState<'a> {
            }
        });

-        // send notification about shared state update
-        self.tli.shared_state_version_tx.send_modify(|old| {
-            *old += 1;
-        });
+        if !self.skip_update {
+            // send notification about shared state update
+            self.tli.shared_state_version_tx.send_modify(|old| {
+                *old += 1;
+            });
+        }
    }
 }

@@ -342,12 +350,8 @@ pub struct Timeline {
    walsenders: Arc<WalSenders>,
    walreceivers: Arc<WalReceivers>,

-    /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
-    cancellation_tx: watch::Sender<bool>,
-
-    /// Timeline should not be used after cancellation. Background tasks should
-    /// monitor this channel and stop eventually after receiving `true` from this channel.
-    cancellation_rx: watch::Receiver<bool>,
+    /// Delete/cancel will trigger this; background tasks should drop out as soon as it fires.
+    pub(crate) cancel: CancellationToken,

    /// Directory where timeline state is stored.
    pub timeline_dir: Utf8PathBuf,
@@ -376,7 +380,6 @@ impl Timeline {
            shared_state.sk.flush_lsn(),
        )));
        let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
-        let (cancellation_tx, cancellation_rx) = watch::channel(false);

        let walreceivers = WalReceivers::new();
        Ok(Timeline {
@@ -390,8 +393,7 @@ impl Timeline {
            mutex: RwLock::new(shared_state),
            walsenders: WalSenders::new(walreceivers.clone()),
            walreceivers,
-            cancellation_rx,
-            cancellation_tx,
+            cancel: CancellationToken::default(),
            timeline_dir: conf.timeline_dir(&ttid),
            walsenders_keep_horizon: conf.walsenders_keep_horizon,
            broker_active: AtomicBool::new(false),
@@ -411,7 +413,6 @@ impl Timeline {
        let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
            watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
        let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
-        let (cancellation_tx, cancellation_rx) = watch::channel(false);

        let state =
            TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);
@@ -428,8 +429,7 @@ impl Timeline {
            mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?),
            walsenders: WalSenders::new(walreceivers.clone()),
            walreceivers,
-            cancellation_rx,
-            cancellation_tx,
+            cancel: CancellationToken::default(),
            timeline_dir: conf.timeline_dir(&ttid),
            walsenders_keep_horizon: conf.walsenders_keep_horizon,
            broker_active: AtomicBool::new(false),
@@ -535,7 +535,7 @@ impl Timeline {
    /// eventually after receiving cancellation signal.
    fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) {
        info!("timeline {} is cancelled", self.ttid);
-        let _ = self.cancellation_tx.send(true);
+        self.cancel.cancel();
        // Close associated FDs. Nobody will be able to touch timeline data once
        // it is cancelled, so WAL storage won't be opened again.
        shared_state.sk.wal_store.close();
@@ -543,17 +543,7 @@ impl Timeline {

    /// Returns if timeline is cancelled.
    pub fn is_cancelled(&self) -> bool {
-        *self.cancellation_rx.borrow()
-    }
-
-    /// Returns watch channel which gets value when timeline is cancelled. It is
-    /// guaranteed to have not cancelled value observed (errors otherwise).
-    pub fn get_cancellation_rx(&self) -> Result<watch::Receiver<bool>> {
-        let rx = self.cancellation_rx.clone();
-        if *rx.borrow() {
-            bail!(TimelineError::Cancelled(self.ttid));
-        }
-        Ok(rx)
+        self.cancel.is_cancelled()
    }

    /// Take a writing mutual exclusive lock on timeline shared_state.
@@ -819,7 +809,11 @@ impl Timeline {

        // update last_removed_segno
        let mut shared_state = self.write_shared_state().await;
-        shared_state.last_removed_segno = horizon_segno;
+        if shared_state.last_removed_segno != horizon_segno {
+            shared_state.last_removed_segno = horizon_segno;
+        } else {
+            shared_state.skip_update = true;
+        }
        Ok(())
    }

@@ -828,11 +822,10 @@ impl Timeline {
    /// to date so that storage nodes restart doesn't cause many pageserver ->
    /// safekeeper reconnections.
    pub async fn maybe_persist_control_file(self: &Arc<Self>) -> Result<()> {
-        self.write_shared_state()
-            .await
-            .sk
-            .maybe_persist_inmem_control_file()
-            .await
+        let mut guard = self.write_shared_state().await;
+        let changed = guard.sk.maybe_persist_inmem_control_file().await?;
+        guard.skip_update = !changed;
+        Ok(())
    }

    /// Gather timeline data for metrics.
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -47,14 +47,6 @@ pub async fn main_task(
    conf: SafeKeeperConf,
    broker_active_set: Arc<TimelinesSet>,
 ) {
-    let mut cancellation_rx = match tli.get_cancellation_rx() {
-        Ok(rx) => rx,
-        Err(_) => {
-            info!("timeline canceled during task start");
-            return;
-        }
-    };
-
    scopeguard::defer! {
        if tli.is_cancelled() {
            info!("manager task finished");
@@ -129,7 +121,7 @@ pub async fn main_task(
        // wait until something changes. tx channels are stored under Arc, so they will not be
        // dropped until the manager task is finished.
        tokio::select! {
-            _ = cancellation_rx.changed() => {
+            _ = tli.cancel.cancelled() => {
                // timeline was deleted
                break 'outer state_snapshot;
            }
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -277,14 +277,6 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
    debug!("started");
    let await_duration = conf.partial_backup_timeout;

-    let mut cancellation_rx = match tli.get_cancellation_rx() {
-        Ok(rx) => rx,
-        Err(_) => {
-            info!("timeline canceled during task start");
-            return;
-        }
-    };
-
    // sleep for random time to avoid thundering herd
    {
        let randf64 = rand::thread_rng().gen_range(0.0..1.0);
@@ -327,7 +319,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
                && flush_lsn_rx.borrow().term == seg.term
            {
                tokio::select! {
-                    _ = cancellation_rx.changed() => {
+                    _ = backup.tli.cancel.cancelled() => {
                        info!("timeline canceled");
                        return;
                    }
@@ -340,7 +332,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
        // if we don't have any data and zero LSNs, wait for something
        while flush_lsn_rx.borrow().lsn == Lsn(0) {
            tokio::select! {
-                _ = cancellation_rx.changed() => {
+                _ = backup.tli.cancel.cancelled() => {
                    info!("timeline canceled");
                    return;
                }
@@ -357,7 +349,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
        // waiting until timeout expires OR segno changes
        'inner: loop {
            tokio::select! {
-                _ = cancellation_rx.changed() => {
+                _ = backup.tli.cancel.cancelled() => {
                    info!("timeline canceled");
                    return;
                }
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1625,7 +1625,7 @@ class NeonCli(AbstractNeonCli):
            args.extend(["-c", "switch_aux_file_policy:v1"])

        if aux_file_v2 is AuxFileStore.CrossValidation:
-            args.extend(["-c", "switch_aux_file_policy:cross_validation"])
+            args.extend(["-c", "switch_aux_file_policy:cross-validation"])

        if set_default:
            args.append("--set-default")
@@ -2667,7 +2667,9 @@ class NeonPageserver(PgProtocol, LogUtils):
            tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
        )

-    def list_layers(self, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
+    def list_layers(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ) -> list[Path]:
        """
        Inspect local storage on a pageserver to discover which layer files are present.

@@ -2788,6 +2790,28 @@ class PgBin:
        log.info(f"last checkpoint at {checkpoint_lsn}")
        return Lsn(checkpoint_lsn)

+    def take_fullbackup(
+        self,
+        pageserver: NeonPageserver,
+        tenant: TenantId,
+        timeline: TimelineId,
+        lsn: Lsn,
+        output: Path,
+    ):
+        """
+        Request fullbackup from pageserver, store it at 'output'.
+        """
+        cmd = [
+            "psql",
+            "--no-psqlrc",
+            pageserver.connstr(),
+            "-c",
+            f"fullbackup {tenant} {timeline} {lsn}",
+            "-o",
+            str(output),
+        ]
+        self.run_capture(cmd)
+

@pytest.fixture(scope="function")
 def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin:
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -70,6 +70,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
    # this is expected given our collaborative shutdown approach for the UploadQueue
    ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
    ".*Compaction failed.*, retrying in .*: ShuttingDown",
+    ".*Compaction failed.*, retrying in .*: Other\\(timeline shutting down.*",
    # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
    ".*Error processing HTTP request: NotFound: Timeline .* was not found",
    ".*took more than expected to complete.*",
@@ -91,6 +92,10 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
    ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*",
    # Can happen when the test shuts down the storage controller while it is calling the utilization API
    ".*WARN.*path=/v1/utilization .*request was dropped before completing",
+    # Can happen during shutdown
+    ".*scheduling deletion on drop failed: queue is in state Stopped.*",
+    # Can happen during shutdown
+    ".*ignoring failure to find gc cutoffs: timeline shutting down.*",
 )


--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -4,10 +4,13 @@ import json
 import os
 import re
 import subprocess
+import tarfile
 import threading
 import time
+from hashlib import sha256
 from pathlib import Path
 from typing import (
+    IO,
    TYPE_CHECKING,
    Any,
    Callable,
@@ -15,8 +18,10 @@ from typing import (
    Iterable,
    List,
    Optional,
+    Set,
    Tuple,
    TypeVar,
+    Union,
 )
 from urllib.parse import urlencode

@@ -490,12 +495,68 @@ def assert_no_errors(log_file, service, allowed_errors):

@enum.unique
 class AuxFileStore(str, enum.Enum):
-    V1 = "V1"
-    V2 = "V2"
-    CrossValidation = "CrossValidation"
+    V1 = "v1"
+    V2 = "v2"
+    CrossValidation = "cross-validation"

    def __repr__(self) -> str:
        return f"'aux-{self.value}'"

    def __str__(self) -> str:
        return f"'aux-{self.value}'"
+
+
+def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str]):
+    """
+    This is essentially:
+
+    lines=$(comm -3 \
+        <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
+        <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \
+        | wc -l)
+    [ "$lines" = "0" ]
+
+    But in a more mac friendly fashion.
+    """
+    started_at = time.time()
+
+    def hash_extracted(reader: Union[IO[bytes], None]) -> bytes:
+        assert reader is not None
+        digest = sha256(usedforsecurity=False)
+        while True:
+            buf = reader.read(64 * 1024)
+            if not buf:
+                break
+            digest.update(buf)
+        return digest.digest()
+
+    def build_hash_list(p: Path) -> List[Tuple[str, bytes]]:
+        with tarfile.open(p) as f:
+            matching_files = (info for info in f if info.isreg() and info.name not in skip_files)
+            ret = list(
+                map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files)
+            )
+            ret.sort(key=lambda t: t[0])
+            return ret
+
+    left_list, right_list = map(build_hash_list, [left, right])
+
+    assert len(left_list) == len(
+        right_list
+    ), f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}"
+
+    mismatching = set()
+
+    for left_tuple, right_tuple in zip(left_list, right_list):
+        left_path, left_hash = left_tuple
+        right_path, right_hash = right_tuple
+        assert (
+            left_path == right_path
+        ), f"file count matched, expected these to be same paths: {left_path}, {right_path}"
+        if left_hash != right_hash:
+            mismatching.add(left_path)
+
+    assert len(mismatching) == 0, f"files with hash mismatch: {mismatching}"
+
+    elapsed = time.time() - started_at
+    log.info(f"assert_pageserver_backups_equal completed in {elapsed}s")
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -17,9 +17,13 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
    env = neon_env_builder.init_start()

-    # eviction might be the first one after an attach to access the layers
-    env.pageserver.allowed_errors.append(
-        ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction"
+    env.pageserver.allowed_errors.extend(
+        [
+            # eviction might be the first one after an attach to access the layers
+            ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction",
+            # detach can happen before we get to validate the generation number
+            ".*deletion backend: Dropped remote consistent LSN updates for tenant.*",
+        ]
    )
    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
    return env
@@ -162,7 +166,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
        "checkpoint_distance": 10000,
        "checkpoint_timeout": "13m",
        "compaction_algorithm": {
-            "kind": "Tiered",
+            "kind": "tiered",
        },
        "eviction_policy": {
            "kind": "LayerAccessThreshold",
@@ -190,7 +194,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
        "trace_read_requests": True,
        "walreceiver_connect_timeout": "13m",
        "image_layer_creation_check_threshold": 1,
-        "switch_aux_file_policy": "CrossValidation",
+        "switch_aux_file_policy": "cross-validation",
    }

    ps_http = env.pageserver.http_client()
--- a/test_runner/regress/test_aux_files.py
+++ b/test_runner/regress/test_aux_files.py
@@ -1,5 +1,6 @@
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
+    AuxFileStore,
    NeonEnvBuilder,
    logical_replication_sync,
 )
@@ -14,7 +15,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
    timeline_id = env.initial_timeline

    tenant_config = client.tenant_config(tenant_id).effective_config
-    tenant_config["switch_aux_file_policy"] = "V2"
+    tenant_config["switch_aux_file_policy"] = AuxFileStore.V2
    client.set_tenant_config(tenant_id, tenant_config)
    # aux file v2 is enabled on the write path, so for now, it should be unset (or null)
    assert (
@@ -49,7 +50,10 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):

    with env.pageserver.http_client() as client:
        # aux file v2 flag should be enabled at this point
-        assert client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] == "V2"
+        assert (
+            client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"]
+            == AuxFileStore.V2
+        )
    with env.pageserver.http_client() as client:
        tenant_config = client.tenant_config(tenant_id).effective_config
        tenant_config["switch_aux_file_policy"] = "V1"
@@ -59,7 +63,7 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
                "last_aux_file_policy"
            ]
-            == "V2"
+            == AuxFileStore.V2
        )
    env.pageserver.restart()
    with env.pageserver.http_client() as client:
@@ -68,5 +72,5 @@ def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg):
            client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
                "last_aux_file_policy"
            ]
-            == "V2"
+            == AuxFileStore.V2
        )
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -194,8 +194,8 @@ def test_sharding_compaction(


 class CompactionAlgorithm(str, enum.Enum):
-    LEGACY = "Legacy"
-    TIERED = "Tiered"
+    LEGACY = "legacy"
+    TIERED = "tiered"


@pytest.mark.parametrize(
--- a/test_runner/regress/test_fullbackup.py
+++ b/test_runner/regress/test_fullbackup.py
@@ -1,7 +1,7 @@
 import os
 from pathlib import Path

-from fixtures.common_types import Lsn, TimelineId
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
@@ -19,17 +19,16 @@ def test_fullbackup(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
    port_distributor: PortDistributor,
-    pg_distrib_dir: Path,
    test_output_dir: Path,
 ):
    env = neon_env_builder.init_start()

-    env.neon_cli.create_branch("test_fullbackup")
-    endpoint_main = env.endpoints.create_start("test_fullbackup")
+    # The endpoint needs to stay alive until the fullbackup so that we have
+    # prev_record_lsn for the vanilla_pg to start in read-write mode;
+    # for some reason this does not happen if the endpoint is shut down.
+    endpoint_main = env.endpoints.create_start("main")

    with endpoint_main.cursor() as cur:
-        timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
-
        # data loading may take a while, so increase statement timeout
        cur.execute("SET statement_timeout='300s'")
        cur.execute(
@@ -41,17 +40,13 @@ def test_fullbackup(
        lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
        log.info(f"start_backup_lsn = {lsn}")

-    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
-    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
    # Get and unpack fullbackup from pageserver
    restored_dir_path = env.repo_dir / "restored_datadir"
    os.mkdir(restored_dir_path, 0o750)
-    query = f"fullbackup {env.initial_tenant} {timeline} {lsn}"
    tar_output_file = test_output_dir / "fullbackup.tar"
-    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, env.initial_timeline, lsn, tar_output_file
+    )
    subprocess_capture(
        env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)]
    )
@@ -61,7 +56,7 @@ def test_fullbackup(
    # use resetwal to overwrite it
    pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
    cmd = [pg_resetwal_path, "-D", str(restored_dir_path)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.run_capture(cmd)

    # Restore from the backup and find the data we inserted
    port = port_distributor.get_port()
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,3 +1,4 @@
+import asyncio
 import os
 import re
 import threading
@@ -292,3 +293,62 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
            assert slot_xmin is None

        wait_until(10, 1.0, xmin_is_null)
+
+
+# Test race condition between WAL replay and backends performing queries
+# https://github.com/neondatabase/neon/issues/7791
+def test_replica_query_race(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+
+    primary_ep = env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+    )
+
+    with primary_ep.connect() as p_con:
+        with p_con.cursor() as p_cur:
+            p_cur.execute("CREATE EXTENSION neon_test_utils")
+            p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter")
+
+    standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby")
+    time.sleep(1)
+
+    # In primary, run a lot of UPDATEs on a single page
+    finished = False
+    writecounter = 1
+
+    async def primary_workload():
+        nonlocal writecounter, finished
+        conn = await primary_ep.connect_async()
+        while writecounter < 10000:
+            writecounter += 1
+            await conn.execute(f"UPDATE test SET counter = {writecounter}")
+        finished = True
+
+    # In standby, at the same time, run queries on it and repeatedly drop caches.
+    async def standby_workload():
+        nonlocal writecounter, finished
+        conn = await standby_ep.connect_async()
+        reads = 0
+        while not finished:
+            readcounter = await conn.fetchval("SELECT counter FROM test")
+
+            # Ideally, check that the replica is keeping up with the primary. In
+            # local testing, the lag between primary and standby is much smaller,
+            # in the ballpark of 2-3 counter values, so 1000 would be a generous
+            # bound. That lag check is disabled; only assert the standby is not ahead.
+            # assert(writecounter - readcounter < 1000)
+            assert readcounter <= writecounter
+            if reads % 100 == 0:
+                log.info(f"read {reads}: counter {readcounter}, last update {writecounter}")
+            reads += 1
+
+            await conn.execute("SELECT clear_buffer_cache()")
+
+    async def both():
+        await asyncio.gather(
+            primary_workload(),
+            standby_workload(),
+        )
+
+    asyncio.run(both())
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -21,7 +21,7 @@ from fixtures.pageserver.utils import (
    wait_for_upload,
 )
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.utils import subprocess_capture
+from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture


 def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
@@ -163,7 +163,7 @@ def test_import_from_pageserver_small(

    num_rows = 3000
    lsn = _generate_data(num_rows, endpoint)
-    _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir)
+    _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)


@pytest.mark.timeout(1800)
@@ -193,9 +193,7 @@ def test_import_from_pageserver_multisegment(
    log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB")
    assert logical_size > 1024**3  # = 1GB

-    tar_output_file = _import(
-        num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir
-    )
+    tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir)

    # Check if the backup data contains multiple segment files
    cnt_seg_files = 0
@@ -235,7 +233,6 @@ def _import(
    env: NeonEnv,
    pg_bin: PgBin,
    timeline: TimelineId,
-    pg_distrib_dir: Path,
    test_output_dir: Path,
 ) -> Path:
    """Test importing backup data to the pageserver.
@@ -248,15 +245,9 @@ def _import(
    path to the backup archive file"""
    log.info(f"start_backup_lsn = {lsn}")

-    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
-    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
    # Get a fullbackup from pageserver
-    query = f"fullbackup { env.initial_tenant} {timeline} {lsn}"
    tar_output_file = test_output_dir / "fullbackup.tar"
-    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(env.pageserver, env.initial_tenant, timeline, lsn, tar_output_file)

    # Stop the first pageserver instance, erase all its data
    env.endpoints.stop_all()
@@ -301,26 +292,15 @@ def _import(
    wait_for_upload(client, tenant, timeline, lsn)

    # Check it worked
-    endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant)
+    endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant, lsn=lsn)
    assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)]

    # Take another fullbackup
-    query = f"fullbackup { tenant} {timeline} {lsn}"
    new_tar_output_file = test_output_dir / "fullbackup-new.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        query,
-        "-o",
-        str(new_tar_output_file),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(env.pageserver, tenant, timeline, lsn, new_tar_output_file)

    # Check it's the same as the first fullbackup
-    # TODO pageserver should be checking checksum
-    assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file)
+    assert_pageserver_backups_equal(tar_output_file, new_tar_output_file, set())

    # Check that gc works
    pageserver_http = env.pageserver.http_client()
--- a/test_runner/regress/test_next_xid.py
+++ b/test_runner/regress/test_next_xid.py
@@ -5,7 +5,7 @@ from pathlib import Path

 from fixtures.common_types import Lsn, TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_wal_insert_lsn
+from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_wal_insert_lsn
 from fixtures.pageserver.utils import (
    wait_for_last_record_lsn,
 )
@@ -71,22 +71,17 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder):
 def test_import_at_2bil(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
-    pg_distrib_dir: Path,
-    pg_bin,
+    pg_bin: PgBin,
    vanilla_pg,
 ):
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
    env = neon_env_builder.init_start()
    ps_http = env.pageserver.http_client()

-    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
-    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
    # Reset the vanilla Postgres instance to somewhat before 2 billion transactions.
    pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal")
    cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.run_capture(cmd)

    vanilla_pg.start()
    vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -578,7 +578,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
    default_download_period_secs = 60

    # The upload period, which will also be the download once the secondary has seen its first heatmap
-    upload_period_secs = 20
+    upload_period_secs = 30

    for _i in range(0, tenant_count):
        tenant_id = TenantId.generate()
@@ -596,11 +596,26 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):

        tenant_timelines[tenant_id] = [timeline_a, timeline_b]

+    def await_log(pageserver, deadline, expression):
+        """
+        Wrapper around assert_log_contains that waits with a deadline rather than timeout.
+
+        :param pageserver: object whose `assert_log_contains(expression)` is polled.
+        :param deadline: absolute point in time, on the `time.time()` clock, by which the
+            expression has to show up.
+        :param expression: pattern handed unchanged to `assert_log_contains`.
+        :raises RuntimeError: if the deadline has already passed when called.
+
+        Any exception raised while polling is logged and re-raised unchanged.
+        """
+        now = time.time()
+        if now > deadline:
+            raise RuntimeError(f"Timed out waiting for {expression}")
+
+        # Convert the remaining time into a whole number of seconds, rounding up so that
+        # at least one attempt is made even when the deadline is less than a second away.
+        timeout = int(deadline - now) + 1
+        try:
+            wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression))  # type: ignore
+        except Exception:
+            # Only ordinary exceptions are reported as a timeout; KeyboardInterrupt and
+            # SystemExit propagate without the misleading log line.
+            log.error(f"Timed out waiting for '{expression}'")
+            raise
+
    t_start = time.time()

    # Wait long enough that the background downloads should happen; we expect all the inital layers
    # of all the initial timelines to show up on the secondary location of each tenant.
-    time.sleep(default_download_period_secs * 1.5)
+    initial_download_deadline = time.time() + default_download_period_secs * 3

    for tenant_id, timelines in tenant_timelines.items():
        attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -608,8 +623,24 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
        # We only have two: the other one must be secondary
        ps_secondary = next(p for p in env.pageservers if p != ps_attached)

+        now = time.time()
+        if now > initial_download_deadline:
+            raise RuntimeError("Timed out waiting for initial secondary download")
+        else:
+            for timeline_id in timelines:
+                log.info(
+                    f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}"
+                )
+                await_log(
+                    ps_secondary,
+                    initial_download_deadline,
+                    f".*{timeline_id}.*Wrote timeline_detail.*",
+                )
+
        for timeline_id in timelines:
-            log.info(f"Checking for secondary timeline {timeline_id} on node {ps_secondary.id}")
+            log.info(
+                f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}"
+            )
            # One or more layers should be present for all timelines
            assert ps_secondary.list_layers(tenant_id, timeline_id)

@@ -617,7 +648,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
        env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])

    # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor
-    time.sleep(upload_period_secs * 2.5)
+    deletion_deadline = time.time() + upload_period_secs * 3

    for tenant_id, timelines in tenant_timelines.items():
        attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -625,6 +656,16 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
        # We only have two: the other one must be secondary
        ps_secondary = next(p for p in env.pageservers if p != ps_attached)

+        expect_del_timeline = timelines[1]
+        log.info(
+            f"Waiting for deletion of timeline {expect_del_timeline} on secondary pageserver {ps_secondary.id}"
+        )
+        await_log(
+            ps_secondary,
+            deletion_deadline,
+            f".*Timeline no longer in heatmap.*{expect_del_timeline}.*",
+        )
+
        # This one was not deleted
        assert ps_secondary.list_layers(tenant_id, timelines[0])

@@ -632,7 +673,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
        log.info(
            f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}"
        )
-        assert not ps_secondary.list_layers(tenant_id, timelines[1])
+        assert not ps_secondary.list_layers(tenant_id, expect_del_timeline)

    t_end = time.time()

--- a/test_runner/regress/test_pg_query_cancellation.py
+++ b/test_runner/regress/test_pg_query_cancellation.py
@@ -0,0 +1,282 @@
+from contextlib import closing
+from typing import Set
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver
+from fixtures.pageserver.http import PageserverHttpClient
+from psycopg2.errors import QueryCanceled
+
+# Pageserver failpoint names that test_cancellations iterates over; each one is
+# handed in turn to connect_works_correctly, which configures it as "pause".
+CRITICAL_PG_PS_WAIT_FAILPOINTS: Set[str] = {
+    "ps::connection-start::pre-login",
+    "ps::connection-start::startup-packet",
+    "ps::connection-start::process-query",
+    "ps::handle-pagerequest-message::exists",
+    "ps::handle-pagerequest-message::nblocks",
+    "ps::handle-pagerequest-message::getpage",
+    "ps::handle-pagerequest-message::dbsize",
+    # We don't yet have a good way to on-demand guarantee the download of an
+    # SLRU segment, so that's disabled for now.
+    # "ps::handle-pagerequest-message::slrusegment",
+}
+
+# The three "connection-start" entries of the set above.
+# NOTE(review): this name is not referenced anywhere else in this file.
+PG_PS_START_FAILPOINTS = {
+    "ps::connection-start::pre-login",
+    "ps::connection-start::startup-packet",
+    "ps::connection-start::process-query",
+}
+# Individual "handle-pagerequest-message" failpoint names. connect_works_correctly
+# subtracts some of these from CRITICAL_PG_PS_WAIT_FAILPOINTS to decide which of
+# its queries are expected to be cancelled for a given failpoint.
+SMGR_EXISTS = "ps::handle-pagerequest-message::exists"
+SMGR_NBLOCKS = "ps::handle-pagerequest-message::nblocks"
+SMGR_GETPAGE = "ps::handle-pagerequest-message::getpage"
+SMGR_DBSIZE = "ps::handle-pagerequest-message::dbsize"
+
+# NOTE: the string below comes after the imports and constants, so Python treats
+# it as a plain expression statement, not as the module docstring.
+"""
+Test that we can handle connection delays and cancellations at various
+unfortunate connection startup and request states.
+"""
+
+
+def test_cancellations(neon_simple_env: NeonEnv):
+    """
+    Prepare two small tables on a fresh endpoint, then run
+    connect_works_correctly once for every failpoint in
+    CRITICAL_PG_PS_WAIT_FAILPOINTS.
+
+    Setup performed here:
+    - create_branch("test_config", "empty") and start an endpoint on it with
+      autovacuum off and shared_buffers = 128MB;
+    - create test1 (ids 1..1024) and test2 (ids 1025..2048), each with a
+      sha256 payload column, and VACUUM (ANALYZE, FREEZE) both;
+    - install the pg_buffercache and pg_prewarm extensions, which
+      connect_works_correctly queries;
+    - stop the endpoint (connect_works_correctly starts it again itself).
+    """
+    env = neon_simple_env
+    ps = env.pageserver
+    ps_http = ps.http_client()
+    # NOTE(review): presumably skips the test when the pageserver was built
+    # without failpoint/testing support -- confirm against the fixture.
+    ps_http.is_testing_enabled_or_skip()
+
+    env.neon_cli.create_branch("test_config", "empty")
+
+    # We don't want to have any racy behaviour with autovacuum IOs
+    ep = env.endpoints.create_start(
+        "test_config",
+        config_lines=[
+            "autovacuum = off",
+            "shared_buffers = 128MB",
+        ],
+    )
+
+    with closing(ep.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                CREATE TABLE test1 AS
+                    SELECT id, sha256(id::text::bytea) payload
+                    FROM generate_series(1, 1024::bigint) p(id);
+                """
+            )
+            cur.execute(
+                """
+                CREATE TABLE test2 AS
+                    SELECT id, sha256(id::text::bytea) payload
+                    FROM generate_series(1025, 2048::bigint) p(id);
+                """
+            )
+            cur.execute(
+                """
+                VACUUM (ANALYZE, FREEZE) test1, test2;
+                """
+            )
+            cur.execute(
+                """
+                CREATE EXTENSION pg_buffercache;
+                """
+            )
+            cur.execute(
+                """
+                CREATE EXTENSION pg_prewarm;
+                """
+            )
+
+    # data preparation is now complete, with 2 disjoint tables that aren't
+    # preloaded into any caches.
+
+    ep.stop()
+
+    # The failpoints live in a set, so the order in which they are exercised
+    # is arbitrary and may differ between runs.
+    for failpoint in CRITICAL_PG_PS_WAIT_FAILPOINTS:
+        connect_works_correctly(failpoint, ep, ps, ps_http)
+
+
+# Names of the failpoints currently switched on by fp_enable(). In this file it
+# is only read back for the log lines emitted by fp_enable()/fp_disable().
+ENABLED_FAILPOINTS: Set[str] = set()
+
+
+def connect_works_correctly(
+    failpoint: str, ep: Endpoint, ps: NeonPageserver, ps_http: PageserverHttpClient
+):
+    """
+    Run one start-to-stop cycle of endpoint `ep` while the pageserver failpoint
+    `failpoint` is repeatedly switched between "pause" and "off".
+
+    Outline:
+    1. Start the endpoint with the failpoint off and prewarm the relations
+       with oid < 16384 so that later connections need no pageserver reads.
+    2. Enable the failpoint and run queries under a 500ms statement_timeout;
+       each query is expected either to succeed or to raise QueryCanceled,
+       depending on which failpoint is active.
+    3. Restart the pageserver while a connection is open and repeat.
+    4. Disable the failpoint, check that both tables are still fully
+       readable, and stop the endpoint.
+
+    :param failpoint: failpoint name passed to `ps_http.configure_failpoints`.
+    :param ep: endpoint to start, query and stop.
+    :param ps: pageserver that gets stopped and started mid-test.
+    :param ps_http: HTTP client used to configure the failpoint.
+    """
+    log.debug("Starting work on %s", failpoint)
+    # All queries we use should finish within 500ms, including all their IO.
+    # This allows us to use `SET statement_timeout` to let the query
+    # timeout system cancel queries, rather than us having to go
+    # through the most annoying effort of manual query cancellation
+    # in psycopg2.
+    options = "-cstatement_timeout=500ms -ceffective_io_concurrency=1"
+
+    ep.start()
+
+    def fp_enable():
+        # Configure the failpoint as "pause" and record it in the global set.
+        global ENABLED_FAILPOINTS
+        ps_http.configure_failpoints(
+            [
+                (failpoint, "pause"),
+            ]
+        )
+        ENABLED_FAILPOINTS = ENABLED_FAILPOINTS | {failpoint}
+        log.info(
+            'Enabled failpoint "%s", current_active=%s', failpoint, ENABLED_FAILPOINTS, stacklevel=2
+        )
+
+    def fp_disable():
+        # Configure the failpoint as "off" and drop it from the global set.
+        global ENABLED_FAILPOINTS
+        ps_http.configure_failpoints(
+            [
+                (failpoint, "off"),
+            ]
+        )
+        ENABLED_FAILPOINTS = ENABLED_FAILPOINTS - {failpoint}
+        log.info(
+            'Disabled failpoint "%s", current_active=%s',
+            failpoint,
+            ENABLED_FAILPOINTS,
+            stacklevel=2,
+        )
+
+    def check_buffers(cur):
+        # Returns the first row of per-relation pg_buffercache counts for
+        # test1/test2, or None when no buffer of either table is listed.
+        cur.execute(
+            """
+            SELECT n.nspname AS nspname
+                 , c.relname AS relname
+                 , count(*)  AS count
+            FROM pg_buffercache b
+            JOIN pg_class c
+              ON b.relfilenode = pg_relation_filenode(c.oid) AND
+                 b.reldatabase = (SELECT oid FROM pg_database WHERE datname = current_database())
+            JOIN pg_namespace n ON n.oid = c.relnamespace
+            WHERE c.oid IN ('test1'::regclass::oid, 'test2'::regclass::oid)
+            GROUP BY n.nspname, c.relname
+            ORDER BY 3 DESC
+            LIMIT 10
+            """
+        )
+        return cur.fetchone()
+
+    def exec_may_cancel(query, cursor, result, cancels):
+        # With cancels=True the query must raise QueryCanceled. If it returns
+        # instead, the test fails either way: a wrong result trips the assert,
+        # and a correct one makes pytest.raises report that nothing was raised.
+        if cancels:
+            with pytest.raises(QueryCanceled):
+                cursor.execute(query)
+                assert cursor.fetchone() == result
+        else:
+            cursor.execute(query)
+            assert cursor.fetchone() == result
+
+    fp_disable()
+
+    # Warm caches required for new connections, so that they can run without
+    # requiring catalog reads.
+    with closing(ep.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                SELECT 1;
+                """
+            )
+            assert cur.fetchone() == (1,)
+
+            assert check_buffers(cur) is None
+            # Ensure all caches required for connection start are correctly
+            # filled, so that we don't have any "accidents" in this test run
+            # caused by changes in connection startup plans that require
+            # requests to the PageServer.
+            cur.execute(
+                """
+                select array_agg(distinct (pg_prewarm(c.oid::regclass, 'buffer') >= 0))
+                from pg_class c
+                where c.oid < 16384 AND c.relkind IN ('i', 'r');
+                """
+            )
+            assert cur.fetchone() == ([True],)
+
+    # Enable failpoint
+    fp_enable()
+
+    with closing(ep.connect(options=options, autocommit=True)) as conn:
+        with conn.cursor() as cur:
+            cur.execute("SHOW statement_timeout;")
+            assert cur.fetchone() == ("500ms",)
+            assert check_buffers(cur) is None
+            # Expected to be cancelled for every failpoint except
+            # SMGR_EXISTS and SMGR_DBSIZE.
+            exec_may_cancel(
+                """
+                SELECT min(id) FROM test1;
+                """,
+                cur,
+                (1,),
+                failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}),
+            )
+
+    fp_disable()
+
+    with closing(ep.connect(options=options, autocommit=True)) as conn:
+        with conn.cursor() as cur:
+            # Do a select on the data, putting some buffers into the prefetch
+            # queue.
+            cur.execute(
+                """
+                SELECT count(id) FROM (select * from test1 LIMIT 256) a;
+                """
+            )
+            assert cur.fetchone() == (256,)
+
+            # Restart the pageserver while this connection is still open, then
+            # switch the failpoint back on.
+            # NOTE(review): presumably the restart resets failpoint
+            # configuration, hence the explicit fp_enable() -- confirm.
+            ps.stop()
+            ps.start()
+            fp_enable()
+
+            # Here SMGR_NBLOCKS is additionally expected not to cancel.
+            exec_may_cancel(
+                """
+                SELECT COUNT(id) FROM test1;
+                """,
+                cur,
+                (1024,),
+                failpoint
+                in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_NBLOCKS, SMGR_DBSIZE}),
+            )
+
+    # The failpoint is still enabled when this new connection is opened.
+    with closing(ep.connect(options=options, autocommit=True)) as conn:
+        with conn.cursor() as cur:
+            exec_may_cancel(
+                """
+                SELECT COUNT(id) FROM test2;
+                """,
+                cur,
+                (1024,),
+                failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}),
+            )
+
+            # NOTE(review): the failpoint is switched off and straight back on;
+            # presumably this releases whatever the previous query left paused
+            # on the pageserver -- confirm.
+            fp_disable()
+            fp_enable()
+
+            exec_may_cancel(
+                """
+                SELECT 0 < pg_database_size(CURRENT_DATABASE());
+                """,
+                cur,
+                (True,),
+                failpoint
+                in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_GETPAGE, SMGR_NBLOCKS}),
+            )
+
+            fp_disable()
+
+            # With the failpoint off, both tables must be fully readable and
+            # return exactly the data written by test_cancellations.
+            cur.execute(
+                """
+                SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test2;
+                """
+            )
+
+            assert cur.fetchone() == (1024, 1024, 1025, 2048, 1573376)
+
+            cur.execute(
+                """
+                SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test1;
+                """
+            )
+
+            assert cur.fetchone() == (1024, 1024, 1, 1024, 524800)
+
+    ep.stop()
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -177,7 +177,16 @@ def test_sharding_split_unsharded(
    env.storage_controller.consistency_check()


-def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize(
+    "failpoint",
+    [
+        None,
+        "compact-shard-ancestors-localonly",
+        "compact-shard-ancestors-enqueued",
+        "compact-shard-ancestors-persistent",
+    ],
+)
+def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]):
    """
    Test that after a split, we clean up parent layer data in the child shards via compaction.
    """
@@ -196,6 +205,11 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
        "image_layer_creation_check_threshold": "0",
    }

+    neon_env_builder.storage_controller_config = {
+        # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
+        "max_unavailable": "300s"
+    }
+
    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
@@ -213,6 +227,10 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
    # Split one shard into two
    shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2)

+    # Let all shards move into their stable locations, so that during subsequent steps we
+    # don't have reconciles in progress (simpler to reason about what messages we expect in logs)
+    env.storage_controller.reconcile_until_idle()
+
    # Check we got the shard IDs we expected
    assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None
    assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None
@@ -237,6 +255,90 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder):
    # Compaction shouldn't make anything unreadable
    workload.validate()

+    # Force a generation increase: layer rewrites are a long-term thing and only happen after
+    # the generation has increased.
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant
+    env.storage_controller.pageserver_api().set_tenant_config(tenant_id, {"pitr_interval": "0s"})
+    env.storage_controller.reconcile_until_idle()
+
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+
+        # Apply failpoints for the layer-rewriting phase: this is the area of code that has sensitive behavior
+        # across restarts, as we will have local layer files that temporarily disagree with the remote metadata
+        # for the same local layer file name.
+        if failpoint is not None:
+            ps.http_client().configure_failpoints((failpoint, "exit"))
+
+        # Do a GC to update gc_info (compaction uses this to decide whether a layer is to be rewritten)
+        # Set gc_horizon=0 to let PITR horizon control GC cutoff exclusively.
+        ps.http_client().timeline_gc(shard, timeline_id, gc_horizon=0)
+
+        # We will compare stats before + after compaction
+        detail_before = ps.http_client().timeline_detail(shard, timeline_id)
+
+        # Invoke compaction: this should rewrite layers that are behind the pitr horizon
+        try:
+            ps.http_client().timeline_compact(shard, timeline_id)
+        except requests.ConnectionError as e:
+            if failpoint is None:
+                raise e
+            else:
+                log.info(f"Compaction failed (failpoint={failpoint}): {e}")
+
+            if failpoint in (
+                "compact-shard-ancestors-localonly",
+                "compact-shard-ancestors-enqueued",
+            ):
+                # If we left local files that don't match remote metadata, we expect warnings on next startup
+                env.pageserver.allowed_errors.append(
+                    ".*removing local file .+ because it has unexpected length.*"
+                )
+
+            # Post-failpoint: we check that the pageserver comes back online happily.
+            env.pageserver.running = False
+            env.pageserver.start()
+        else:
+            assert failpoint is None  # We shouldn't reach success path if a failpoint was set
+
+            detail_after = ps.http_client().timeline_detail(shard, timeline_id)
+
+            # Physical size should shrink because layers are smaller
+            assert detail_after["current_physical_size"] < detail_before["current_physical_size"]
+
+    # Validate size statistics
+    for shard in shards:
+        ps = env.get_tenant_pageserver(shard)
+        timeline_info = ps.http_client().timeline_detail(shard, timeline_id)
+        reported_size = timeline_info["current_physical_size"]
+        layer_paths = ps.list_layers(shard, timeline_id)
+        measured_size = 0
+        for p in layer_paths:
+            abs_path = ps.timeline_dir(shard, timeline_id) / p
+            measured_size += os.stat(abs_path).st_size
+
+        log.info(
+            f"shard {shard} reported size {reported_size}, measured size {measured_size} ({len(layer_paths)} layers)"
+        )
+
+        if failpoint in (
+            "compact-shard-ancestors-localonly",
+            "compact-shard-ancestors-enqueued",
+        ):
+            # If we injected a failure between local rewrite and remote upload, then after
+            # restart we may end up with neither version of the file on local disk (the new file
+            # is cleaned up because it doesn't match remote metadata).  So local size isn't
+            # necessarily going to match remote physical size.
+            continue
+
+        assert measured_size == reported_size
+
+    # Compaction shouldn't make anything unreadable
+    workload.validate()
+

 def test_sharding_split_smoke(
    neon_env_builder: NeonEnvBuilder,
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -1,13 +1,9 @@
 import datetime
 import enum
-import tarfile
-import time
 from concurrent.futures import ThreadPoolExecutor
-from hashlib import sha256
-from pathlib import Path
 from queue import Empty, Queue
 from threading import Barrier
-from typing import IO, List, Set, Tuple, Union
+from typing import List, Tuple

 import pytest
 from fixtures.common_types import Lsn, TimelineId
@@ -17,9 +13,10 @@ from fixtures.neon_fixtures import (
    PgBin,
    wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import HistoricLayerInfo
-from fixtures.pageserver.utils import wait_timeline_detail_404
+from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
+from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404
 from fixtures.remote_storage import LocalFsStorage
+from fixtures.utils import assert_pageserver_backups_equal


 def by_end_lsn(info: HistoricLayerInfo) -> Lsn:
@@ -68,7 +65,6 @@ SHUTDOWN_ALLOWED_ERRORS = [
@pytest.mark.parametrize("write_to_branch_first", [True, False])
 def test_ancestor_detach_branched_from(
    test_output_dir,
-    pg_distrib_dir,
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
    branchpoint: Branchpoint,
@@ -80,7 +76,6 @@ def test_ancestor_detach_branched_from(
    """
    env = neon_env_builder.init_start()

-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)

    client = env.pageserver.http_client()
@@ -160,16 +155,9 @@ def test_ancestor_detach_branched_from(
    # run fullbackup to make sure there are no off by one errors
    # take this on the parent
    fullbackup_before = test_output_dir / "fullbackup-before.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        f"fullbackup {env.initial_tenant} {env.initial_timeline} {branch_at}",
-        "-o",
-        str(fullbackup_before),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, env.initial_timeline, branch_at, fullbackup_before
+    )

    all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
    assert all_reparented == set()
@@ -200,16 +188,9 @@ def test_ancestor_detach_branched_from(

    # take this on the detached, at same lsn
    fullbackup_after = test_output_dir / "fullbackup-after.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        f"fullbackup {env.initial_tenant} {timeline_id} {branch_at}",
-        "-o",
-        str(fullbackup_after),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, timeline_id, branch_at, fullbackup_after
+    )

    client.timeline_delete(env.initial_tenant, env.initial_timeline)
    wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
@@ -218,52 +199,7 @@ def test_ancestor_detach_branched_from(
    # as there is always "PREV_LSN: invalid" for "before"
    skip_files = {"zenith.signal"}

-    tar_cmp(fullbackup_before, fullbackup_after, skip_files)
-
-
-def tar_cmp(left: Path, right: Path, skip_files: Set[str]):
-    """
-    This is essentially:
-
-    lines=$(comm -3 \
-        <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
-        <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \
-        | wc -l)
-    [ "$lines" = "0" ]
-
-    But in a more mac friendly fashion.
-    """
-    started_at = time.time()
-
-    def hash_extracted(reader: Union[IO[bytes], None]) -> bytes:
-        assert reader is not None
-        digest = sha256(usedforsecurity=False)
-        while True:
-            buf = reader.read(64 * 1024)
-            if not buf:
-                break
-            digest.update(buf)
-        return digest.digest()
-
-    def build_hash_list(p: Path) -> List[Tuple[str, bytes]]:
-        with tarfile.open(p) as f:
-            matching_files = (info for info in f if info.isreg() and info.name not in skip_files)
-            ret = list(
-                map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files)
-            )
-            ret.sort(key=lambda t: t[0])
-            return ret
-
-    left_list, right_list = map(build_hash_list, [left, right])
-
-    try:
-        assert len(left_list) == len(right_list)
-
-        for left_tuple, right_tuple in zip(left_list, right_list):
-            assert left_tuple == right_tuple
-    finally:
-        elapsed = time.time() - started_at
-        log.info(f"tar_cmp completed in {elapsed}s")
+    assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files)


 def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
@@ -483,7 +419,7 @@ def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEn


 def test_compaction_induced_by_detaches_in_history(
-    neon_env_builder: NeonEnvBuilder, test_output_dir, pg_distrib_dir, pg_bin: PgBin
+    neon_env_builder: NeonEnvBuilder, test_output_dir, pg_bin: PgBin
 ):
    """
    Assuming the tree of timelines:
@@ -500,8 +436,6 @@ def test_compaction_induced_by_detaches_in_history(
    timeline broken.
    """

-    psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")}
-
    env = neon_env_builder.init_start(
        initial_tenant_conf={
            # we want to create layers manually so we don't branch on arbitrary
@@ -589,16 +523,9 @@ def test_compaction_induced_by_detaches_in_history(

    # take the fullbackup before and after inheriting the new L0s
    fullbackup_before = test_output_dir / "fullbackup-before.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}",
-        "-o",
-        str(fullbackup_before),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before
+    )

    for _, timeline_id in skip_main:
        reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
@@ -624,19 +551,38 @@ def test_compaction_induced_by_detaches_in_history(
    assert len(post_compact_l0s) == 1, "only the consecutive inherited L0s should be compacted"

    fullbackup_after = test_output_dir / "fullbackup_after.tar"
-    cmd = [
-        "psql",
-        "--no-psqlrc",
-        env.pageserver.connstr(),
-        "-c",
-        f"fullbackup {env.initial_tenant} {branch_timeline_id} {branch_lsn}",
-        "-o",
-        str(fullbackup_after),
-    ]
-    pg_bin.run_capture(cmd, env=psql_env)
+    pg_bin.take_fullbackup(
+        env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after
+    )

    # we don't need to skip any files, because zenith.signal will be identical
-    tar_cmp(fullbackup_before, fullbackup_after, set())
+    assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set())
+
+
+def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder):
+    """
+    Check the HTTP status codes returned by detach_ancestor on its error paths:
+
+    - 409 with a message matching ".* no ancestors" for the initial timeline,
+    - 400 with a message containing "too many ancestors" for a branch that
+      was itself created from another branch,
+    - 404 once the tenant has been deleted.
+    """
+    env = neon_env_builder.init_start()
+    pageserver = env.pageserver
+    pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    ps_http = pageserver.http_client()
+    tenant_id = env.initial_tenant
+
+    # Case 1: the initial timeline.
+    with pytest.raises(PageserverApiException, match=".* no ancestors") as no_ancestor:
+        ps_http.detach_ancestor(tenant_id, env.initial_timeline)
+    assert no_ancestor.value.status_code == 409
+
+    # Build a two-level chain: second_branch is created on top of first_branch.
+    first_name = "first_branch"
+    first_branch = env.neon_cli.create_branch(first_name)
+    second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name=first_name)
+
+    # Case 2: funnily enough this does not have a prefix
+    with pytest.raises(PageserverApiException, match="too many ancestors") as too_deep:
+        ps_http.detach_ancestor(tenant_id, second_branch)
+    assert too_deep.value.status_code == 400
+
+    # Case 3: after the tenant is gone the request must report 404.
+    ps_http.tenant_delete(tenant_id)
+    wait_tenant_status_404(ps_http, tenant_id, 10, 1)
+
+    with pytest.raises(PageserverApiException) as tenant_gone:
+        ps_http.detach_ancestor(tenant_id, first_branch)
+    assert tenant_gone.value.status_code == 404


 # TODO:
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -194,7 +194,7 @@ files:

      - metric_name: pg_stats_userdb
        type: gauge
-        help: 'Stats for the oldest non-system db'
+        help: 'Stats for several oldest non-system dbs'
        key_labels:
          - datname
        value_label: kind
@@ -205,9 +205,8 @@ files:
          - inserted
          - updated
          - deleted
-        # We export stats for only one non-system database. Without this limit
+        # We export stats for at most 10 non-system databases. Without this limit
        # it is too easy to abuse the system by creating lots of databases.
-        # We can try lifting this limit in the future after we understand the needs better.
        query: |
          select pg_database_size(datname) as db_size, deadlocks,
                 tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
@@ -218,7 +217,7 @@ files:
               from pg_database
              where datname <> 'postgres' and not datistemplate
              order by oid
-              limit 1
+              limit 10
           );

      - metric_name: max_cluster_size
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -13,6 +13,7 @@ publish = false

 ### BEGIN HAKARI SECTION
 [dependencies]
+ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
 aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] }
 aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] }
@@ -85,6 +86,7 @@ zstd-safe = { version = "7", default-features = false, features = ["arrays", "le
 zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }

 [build-dependencies]
+ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }