refactor(pageserver): better k-merge implementation for tiered compaction

Signed-off-by: Alex Chi Z <chi@neon.tech>
2026-05-18 13:40:37 +00:00 · 2024-05-20 17:13:40 -04:00
128 changed files with 2043 additions and 4979 deletions
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -3,13 +3,13 @@ description: 'Create Branch using API'

 inputs:
  api_key:
-    description: 'Neon API key'
+    description: 'Neon API key'
    required: true
  project_id:
-    description: 'ID of the Project to create Branch in'
+    description: 'ID of the Project to create Branch in'
    required: true
  api_host:
-    description: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build
 outputs:
  dsn:
--- a/.github/actions/neon-branch-delete/action.yml
+++ b/.github/actions/neon-branch-delete/action.yml
@@ -3,16 +3,16 @@ description: 'Delete Branch using API'

 inputs:
  api_key:
-    description: 'Neon API key'
+    description: 'Neon API key'
    required: true
  project_id:
-    description: 'ID of the Project which should be deleted'
+    description: 'ID of the Project which should be deleted'
    required: true
  branch_id:
-    description: 'ID of the branch to delete'
+    description: 'ID of the branch to delete'
    required: true
  api_host:
-    description: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build

 runs:
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'

 inputs:
  api_key:
-    description: 'Neon API key'
+    description: 'Neon API key'
    required: true
  region_id:
-    description: 'Region ID, if not set the project will be created in the default region'
+    description: 'Region ID, if not set the project will be created in the default region'
    default: aws-us-east-2
  postgres_version:
-    description: 'Postgres version; default is 15'
-    default: '15'
+    description: 'Postgres version; default is 15'
+    default: 15
  api_host:
-    description: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build
  provisioner:
-    description: 'k8s-pod or k8s-neonvm'
+    description: 'k8s-pod or k8s-neonvm'
    default: 'k8s-pod'
  compute_units:
-    description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
+    description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
    default: '[1, 1]'

 outputs:
--- a/.github/actions/neon-project-delete/action.yml
+++ b/.github/actions/neon-project-delete/action.yml
@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'

 inputs:
  api_key:
-    description: 'Neon API key'
+    description: 'Neon API key'
    required: true
  project_id:
-    description: 'ID of the Project to delete'
+    description: 'ID of the Project to delete'
    required: true
  api_host:
-    description: 'Neon API host'
+    description: 'Neon API host'
    default: console-stage.neon.build

 runs:
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -548,7 +548,7 @@ jobs:

  report-benchmarks-failures:
    needs: [ benchmarks, create-test-report ]
-    if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
+    if: github.ref_name == 'main' && needs.benchmarks.result == 'failure'
    runs-on: ubuntu-latest

    steps:
@@ -723,13 +723,9 @@ jobs:
    uses: ./.github/workflows/trigger-e2e-tests.yml
    secrets: inherit

-  neon-image-arch:
+  neon-image:
    needs: [ check-permissions, build-build-tools-image, tag ]
-    strategy:
-      matrix:
-        arch: [ x64, arm64 ]
-
-    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    runs-on: [ self-hosted, gen3, large ]

    steps:
      - name: Checkout
@@ -751,6 +747,12 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
      - uses: docker/build-push-action@v5
        with:
          context: .
@@ -762,52 +764,25 @@ jobs:
          push: true
          pull: true
          file: Dockerfile
-          cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
+          cache-from: type=registry,ref=neondatabase/neon:cache
+          cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
          tags: |
-            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+            neondatabase/neon:${{needs.tag.outputs.build-tag}}

      - name: Remove custom docker config directory
        if: always()
        run: |
          rm -rf .docker-custom

-  neon-image:
-    needs: [ neon-image-arch, tag ]
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - name: Create multi-arch image
-        run: |
-          docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
-
-      - uses: docker/login-action@v3
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - name: Push multi-arch image to ECR
-        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
-                                                                                neondatabase/neon:${{ needs.tag.outputs.build-tag }}
-
-  compute-node-image-arch:
+  compute-node-image:
    needs: [ check-permissions, build-build-tools-image, tag ]
+    runs-on: [ self-hosted, gen3, large ]
+
    strategy:
      fail-fast: false
      matrix:
        version: [ v14, v15, v16 ]
-        arch: [ x64, arm64 ]
-
-    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}

    steps:
      - name: Checkout
@@ -854,14 +829,15 @@ jobs:
          push: true
          pull: true
          file: Dockerfile.compute-node
-          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
+          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
+          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
          tags: |
-            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
-        if: matrix.version == 'v16'
+        if: ${{ matrix.version == 'v16' }}
        uses: docker/build-push-action@v5
        with:
          target: compute-tools-image
@@ -875,57 +851,14 @@ jobs:
          pull: true
          file: Dockerfile.compute-node
          tags: |
-            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}

      - name: Remove custom docker config directory
        if: always()
        run: |
          rm -rf .docker-custom

-  compute-node-image:
-    needs: [ compute-node-image-arch, tag ]
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        version: [ v14, v15, v16 ]
-
-    steps:
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - name: Create multi-arch compute-node image
-        run: |
-          docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
-
-      - name: Create multi-arch compute-tools image
-        if: matrix.version == 'v16'
-        run: |
-          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
-
-      - uses: docker/login-action@v3
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
-        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                                                                neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
-
-      - name: Push multi-arch compute-tools image to ECR
-        if: matrix.version == 'v16'
-        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
-                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
-
  vm-compute-node-image:
    needs: [ check-permissions, tag, compute-node-image ]
    runs-on: [ self-hosted, gen3, large ]
@@ -933,8 +866,11 @@ jobs:
      fail-fast: false
      matrix:
        version: [ v14, v15, v16 ]
+    defaults:
+      run:
+        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.29.3
+      VM_BUILDER_VERSION: v0.28.1

    steps:
      - name: Checkout
@@ -947,48 +883,26 @@ jobs:
          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
          chmod +x vm-builder

-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
      # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
      # it won't have the proper authentication (written at v0.6.0)
      - name: Pulling compute-node image
        run: |
-          docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+          docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Build vm image
        run: |
          ./vm-builder \
            -spec=vm-image-spec.yaml \
-            -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-            -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
+            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Pushing vm-compute-node image
        run: |
-          docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
-
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
+          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

  test-images:
    needs: [ check-permissions, tag, neon-image, compute-node-image ]
-    strategy:
-      fail-fast: false
-      matrix:
-        arch: [ x64, arm64 ]
-
-    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
+    runs-on: [ self-hosted, gen3, small ]

    steps:
      - name: Checkout
@@ -1006,7 +920,7 @@ jobs:
      - name: Verify image versions
        shell: bash # ensure no set -e for better error messages
        run: |
-          pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
+          pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")

          echo "Pageserver version string: $pageserver_version"

@@ -1032,48 +946,78 @@ jobs:

  promote-images:
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
-    runs-on: ubuntu-latest
-
-    env:
-      VERSIONS: v14 v15 v16
+    runs-on: [ self-hosted, gen3, small ]
+    container: golang:1.19-bullseye
+    # Don't add if-condition here.
+    # The job should always run because other jobs depend on it and must not be skipped

    steps:
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - uses: docker/login-action@v3
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - name: Copy vm-compute-node images to ECR
+      - name: Install Crane & ECR helper
        run: |
-          for version in ${VERSIONS}; do
-            docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
-                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
-          done
+          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
+          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
+
+      - name: Configure ECR login
+        run: |
+          mkdir /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+      - name: Copy vm-compute-node images to Docker Hub
+        run: |
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16

      - name: Add latest tag to images
-        if: github.ref_name == 'main'
+        if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
        run: |
-          for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
-            docker buildx imagetools create -t $repo/neon:latest \
-                                               $repo/neon:${{ needs.tag.outputs.build-tag }}
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

-            docker buildx imagetools create -t $repo/compute-tools:latest \
-                                               $repo/compute-tools:${{ needs.tag.outputs.build-tag }}
+      - name: Push images to production ECR
+        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        run: |
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest

-            for version in ${VERSIONS}; do
-              docker buildx imagetools create -t $repo/compute-node-${version}:latest \
-                                                 $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
+      - name: Configure Docker Hub login
+        run: |
+          # ECR Credential Helper & Docker Hub don't work together in config, hence reset
+          echo "" > /github/home/.docker/config.json
+          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io

-              docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
-                                                 $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
-            done
-          done
+      - name: Push vm-compute-node to Docker Hub
+        run: |
+          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
+
+      - name: Push latest tags to Docker Hub
+        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        run: |
+          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+
+      - name: Cleanup ECR folder
+        run: rm -rf ~/.ecr

  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -53,7 +53,7 @@ jobs:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
        cat << EOF > body.md
-          ## Storage & Compute release ${RELEASE_DATE}
+          ## Release ${RELEASE_DATE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1471,21 +1471,26 @@ dependencies = [

 [[package]]
 name = "crossbeam-deque"
-version = "0.8.5"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
+checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
 dependencies = [
+ "cfg-if",
 "crossbeam-epoch",
 "crossbeam-utils",
 ]

 [[package]]
 name = "crossbeam-epoch"
-version = "0.9.18"
+version = "0.9.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
 dependencies = [
+ "autocfg",
+ "cfg-if",
 "crossbeam-utils",
+ "memoffset 0.8.0",
+ "scopeguard",
 ]

 [[package]]
@@ -3956,9 +3961,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"

 [[package]]
 name = "pbkdf2"
-version = "0.12.2"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
+checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
 dependencies = [
 "digest",
 "hmac",
@@ -4105,6 +4110,17 @@ dependencies = [
 "tokio-postgres",
 ]

+[[package]]
+name = "postgres-native-tls"
+version = "0.5.0"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+dependencies = [
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+ "tokio-postgres",
+]
+
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
@@ -4370,7 +4386,6 @@ dependencies = [
 name = "proxy"
 version = "0.1.0"
 dependencies = [
- "ahash",
 "anyhow",
 "async-compression",
 "async-trait",
@@ -4387,7 +4402,6 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
- "crossbeam-deque",
 "dashmap",
 "env_logger",
 "fallible-iterator",
@@ -4412,6 +4426,7 @@ dependencies = [
 "md5",
 "measured",
 "metrics",
+ "native-tls",
 "once_cell",
 "opentelemetry",
 "parking_lot 0.12.1",
@@ -4419,6 +4434,7 @@ dependencies = [
 "parquet_derive",
 "pbkdf2",
 "pin-project-lite",
+ "postgres-native-tls",
 "postgres-protocol",
 "postgres_backend",
 "pq_proto",
@@ -4466,7 +4482,7 @@ dependencies = [
 "utils",
 "uuid",
 "walkdir",
- "webpki-roots 0.26.1",
+ "webpki-roots 0.25.2",
 "workspace_hack",
 "x509-parser",
 ]
@@ -5219,20 +5235,20 @@ dependencies = [
 "hex",
 "histogram",
 "itertools",
+ "native-tls",
 "pageserver",
 "pageserver_api",
+ "postgres-native-tls",
 "postgres_ffi",
 "rand 0.8.5",
 "remote_storage",
 "reqwest 0.12.4",
- "rustls 0.22.4",
 "serde",
 "serde_json",
 "serde_with",
 "thiserror",
 "tokio",
 "tokio-postgres",
- "tokio-postgres-rustls",
 "tokio-rustls 0.25.0",
 "tokio-stream",
 "tokio-util",
@@ -5240,7 +5256,6 @@ dependencies = [
 "tracing-appender",
 "tracing-subscriber",
 "utils",
- "webpki-roots 0.26.1",
 "workspace_hack",
 ]

@@ -5831,15 +5846,6 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"

-[[package]]
-name = "statx-sys"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69c325f46f705b7a66fb87f0ebb999524a7363f30f05d373277b4ef7f409fe87"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "storage_broker"
 version = "0.1.0"
@@ -6263,7 +6269,7 @@ dependencies = [
 [[package]]
 name = "tokio-epoll-uring"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=arpad/statx_sys#ca8446b8edb5e0aef88520f2fc209a13a834fd25"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
 dependencies = [
 "futures",
 "nix 0.26.4",
@@ -6794,12 +6800,11 @@ dependencies = [
 [[package]]
 name = "uring-common"
 version = "0.1.0"
-source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=arpad/statx_sys#ca8446b8edb5e0aef88520f2fc209a13a834fd25"
+source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
 dependencies = [
 "bytes",
 "io-uring",
 "libc",
- "statx-sys",
 ]

 [[package]]
@@ -7468,7 +7473,6 @@ dependencies = [
 name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
- "ahash",
 "anyhow",
 "aws-config",
 "aws-runtime",
@@ -7627,9 +7631,9 @@ dependencies = [

 [[package]]
 name = "zeroize"
-version = "1.7.0"
+version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d"
+checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
 dependencies = [
 "zeroize_derive",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,7 +41,6 @@ license = "Apache-2.0"

 ## All dependency versions, used in the project
 [workspace.dependencies]
-ahash = "0.8"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
@@ -75,7 +74,6 @@ clap = { version = "4.0", features = ["derive"] }
 comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
-crossbeam-deque = "0.8.5"
 crossbeam-utils = "0.8.5"
 dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
@@ -171,7 +169,7 @@ thiserror = "1.0"
 tikv-jemallocator = "0.5"
 tikv-jemalloc-ctl = "0.5"
 tokio = { version = "1.17", features = ["macros"] }
-tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "arpad/statx_sys" }
+tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.11.0"
 tokio-rustls = "0.25"
@@ -191,7 +189,7 @@ url = "2.2"
 urlencoding = "2.1"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
-webpki-roots = "0.26"
+webpki-roots = "0.25"
 x509-parser = "0.15"

 ## TODO replace this with tracing
@@ -200,6 +198,7 @@ log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
@@ -240,7 +239,8 @@ tonic-build = "0.9"

 [patch.crates-io]

-# Needed to get `tokio-postgres-rustls` to depend on our fork.
+# This is only needed for proxy's tests.
+# TODO: we should probably fork `tokio-postgres-rustls` instead.
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 # bug fixes for UUID
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -243,15 +243,12 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 COPY patches/pgvector.patch /pgvector.patch

-# By default, pgvector Makefile uses `-march=native`. We don't want that, 
-# because we build the images on different machines than where we run them.
-# Pass OPTFLAGS="" to remove it.
 RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    patch -p1 < /pgvector.patch && \
-    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control

 #########################################################################################
--- a/compute_tools/src/swap.rs
+++ b/compute_tools/src/swap.rs
@@ -1,5 +1,3 @@
-use std::path::Path;
-
 use anyhow::{anyhow, Context};
 use tracing::warn;

@@ -19,24 +17,17 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
        .arg(size_bytes.to_string())
        .spawn();

+    if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
+        warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
+        return Ok(());
+    }
+
    child_result
        .context("spawn() failed")
        .and_then(|mut child| child.wait().context("wait() failed"))
        .and_then(|status| match status.success() {
            true => Ok(()),
-            false => {
-                // The command failed. Maybe it was because the resize-swap file doesn't exist?
-                // The --once flag causes it to delete itself on success so we don't disable swap
-                // while postgres is running; maybe this is fine.
-                match Path::new(RESIZE_SWAP_BIN).try_exists() {
-                    Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
-                    // The path doesn't exist; we're actually ok 
-                    Ok(false) => {
-                        warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
-                        Ok(())
-                    },
-                }
-            }
+            false => Err(anyhow!("process exited with {status}")),
        })
        // wrap any prior error with the overall context that we couldn't run the command
        .with_context(|| {
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -243,13 +243,9 @@ impl StorageController {
                anyhow::bail!("initdb failed with status {status}");
            }

-            // Write a minimal config file:
-            // - Specify the port, since this is chosen dynamically
-            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
-            //   the storage controller we don't want a slow local disk to interfere with that.
            tokio::fs::write(
                &pg_data_path.join("postgresql.conf"),
-                format!("port = {}\nfsync=off\n", self.postgres_port),
+                format!("port = {}", self.postgres_port),
            )
            .await?;
        };
--- a/deny.toml
+++ b/deny.toml
@@ -99,10 +99,6 @@ name = "async-executor"
 [[bans.deny]]
 name = "smol"

-[[bans.deny]]
-# We want to use rustls instead of native-tls.
-name = "postgres-native-tls"
-
 # This section is considered when running `cargo deny check sources`.
 # More documentation about the 'sources' section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -1,4 +1,4 @@
-ARG REPOSITORY=neondatabase
+ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
 ARG COMPUTE_IMAGE=compute-node-v14
 ARG TAG=latest

--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -8,6 +8,8 @@
 # Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
 # to verify custom image builds (e.g pre-published ones).

+# XXX: Currently does not work on M1 Macs: only x86_64 Docker images are built, and the M1 Docker emulation layer has no seccomp support.
+
 set -eux -o pipefail

 SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,6 +9,7 @@ use std::{
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
+    str::FromStr,
    sync::atomic::AtomicUsize,
    time::{Duration, SystemTime},
 };
@@ -161,22 +162,6 @@ impl std::fmt::Debug for TenantState {
    }
 }

-/// A temporary lease to a specific lsn inside a timeline.
-/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
-#[serde_as]
-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
-pub struct LsnLease {
-    #[serde_as(as = "SystemTimeAsRfc3339Millis")]
-    pub valid_until: SystemTime,
-}
-
-serde_with::serde_conv!(
-    SystemTimeAsRfc3339Millis,
-    SystemTime,
-    |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
-    |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
-);
-
 /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum ActivatingFrom {
@@ -305,7 +290,7 @@ pub struct TenantConfig {
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
    // defer parsing compaction_algorithm, like eviction_policy
-    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
+    pub compaction_algorithm: Option<CompactionAlgorithm>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,
@@ -333,28 +318,14 @@ pub struct TenantConfig {
 /// Unset -> V1
 ///       -> V2
 ///       -> CrossValidation -> V2
-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub enum AuxFilePolicy {
    /// V1 aux file policy: store everything in AUX_FILE_KEY
-    #[strum(ascii_case_insensitive)]
    V1,
    /// V2 aux file policy: store in the AUX_FILE keyspace
-    #[strum(ascii_case_insensitive)]
    V2,
    /// Cross validation runs both formats on the write path and does validation
    /// on the read path.
-    #[strum(ascii_case_insensitive)]
    CrossValidation,
 }

@@ -420,6 +391,23 @@ impl AuxFilePolicy {
    }
 }

+impl FromStr for AuxFilePolicy {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let s = s.to_lowercase();
+        if s == "v1" {
+            Ok(Self::V1)
+        } else if s == "v2" {
+            Ok(Self::V2)
+        } else if s == "crossvalidation" || s == "cross_validation" {
+            Ok(Self::CrossValidation)
+        } else {
+            anyhow::bail!("cannot parse {} to aux file policy", s)
+        }
+    }
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(tag = "kind")]
 pub enum EvictionPolicy {
@@ -438,28 +426,13 @@ impl EvictionPolicy {
    }
 }

-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "kind")]
 pub enum CompactionAlgorithm {
    Legacy,
    Tiered,
 }

-#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
-pub struct CompactionAlgorithmSettings {
-    pub kind: CompactionAlgorithm,
-}
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
    #[serde(with = "humantime_serde")]
@@ -816,8 +789,6 @@ pub enum HistoricLayerInfo {
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
-
-        l0: bool,
    },
    Image {
        layer_file_name: String,
@@ -1416,7 +1387,6 @@ impl PagestreamBeMessage {
 #[cfg(test)]
 mod tests {
    use serde_json::json;
-    use std::str::FromStr;

    use super::*;

@@ -1679,14 +1649,4 @@ mod tests {
            AuxFilePolicy::V2
        ));
    }
-
-    #[test]
-    fn test_aux_parse() {
-        assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
-        assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
-        assert_eq!(
-            AuxFilePolicy::from_str("cross-validation").unwrap(),
-            AuxFilePolicy::CrossValidation
-        );
-    }
 }
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -559,14 +559,6 @@ impl ShardIdentity {
        }
    }

-    /// Obtains the shard number and count combined into a `ShardIndex`.
-    pub fn shard_index(&self) -> ShardIndex {
-        ShardIndex {
-            shard_count: self.count,
-            shard_number: self.number,
-        }
-    }
-
    pub fn shard_slug(&self) -> String {
        if self.count > ShardCount(0) {
            format!("-{:02x}{:02x}", self.number.0, self.count.0)
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -178,13 +178,6 @@ impl PgConnectionConfig {
    }
 }

-impl fmt::Display for PgConnectionConfig {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        // The password is intentionally hidden and not part of this display string.
-        write!(f, "postgresql://{}:{}", self.host, self.port)
-    }
-}
-
 impl fmt::Debug for PgConnectionConfig {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -373,29 +373,31 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
                &[&(repeats as i32)],
            )?;
-            info!(
-                "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
-                client.pg_current_wal_insert_lsn()?,
-                XLOG_SIZE_OF_XLOG_RECORD
-            );
-
-            // Emit the XLOG_SWITCH
-            let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-            let xlog_switch_record_end: PgLsn =
-                client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
-
-            if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
-                != XLOG_SIZE_OF_XLOG_SHORT_PHD
-            {
-                warn!(
-                    "XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
-                    xlog_switch_record_end,
-                    u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
-                );
-                continue;
-            }
-            return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
+            break;
        }
+        info!(
+            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
+            client.pg_current_wal_insert_lsn()?,
+            XLOG_SIZE_OF_XLOG_RECORD
+        );
+
+        // Emit the XLOG_SWITCH
+        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
+        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let next_segment = PgLsn::from(0x0200_0000);
+        ensure!(
+            xlog_switch_record_end < next_segment,
+            "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
+            xlog_switch_record_end,
+            next_segment
+        );
+        ensure!(
+            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
+            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
+            xlog_switch_record_end,
+            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
+        );
+        Ok(vec![before_xlog_switch, xlog_switch_record_end])
    }
 }

--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -29,6 +29,7 @@ use http_types::{StatusCode, Url};
 use tokio_util::sync::CancellationToken;
 use tracing::debug;

+use crate::RemoteStorageActivity;
 use crate::{
    error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
    DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
@@ -525,6 +526,10 @@ impl RemoteStorage for AzureBlobStorage {
        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
        Err(TimeTravelError::Unimplemented)
    }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.concurrency_limiter.activity()
+    }
 }

 pin_project_lite::pin_project! {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -263,6 +263,17 @@ pub trait RemoteStorage: Send + Sync + 'static {
        done_if_after: SystemTime,
        cancel: &CancellationToken,
    ) -> Result<(), TimeTravelError>;
+
+    /// Query how busy we currently are: may be used by callers which wish to politely
+    /// back off if there are already a lot of operations underway.
+    fn activity(&self) -> RemoteStorageActivity;
+}
+
+pub struct RemoteStorageActivity {
+    pub read_available: usize,
+    pub read_total: usize,
+    pub write_available: usize,
+    pub write_total: usize,
 }

 /// DownloadStream is sensitive to the timeout and cancellation used with the original
@@ -444,6 +455,15 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
            }
        }
    }
+
+    pub fn activity(&self) -> RemoteStorageActivity {
+        match self {
+            Self::LocalFs(s) => s.activity(),
+            Self::AwsS3(s) => s.activity(),
+            Self::AzureBlob(s) => s.activity(),
+            Self::Unreliable(s) => s.activity(),
+        }
+    }
 }

 impl GenericRemoteStorage {
@@ -774,6 +794,9 @@ struct ConcurrencyLimiter {
    // The helps to ensure we don't exceed the thresholds.
    write: Arc<Semaphore>,
    read: Arc<Semaphore>,
+
+    write_total: usize,
+    read_total: usize,
 }

 impl ConcurrencyLimiter {
@@ -802,10 +825,21 @@ impl ConcurrencyLimiter {
        Arc::clone(self.for_kind(kind)).acquire_owned().await
    }

+    fn activity(&self) -> RemoteStorageActivity {
+        RemoteStorageActivity {
+            read_available: self.read.available_permits(),
+            read_total: self.read_total,
+            write_available: self.write.available_permits(),
+            write_total: self.write_total,
+        }
+    }
+
    fn new(limit: usize) -> ConcurrencyLimiter {
        Self {
            read: Arc::new(Semaphore::new(limit)),
            write: Arc::new(Semaphore::new(limit)),
+            read_total: limit,
+            write_total: limit,
        }
    }
 }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use utils::crashsafe::path_with_suffix_extension;

 use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorageActivity,
+    TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 use super::{RemoteStorage, StorageMetadata};
@@ -605,6 +605,16 @@ impl RemoteStorage for LocalFs {
    ) -> Result<(), TimeTravelError> {
        Err(TimeTravelError::Unimplemented)
    }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        // LocalFS has no concurrency limiting: give callers the impression that plenty of units are available
+        RemoteStorageActivity {
+            read_available: 16,
+            read_total: 16,
+            write_available: 16,
+            write_total: 16,
+        }
+    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -47,8 +47,8 @@ use utils::backoff;
 use super::StorageMetadata;
 use crate::{
    error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
-    Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
-    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
+    Listing, ListingMode, RemotePath, RemoteStorage, RemoteStorageActivity, S3Config,
+    TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;
@@ -975,6 +975,10 @@ impl RemoteStorage for S3Bucket {
        }
        Ok(())
    }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.concurrency_limiter.activity()
+    }
 }

 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -12,7 +12,7 @@ use tokio_util::sync::CancellationToken;

 use crate::{
    Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
-    StorageMetadata, TimeTravelError,
+    RemoteStorageActivity, StorageMetadata, TimeTravelError,
 };

 pub struct UnreliableWrapper {
@@ -213,4 +213,8 @@ impl RemoteStorage for UnreliableWrapper {
            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
            .await
    }
+
+    fn activity(&self) -> RemoteStorageActivity {
+        self.inner.activity()
+    }
 }
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -50,9 +50,6 @@ pub struct SkTimelineInfo {
    pub safekeeper_connstr: Option<String>,
    #[serde(default)]
    pub http_connstr: Option<String>,
-    // Minimum of all active RO replicas flush LSN
-    #[serde(default = "lsn_invalid")]
-    pub standby_horizon: Lsn,
 }

 #[derive(Debug, Clone, Deserialize, Serialize)]
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -135,8 +135,7 @@ impl Gate {
        let started_at = std::time::Instant::now();
        let mut do_close = std::pin::pin!(self.do_close());

-        // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
-        let nag_after = Duration::from_millis(100);
+        let nag_after = Duration::from_secs(1);

        let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
            return;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -496,9 +496,9 @@ mod tests {
                // TODO: When updating Postgres versions, this test will cause
                // problems. Postgres version in message needs updating.
                //
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -6,14 +6,10 @@ use futures::future::BoxFuture;
 use futures::{Stream, StreamExt};
 use itertools::Itertools;
 use pageserver_api::shard::ShardIdentity;
-use pin_project_lite::pin_project;
-use std::collections::BinaryHeap;
 use std::collections::VecDeque;
+use std::collections::{binary_heap, BinaryHeap};
 use std::fmt::Display;
-use std::future::Future;
-use std::ops::{DerefMut, Range};
-use std::pin::Pin;
-use std::task::{ready, Poll};
+use std::ops::Range;
 use utils::lsn::Lsn;

 pub const PAGE_SZ: u64 = 8192;
@@ -85,33 +81,6 @@ pub fn intersect_keyspace<K: Ord + Clone + Copy>(
    ranges
 }

-/// Create a stream that iterates through all DeltaEntrys among all input
-/// layers, in key-lsn order.
-///
-/// This is public because the create_delta() implementation likely wants to use this too
-/// TODO: move to a more shared place
-pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
-    layers: &'a [E::DeltaLayer],
-    ctx: &'a E::RequestContext,
-) -> MergeDeltaKeys<'a, E> {
-    // Use a binary heap to merge the layers. Each input layer is initially
-    // represented by a LazyLoadLayer::Unloaded element, which uses the start of
-    // the layer's key range as the key. The first time a layer reaches the top
-    // of the heap, all the keys of the layer are loaded into a sorted vector.
-    //
-    // This helps to keep the memory usage reasonable: we only need to hold in
-    // memory the DeltaEntrys of the layers that overlap with the "current" key.
-    let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
-    for l in layers {
-        heap.push(LazyLoadLayer::Unloaded(l));
-    }
-    MergeDeltaKeys {
-        heap,
-        ctx,
-        load_future: None,
-    }
-}
-
 pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>(
    layers: &'a [E::DeltaLayer],
    ctx: &'a E::RequestContext,
@@ -129,104 +98,139 @@ pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>(
    Ok(stream)
 }

-enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
-    Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
-    Unloaded(&'a E::DeltaLayer),
+/// Wrapper type to make `dl.load_keys` compile.
+type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
+
+pub enum LayerIterator<'a, E: CompactionJobExecutor> {
+    Loaded(
+        VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>,
+        &'a E::RequestContext,
+    ),
+    Unloaded(&'a E::DeltaLayer, &'a E::RequestContext),
 }
-impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
-    fn min_key(&self) -> E::Key {
+
+impl<'a, E: CompactionJobExecutor + 'a> LayerIterator<'a, E> {
+    pub fn new(delta_layer: &'a E::DeltaLayer, ctx: &'a E::RequestContext) -> Self {
+        Self::Unloaded(delta_layer, ctx)
+    }
+
+    pub fn key_lsn(&self) -> (E::Key, Lsn) {
        match self {
-            Self::Loaded(entries) => entries.front().unwrap().key(),
-            Self::Unloaded(dl) => dl.key_range().start,
+            Self::Unloaded(dl, _) => (dl.key_range().start, dl.lsn_range().start),
+            Self::Loaded(entries, _) => entries.front().map(|x| (x.key(), x.lsn())).unwrap(),
        }
    }
-    fn min_lsn(&self) -> Lsn {
+
+    async fn load(&mut self) -> anyhow::Result<()> {
        match self {
-            Self::Loaded(entries) => entries.front().unwrap().lsn(),
-            Self::Unloaded(dl) => dl.lsn_range().start,
+            Self::Unloaded(dl, ctx) => {
+                let unloaded_key_lsn = (dl.key_range().start, dl.lsn_range().start);
+                let fut: LoadFuture<
+                    'a,
+                    <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>,
+                > = Box::pin(dl.load_keys(ctx));
+                let keys = VecDeque::from(fut.await?);
+                assert_eq!(
+                    keys.front().as_ref().map(|x| (x.key(), x.lsn())).unwrap(),
+                    unloaded_key_lsn,
+                    "unmatched start key_lsn"
+                );
+                *self = Self::Loaded(keys, ctx);
+                Ok(())
+            }
+            Self::Loaded(_, _) => Ok(()),
+        }
+    }
+
+    pub async fn entry(
+        &mut self,
+    ) -> anyhow::Result<&<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>> {
+        self.load().await?;
+        let Self::Loaded(x, _) = self else {
+            unreachable!()
+        };
+        Ok(x.front().unwrap())
+    }
+
+    pub async fn next(
+        &mut self,
+    ) -> anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>> {
+        self.load().await?; // requires Box::pin to make it compile
+        let Self::Loaded(x, _) = self else {
+            unreachable!()
+        };
+        Ok(x.pop_front().expect("already reached the end"))
+    }
+
+    pub fn is_end(&self) -> bool {
+        match self {
+            Self::Unloaded(_, _) => false,
+            Self::Loaded(x, _) => x.is_empty(),
        }
    }
 }
-impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
+
+impl<'a, E: CompactionJobExecutor + 'a> PartialOrd for LayerIterator<'a, E> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
 }
-impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
+
+impl<'a, E: CompactionJobExecutor + 'a> Ord for LayerIterator<'a, E> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        // reverse order so that we get a min-heap
-        (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn()))
+        // reverse comparison to get a min-heap
+        other.key_lsn().cmp(&self.key_lsn())
    }
 }
-impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
+
+impl<'a, E: CompactionJobExecutor + 'a> PartialEq for LayerIterator<'a, E> {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == std::cmp::Ordering::Equal
    }
 }
-impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}

-type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;
+impl<'a, E: CompactionJobExecutor + 'a> Eq for LayerIterator<'a, E> {}

-// Stream returned by `merge_delta_keys`
-pin_project! {
-#[allow(clippy::type_complexity)]
-pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
-    heap: BinaryHeap<LazyLoadLayer<'a, E>>,
-
-    #[pin]
-    load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,
-
-    ctx: &'a E::RequestContext,
-}
+pub struct DeltaMergeIterator<'a, E: CompactionJobExecutor> {
+    heap: BinaryHeap<LayerIterator<'a, E>>,
 }

-impl<'a, E> Stream for MergeDeltaKeys<'a, E>
-where
-    E: CompactionJobExecutor + 'a,
-{
-    type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;
+impl<'a, E: CompactionJobExecutor + 'a> DeltaMergeIterator<'a, E> {
+    pub fn new(delta_layers: &'a [E::DeltaLayer], ctx: &'a E::RequestContext) -> Self {
+        let mut heap = BinaryHeap::new();
+        for dl in delta_layers {
+            heap.push(LayerIterator::new(dl, ctx));
+        }
+        Self { heap }
+    }

-    fn poll_next(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
-        let mut this = self.project();
-        loop {
-            if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
-                // We are waiting for loading the keys to finish
-                match ready!(load_future.as_mut().poll(cx)) {
-                    Ok(entries) => {
-                        this.load_future.set(None);
-                        *this.heap.peek_mut().unwrap() =
-                            LazyLoadLayer::Loaded(VecDeque::from(entries));
-                    }
-                    Err(e) => {
-                        return Poll::Ready(Some(Err(e)));
-                    }
+    pub fn is_end(&self) -> bool {
+        self.heap.is_empty()
+    }
+
+    /// The next key-lsn entry that will be returned by `next`.
+    pub fn key_lsn(&self) -> (E::Key, Lsn) {
+        self.heap.peek().expect("already reached the end").key_lsn()
+    }
+
+    /// Move to the next entry and return the current entry.
+    pub async fn next(
+        &mut self,
+    ) -> anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>> {
+        let Some(mut top) = self.heap.peek_mut() else {
+            panic!("already reached the end")
+        };
+        match top.next().await {
+            Ok(entry) => {
+                if top.is_end() {
+                    binary_heap::PeekMut::pop(top);
                }
+                Ok(entry)
            }
-
-            // If the topmost layer in the heap hasn't been loaded yet, start
-            // loading it. Otherwise return the next entry from it and update
-            // the layer's position in the heap (this decreaseKey operation is
-            // performed implicitly when `top` is dropped).
-            if let Some(mut top) = this.heap.peek_mut() {
-                match top.deref_mut() {
-                    LazyLoadLayer::Unloaded(ref mut l) => {
-                        let fut = l.load_keys(this.ctx);
-                        this.load_future.set(Some(Box::pin(fut)));
-                        continue;
-                    }
-                    LazyLoadLayer::Loaded(ref mut entries) => {
-                        let result = entries.pop_front().unwrap();
-                        if entries.is_empty() {
-                            std::collections::binary_heap::PeekMut::pop(top);
-                        }
-                        return Poll::Ready(Some(Ok(result)));
-                    }
-                }
-            } else {
-                return Poll::Ready(None);
+            Err(e) => {
+                // Pop the item if there is an error; otherwise it might cause a further panic when the binary heap compares it after `PeekMut` gets dropped.
+                binary_heap::PeekMut::pop(top);
+                Err(e)
            }
        }
    }
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -92,7 +92,9 @@ pub trait CompactionJobExecutor {
    ) -> impl Future<Output = anyhow::Result<()>> + Send;
 }

-pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
+pub trait CompactionKey:
+    std::cmp::Ord + Clone + Copy + std::fmt::Display + std::fmt::Debug
+{
    const MIN: Self;
    const MAX: Self;

--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -2,7 +2,6 @@ mod draw;

 use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};

-use futures::StreamExt;
 use pageserver_api::shard::ShardIdentity;
 use rand::Rng;
 use tracing::info;
@@ -15,7 +14,8 @@ use std::sync::Arc;
 use std::sync::Mutex;

 use crate::helpers::PAGE_SZ;
-use crate::helpers::{merge_delta_keys, overlaps_with};
+use crate::helpers::overlaps_with;
+use crate::helpers::DeltaMergeIterator;

 use crate::interface;
 use crate::interface::CompactionLayer;
@@ -380,8 +380,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
    }
    fn file_size(&self) -> u64 {
        match self {
-            MockLayer::Delta(this) => this.file_size,
-            MockLayer::Image(this) => this.file_size,
+            MockLayer::Delta(this) => this.file_size(),
+            MockLayer::Image(this) => this.file_size(),
        }
    }
    fn short_id(&self) -> String {
@@ -545,12 +545,11 @@ impl interface::CompactionJobExecutor for MockTimeline {
        input_layers: &[Arc<MockDeltaLayer>],
        ctx: &MockRequestContext,
    ) -> anyhow::Result<()> {
-        let mut key_value_stream =
-            std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
+        let mut key_value_stream = DeltaMergeIterator::<MockTimeline>::new(input_layers, ctx);
        let mut records: Vec<MockRecord> = Vec::new();
        let mut total_len = 2;
-        while let Some(delta_entry) = key_value_stream.next().await {
-            let delta_entry: MockRecord = delta_entry?;
+        while !key_value_stream.is_end() {
+            let delta_entry: MockRecord = key_value_stream.next().await?;
            if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
                total_len += delta_entry.len;
                records.push(delta_entry);
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;

 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
 use utils::lsn::Lsn;
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
            #[derive(serde::Serialize)]
            struct Output<'a> {
-                layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
+                layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
                disk_consistent_lsn: Lsn,
                timeline_metadata: &'a TimelineMetadata,
            }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -534,7 +534,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    });
                }
                EvictionLayer::Secondary(layer) => {
-                    let file_size = layer.metadata.file_size;
+                    let file_size = layer.metadata.file_size();

                    js.spawn(async move {
                        layer
@@ -641,7 +641,7 @@ impl EvictionLayer {
    pub(crate) fn get_file_size(&self) -> u64 {
        match self {
            Self::Attached(l) => l.layer_desc().file_size,
-            Self::Secondary(sl) => sl.metadata.file_size,
+            Self::Secondary(sl) => sl.metadata.file_size(),
        }
    }
 }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -257,37 +257,6 @@ paths:
              schema:
                $ref: "#/components/schemas/LsnByTimestampResponse"

-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    post:
-      description: Obtain lease for the given LSN
-      parameters:
-        - name: lsn
-          in: query
-          required: true
-          schema:
-            type: string
-            format: hex
-          description: A LSN to obtain the lease for
-      responses:
-        "200":
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/LsnLease"
-
  /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
    parameters:
      - name: tenant_id
@@ -612,80 +581,6 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"

-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        ŕequired: true
-        schema:
-          type: string
-
-    put:
-      description: |
-        Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`.
-        Current implementation might not be retryable across failure cases, but will be enhanced in future.
-        Detaching should be expected to be expensive operation. Timeouts should be retried.
-      responses:
-        "200":
-          description: |
-            The timeline has been detached from it's ancestor (now or earlier), and at least the returned timelines have been reparented.
-            If any timelines were deleted after reparenting, they might not be on this list.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/AncestorDetached"
-
-        "400":
-          description: |
-            Number of early checks meaning the timeline cannot be detached now:
-              - the ancestor of timeline has an ancestor: not supported, see RFC
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-
-        "404":
-          description: Tenant or timeline not found.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-
-        "409":
-          description: |
-            The timeline can never be detached:
-              - timeline has no ancestor, implying that the timeline has never had an ancestor
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ConflictError"
-
-        "500":
-          description: |
-            Transient error, for example, pageserver shutdown happened while
-            processing the request but we were unable to distinguish that. Must
-            be retried.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-
-        "503":
-          description: |
-            Temporarily unavailable, please retry. Possible reasons:
-              - another timeline detach for the same tenant is underway, please retry later
-              - detected shutdown error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
-
  /v1/tenant/:
    get:
      description: Get tenants list
@@ -1085,15 +980,6 @@ components:
          type: string
          enum: [past, present, future, nodata]

-    LsnLease:
-      type: object
-      required:
-        - valid_until
-      properties:
-        valid_until:
-          type: string
-          format: date-time
-
    PageserverUtilization:
      type: object
      required:
@@ -1151,19 +1037,6 @@ components:
          format: int64
          description: How many bytes of layer content were in the latest layer heatmap

-    AncestorDetached:
-      type: object
-      required:
-        - reparented_timelines
-      properties:
-        reparented_timelines:
-          type: array
-          description: Set of reparented timeline ids
-          properties:
-            type: string
-            format: hex
-            description: TimelineId
-

    Error:
      type: object
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -16,7 +16,6 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
-use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::IngestAuxFilesRequest;
 use pageserver_api::models::ListAuxFilesRequest;
 use pageserver_api::models::LocationConfig;
@@ -75,7 +74,6 @@ use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
-use crate::tenant::GetTimelineError;
 use crate::tenant::SpawnMode;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::{config::PageServerConf, tenant::mgr};
@@ -281,13 +279,6 @@ impl From<GetTenantError> for ApiError {
    }
 }

-impl From<GetTimelineError> for ApiError {
-    fn from(gte: GetTimelineError) -> Self {
-        // Rationale: tenant is activated only after eligble timelines activate
-        ApiError::NotFound(gte.into())
-    }
-}
-
 impl From<GetActiveTenantError> for ApiError {
    fn from(e: GetActiveTenantError) -> ApiError {
        match e {
@@ -395,7 +386,7 @@ async fn build_timeline_info_common(
        let guard = timeline.last_received_wal.lock().unwrap();
        if let Some(info) = guard.as_ref() {
            (
-                Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
+                Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
                Some(info.last_received_msg_lsn),
                Some(info.last_received_msg_ts),
            )
@@ -652,7 +643,9 @@ async fn timeline_preserve_initdb_handler(
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;

-        let timeline = tenant.get_timeline(timeline_id, false)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, false)
+            .map_err(|e| ApiError::NotFound(e.into()))?;

        timeline
            .preserve_initdb_archive()
@@ -694,7 +687,9 @@ async fn timeline_detail_handler(

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-        let timeline = tenant.get_timeline(timeline_id, false)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, false)
+            .map_err(|e| ApiError::NotFound(e.into()))?;

        let timeline_info = build_timeline_info(
            &timeline,
@@ -1706,32 +1701,6 @@ async fn handle_tenant_break(
    json_response(StatusCode::OK, ())
 }

-// Obtains an lsn lease on the given timeline.
-async fn lsn_lease_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let lsn: Lsn = parse_query_param(&request, "lsn")?
-        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-
-    let state = get_state(&request);
-
-    let timeline =
-        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
-            .await?;
-    let result = timeline
-        .make_lsn_lease(lsn, &ctx)
-        .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
-
-    json_response(StatusCode::OK, result)
-}
-
 // Run GC immediately on given timeline.
 async fn timeline_gc_handler(
    mut request: Request<Body>,
@@ -1767,8 +1736,6 @@ async fn timeline_compact_handler(
    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
        flags |= CompactFlags::ForceImageLayerCreation;
    }
-    let wait_until_uploaded =
-        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1777,9 +1744,6 @@ async fn timeline_compact_handler(
            .compact(&cancel, flags, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;
-        if wait_until_uploaded {
-            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
-        }
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1804,8 +1768,6 @@ async fn timeline_checkpoint_handler(
    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
        flags |= CompactFlags::ForceImageLayerCreation;
    }
-    let wait_until_uploaded =
-        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1819,10 +1781,6 @@ async fn timeline_checkpoint_handler(
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;

-        if wait_until_uploaded {
-            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
-        }
-
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1906,11 +1864,14 @@ async fn timeline_detach_ancestor_handler(
        let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
        let ctx = &ctx;

-        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, true)
+            .map_err(|e| ApiError::NotFound(e.into()))?;

        let (_guard, prepared) = timeline
            .prepare_to_detach_from_ancestor(&tenant, options, ctx)
-            .await?;
+            .await
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        let res = state
            .tenant_manager
@@ -2044,7 +2005,9 @@ async fn active_timeline_of_active_tenant(

    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-    Ok(tenant.get_timeline(timeline_id, true)?)
+    tenant
+        .get_timeline(timeline_id, true)
+        .map_err(|e| ApiError::NotFound(e.into()))
 }

 async fn always_panic_handler(
@@ -2308,31 +2271,6 @@ async fn post_tracing_event_handler(
    json_response(StatusCode::OK, ())
 }

-async fn force_aux_policy_switch_handler(
-    mut r: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&r, None)?;
-    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
-    let policy: AuxFilePolicy = json_request(&mut r).await?;
-
-    let state = get_state(&r);
-
-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-    let timeline =
-        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
-            .await?;
-    timeline
-        .do_switch_aux_policy(policy)
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn put_io_engine_handler(
    mut r: Request<Body>,
    _cancel: CancellationToken,
@@ -2410,9 +2348,19 @@ async fn list_aux_files(
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let files = timeline.list_aux_files(body.lsn, &ctx).await?;
-    json_response(StatusCode::OK, files)
+    let process = || async move {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let files = timeline.list_aux_files(body.lsn, &ctx).await?;
+        Ok::<_, anyhow::Error>(files)
+    };
+
+    match process().await {
+        Ok(st) => json_response(StatusCode::OK, st),
+        Err(err) => json_response(
+            StatusCode::INTERNAL_SERVER_ERROR,
+            ApiError::InternalServerError(err).to_string(),
+        ),
+    }
 }

 async fn ingest_aux_files(
@@ -2430,22 +2378,24 @@ async fn ingest_aux_files(
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;

-    let mut modification = timeline.begin_modification(
-        Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
-    );
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    for (fname, content) in body.aux_files {
-        modification
-            .put_file(&fname, content.as_bytes(), &ctx)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-    }
-    modification
-        .commit(&ctx)
-        .await
-        .map_err(ApiError::InternalServerError)?;
+    let process = || async move {
+        let mut modification = timeline.begin_modification(Lsn(
+            timeline.get_last_record_lsn().0 + 8
+        ) /* advance LSN by 8 */);
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        for (fname, content) in body.aux_files {
+            modification
+                .put_file(&fname, content.as_bytes(), &ctx)
+                .await?;
+        }
+        modification.commit(&ctx).await?;
+        Ok::<_, anyhow::Error>(())
+    };

-    json_response(StatusCode::OK, ())
+    match process().await {
+        Ok(st) => json_response(StatusCode::OK, st),
+        Err(err) => Err(ApiError::InternalServerError(err)),
+    }
 }

 /// Report on the largest tenants on this pageserver, for the storage controller to identify
@@ -2751,10 +2701,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
            |r| api_handler(r, get_timestamp_of_lsn_handler),
        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
-            |r| api_handler(r, lsn_lease_handler),
-        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
            |r| api_handler(r, timeline_gc_handler),
@@ -2828,10 +2774,6 @@ pub fn make_router(
            |r| api_handler(r, timeline_collect_keyspace),
        )
        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
-        .put(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
-            |r| api_handler(r, force_aux_policy_switch_handler),
-        )
        .get("/v1/utilization", |r| api_handler(r, get_utilization))
        .post(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -525,15 +525,6 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "pageserver_standby_horizon",
-        "Standby apply LSN for which GC is hold off, by timeline.",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_resident_physical_size",
@@ -1867,6 +1858,7 @@ pub(crate) struct WalIngestMetrics {
    pub(crate) records_received: IntCounter,
    pub(crate) records_committed: IntCounter,
    pub(crate) records_filtered: IntCounter,
+    pub(crate) time_spent_on_ingest: Histogram,
 }

 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -1890,6 +1882,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
        "Number of WAL records filtered out due to sharding"
    )
    .expect("failed to define a metric"),
+    time_spent_on_ingest: register_histogram!(
+        "pageserver_wal_ingest_put_value_seconds",
+        "Actual time spent on ingesting a record",
+        redo_histogram_time_buckets!(),
+    )
+    .expect("failed to define a metric"),
 });

 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -2100,7 +2098,6 @@ pub(crate) struct TimelineMetrics {
    pub garbage_collect_histo: StorageTimeMetrics,
    pub find_gc_cutoffs_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
-    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
@@ -2170,9 +2167,6 @@ impl TimelineMetrics {
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
-        let standby_horizon_gauge = STANDBY_HORIZON
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -2218,7 +2212,6 @@ impl TimelineMetrics {
            find_gc_cutoffs_histo,
            load_layer_map_histo,
            last_record_gauge,
-            standby_horizon_gauge,
            resident_physical_size_gauge,
            current_logical_size_gauge,
            aux_file_size_gauge,
@@ -2253,7 +2246,6 @@ impl TimelineMetrics {
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
-        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -19,7 +19,6 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::ShardIndex;
 use pageserver_api::shard::ShardNumber;
-use pageserver_api::shard::TenantShardId;
 use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
@@ -34,7 +33,6 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use std::time::Instant;
-use std::time::SystemTime;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
@@ -260,8 +258,6 @@ async fn page_service_conn_main(
    socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
    let socket = std::pin::pin!(socket);

-    fail::fail_point!("ps::connection-start::pre-login");
-
    // XXX: pgbackend.run() should take the connection_ctx,
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
@@ -605,7 +601,6 @@ impl PageServerHandler {
            };

            trace!("query: {copy_data_bytes:?}");
-            fail::fail_point!("ps::handle-pagerequest-message");

            // Trace request if needed
            if let Some(t) = tracer.as_mut() {
@@ -620,7 +615,6 @@ impl PageServerHandler {

            let (response, span) = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::exists");
                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
@@ -630,7 +624,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::Nblocks(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::nblocks");
                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
@@ -640,7 +633,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetPage(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::getpage");
                    // shard_id is filled in by the handler
                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
                    (
@@ -651,7 +643,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::DbSize(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::dbsize");
                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
                    (
                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
@@ -661,7 +652,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetSlruSegment(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
@@ -915,39 +905,6 @@ impl PageServerHandler {
        }
    }

-    #[instrument(skip_all, fields(shard_id, %lsn))]
-    async fn handle_make_lsn_lease<IO>(
-        &self,
-        pgb: &mut PostgresBackend<IO>,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<(), QueryError>
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        let shard_selector = ShardSelector::Known(tenant_shard_id.to_index());
-        let timeline = self
-            .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
-            .await?;
-        let lease = timeline.make_lsn_lease(lsn, ctx)?;
-        let valid_until = lease
-            .valid_until
-            .duration_since(SystemTime::UNIX_EPOCH)
-            .map_err(|e| QueryError::Other(e.into()))?;
-
-        pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
-            b"valid_until",
-        )]))?
-        .write_message_noflush(&BeMessage::DataRow(&[Some(
-            &valid_until.as_millis().to_be_bytes(),
-        )]))?
-        .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-
-        Ok(())
-    }
-
    #[instrument(skip_all, fields(shard_id))]
    async fn handle_get_rel_exists_request(
        &mut self,
@@ -1513,7 +1470,6 @@ where
        _pgb: &mut PostgresBackend<IO>,
        _sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
-        fail::fail_point!("ps::connection-start::startup-packet");
        Ok(())
    }

@@ -1528,12 +1484,11 @@ where
            Err(QueryError::SimulatedConnectionError)
        });

-        fail::fail_point!("ps::connection-start::process-query");
-
        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
-        let parts = query_string.split_whitespace().collect::<Vec<_>>();
-        if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
+        if query_string.starts_with("pagestream_v2 ") {
+            let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
+            let params = params_raw.split(' ').collect::<Vec<_>>();
            if params.len() != 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for pagestream command"
@@ -1558,7 +1513,9 @@ where
                ctx,
            )
            .await?;
-        } else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
+        } else if query_string.starts_with("pagestream ") {
+            let (_, params_raw) = query_string.split_at("pagestream ".len());
+            let params = params_raw.split(' ').collect::<Vec<_>>();
            if params.len() != 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for pagestream command"
@@ -1583,7 +1540,10 @@ where
                ctx,
            )
            .await?;
-        } else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
+        } else if query_string.starts_with("basebackup ") {
+            let (_, params_raw) = query_string.split_at("basebackup ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+
            if params.len() < 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for basebackup command"
@@ -1601,23 +1561,26 @@ where

            self.check_permission(Some(tenant_id))?;

-            let lsn = if let Some(lsn_str) = params.get(2) {
+            let lsn = if params.len() >= 3 {
                Some(
-                    Lsn::from_str(lsn_str)
-                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
+                    Lsn::from_str(params[2])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
                )
            } else {
                None
            };

-            let gzip = match params.get(3) {
-                Some(&"--gzip") => true,
-                None => false,
-                Some(third_param) => {
+            let gzip = if params.len() >= 4 {
+                if params[3] == "--gzip" {
+                    true
+                } else {
                    return Err(QueryError::Other(anyhow::anyhow!(
-                        "Parameter in position 3 unknown {third_param}",
-                    )))
+                        "Parameter in position 3 unknown {}",
+                        params[3],
+                    )));
                }
+            } else {
+                false
            };

            let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
@@ -1641,7 +1604,10 @@ where
            res?;
        }
        // return pair of prev_lsn and last_lsn
-        else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
+        else if query_string.starts_with("get_last_record_rlsn ") {
+            let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+
            if params.len() != 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for get_last_record_rlsn command"
@@ -1683,7 +1649,10 @@ where
            .await?;
        }
        // same as basebackup, but result includes relational data as well
-        else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
+        else if query_string.starts_with("fullbackup ") {
+            let (_, params_raw) = query_string.split_at("fullbackup ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+
            if params.len() < 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for fullbackup command"
@@ -1700,18 +1669,18 @@ where
                .record("timeline_id", field::display(timeline_id));

            // The caller is responsible for providing correct lsn and prev_lsn.
-            let lsn = if let Some(lsn_str) = params.get(2) {
+            let lsn = if params.len() > 2 {
                Some(
-                    Lsn::from_str(lsn_str)
-                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
+                    Lsn::from_str(params[2])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
                )
            } else {
                None
            };
-            let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
+            let prev_lsn = if params.len() > 3 {
                Some(
-                    Lsn::from_str(prev_lsn_str)
-                        .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
+                    Lsn::from_str(params[3])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
                )
            } else {
                None
@@ -1744,7 +1713,8 @@ where
            // 2. Run:
            // cat my_backup/base.tar | psql -h $PAGESERVER \
            //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
-            let params = &parts[2..];
+            let (_, params_raw) = query_string.split_at("import basebackup ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
            if params.len() != 5 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for import basebackup command"
@@ -1793,7 +1763,8 @@ where
            //
            // Files are scheduled to be persisted to remote storage, and the
            // caller should poll the http api to check when that is done.
-            let params = &parts[2..];
+            let (_, params_raw) = query_string.split_at("import wal ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
            if params.len() != 4 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for import wal command"
@@ -1831,45 +1802,10 @@ where
            // important because psycopg2 executes "SET datestyle TO 'ISO'"
            // on connect
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("lease lsn ") {
-            let params = &parts[2..];
-            if params.len() != 3 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number {} for lease lsn command",
-                    params.len()
-                )));
-            }
-
-            let tenant_shard_id = TenantShardId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_shard_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_shard_id.tenant_id))?;
-
-            // The caller is responsible for providing correct lsn.
-            let lsn = Lsn::from_str(params[2])
-                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
-
-            match self
-                .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
-                .await
-            {
-                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
-                Err(e) => {
-                    error!("error obtaining lsn lease for {lsn}: {e:?}");
-                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
-                        &e.to_string(),
-                        Some(e.pg_error_code()),
-                    ))?
-                }
-            };
-        } else if let Some(params) = parts.strip_prefix(&["show"]) {
+        } else if query_string.starts_with("show ") {
            // show <tenant_id>
+            let (_, params_raw) = query_string.split_at("show ".len());
+            let params = params_raw.split(' ').collect::<Vec<_>>();
            if params.len() != 1 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for config command"
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,6 +9,7 @@
 use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
+use crate::metrics::WAL_INGEST;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use crate::{aux_file, repository::*};
@@ -1480,24 +1481,11 @@ impl<'a> DatadirModification<'a> {
            // Allowed switch path:
            // * no aux files -> v1/v2/cross-validation
            // * cross-validation->v2
-
-            let current_policy = if current_policy.is_none() {
-                // This path will only be hit once per tenant: we will decide the final policy in this code block.
-                // The next call to `put_file` will always have `last_aux_file_policy != None`.
-                let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
-                let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
-                if aux_files_key_v1.is_empty() {
-                    None
-                } else {
-                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
-                    Some(AuxFilePolicy::V1)
-                }
-            } else {
-                current_policy
-            };
-
            if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
-                self.tline.do_switch_aux_policy(switch_policy)?;
+                self.tline.last_aux_file_policy.store(Some(switch_policy));
+                self.tline
+                    .remote_client
+                    .schedule_index_upload_for_aux_file_policy_update(Some(switch_policy))?;
                info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
                switch_policy
            } else {
@@ -1714,6 +1702,8 @@ impl<'a> DatadirModification<'a> {
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        let mut writer = self.tline.writer().await;

+        let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
+
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

@@ -1753,6 +1743,8 @@ impl<'a> DatadirModification<'a> {
            writer.update_directory_entries_count(kind, count as u64);
        }

+        timer.observe_duration();
+
        Ok(())
    }

@@ -1788,12 +1780,6 @@ impl<'a> DatadirModification<'a> {
        self.tline.get(key, lsn, ctx).await
    }

-    /// Only used during unit tests, force putting a key into the modification.
-    #[cfg(test)]
-    pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
-        self.put(key, val);
-    }
-
    fn put(&mut self, key: Key, val: Value) {
        let values = self.pending_updates.entry(key).or_default();
        // Replace the previous value if it exists at the same lsn
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3964,20 +3964,18 @@ mod tests {

    use super::*;
    use crate::keyspace::KeySpaceAccum;
-    use crate::pgdatadir_mapping::AuxFilesDirectory;
    use crate::repository::{Key, Value};
    use crate::tenant::harness::*;
    use crate::tenant::timeline::CompactFlags;
    use crate::DEFAULT_PG_VERSION;
    use bytes::{Bytes, BytesMut};
    use hex_literal::hex;
-    use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
+    use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
    use pageserver_api::keyspace::KeySpace;
-    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
+    use pageserver_api::models::CompactionAlgorithm;
    use rand::{thread_rng, Rng};
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
-    use utils::bin_ser::BeSer;

    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -5169,9 +5167,7 @@ mod tests {
        compaction_algorithm: CompactionAlgorithm,
    ) -> anyhow::Result<()> {
        let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
-            kind: compaction_algorithm,
-        };
+        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
@@ -5528,9 +5524,7 @@ mod tests {
        compaction_algorithm: CompactionAlgorithm,
    ) -> anyhow::Result<()> {
        let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
-            kind: compaction_algorithm,
-        };
+        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
@@ -6003,130 +5997,6 @@ mod tests {
        );
    }

-    #[tokio::test]
-    async fn aux_file_policy_force_switch() {
-        let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap();
-        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1;
-        let (tenant, ctx) = harness.load().await;
-
-        let mut lsn = Lsn(0x08);
-
-        let tline: Arc<Timeline> = tenant
-            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            None,
-            "no aux file is written so it should be unset"
-        );
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            modification
-                .put_file("pg_logical/mappings/test1", b"first", &ctx)
-                .await
-                .unwrap();
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap();
-
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V2),
-            "dirty index_part.json reflected state is yet to be updated"
-        );
-
-        // lose all data from v1
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
-        assert_eq!(files.get("pg_logical/mappings/test1"), None);
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            modification
-                .put_file("pg_logical/mappings/test2", b"second", &ctx)
-                .await
-                .unwrap();
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        // read data ingested in v2
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
-        assert_eq!(
-            files.get("pg_logical/mappings/test2"),
-            Some(&bytes::Bytes::from_static(b"second"))
-        );
-        // lose all data from v1
-        assert_eq!(files.get("pg_logical/mappings/test1"), None);
-    }
-
-    #[tokio::test]
-    async fn aux_file_policy_auto_detect() {
-        let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap();
-        harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode
-        let (tenant, ctx) = harness.load().await;
-
-        let mut lsn = Lsn(0x08);
-
-        let tline: Arc<Timeline> = tenant
-            .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            None,
-            "no aux file is written so it should be unset"
-        );
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-                files: vec![(
-                    "test_file".to_string(),
-                    Bytes::copy_from_slice(b"test_file"),
-                )]
-                .into_iter()
-                .collect(),
-            })
-            .unwrap();
-            modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        {
-            lsn += 8;
-            let mut modification = tline.begin_modification(lsn);
-            modification
-                .put_file("pg_logical/mappings/test1", b"first", &ctx)
-                .await
-                .unwrap();
-            modification.commit(&ctx).await.unwrap();
-        }
-
-        assert_eq!(
-            tline.last_aux_file_policy.load(),
-            Some(AuxFilePolicy::V1),
-            "keep using v1 because there are aux files writting with v1"
-        );
-
-        // we can still read the auxfile v1
-        let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
-        assert_eq!(
-            files.get("pg_logical/mappings/test1"),
-            Some(&bytes::Bytes::from_static(b"first"))
-        );
-        assert_eq!(
-            files.get("test_file"),
-            Some(&bytes::Bytes::from_static(b"test_file"))
-        );
-    }
-
    #[tokio::test]
    async fn test_metadata_image_creation() -> anyhow::Result<()> {
        let harness = TenantHarness::create("test_metadata_image_creation")?;
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -238,13 +238,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                        io_buf,
                        Err(Error::new(
                            ErrorKind::Other,
-                            format!("blob too large ({len} bytes)"),
+                            format!("blob too large ({} bytes)", len),
                        )),
                    );
                }
-                if len > 0x0fff_ffff {
-                    tracing::warn!("writing blob above future limit ({len} bytes)");
-                }
                let mut len_buf = (len as u32).to_be_bytes();
                len_buf[0] |= 0x80;
                io_buf.extend_from_slice(&len_buf[..]);
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,7 +11,6 @@
 use anyhow::bail;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::CompactionAlgorithm;
-use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
@@ -321,7 +320,7 @@ pub struct TenantConf {
    pub compaction_period: Duration,
    // Level0 delta layer threshold for compaction.
    pub compaction_threshold: usize,
-    pub compaction_algorithm: CompactionAlgorithmSettings,
+    pub compaction_algorithm: CompactionAlgorithm,
    // Determines how much history is retained, to allow
    // branching and read replicas at an older point in time.
    // The unit is #of bytes of WAL.
@@ -407,7 +406,7 @@ pub struct TenantConfOpt {

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
-    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
+    pub compaction_algorithm: Option<CompactionAlgorithm>,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
@@ -498,9 +497,7 @@ impl TenantConfOpt {
                .unwrap_or(global_conf.compaction_threshold),
            compaction_algorithm: self
                .compaction_algorithm
-                .as_ref()
-                .unwrap_or(&global_conf.compaction_algorithm)
-                .clone(),
+                .unwrap_or(global_conf.compaction_algorithm),
            gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
            gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
            image_creation_threshold: self
@@ -553,9 +550,7 @@ impl Default for TenantConf {
            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
                .expect("cannot parse default compaction period"),
            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
-            compaction_algorithm: CompactionAlgorithmSettings {
-                kind: DEFAULT_COMPACTION_ALGORITHM,
-            },
+            compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
            gc_horizon: DEFAULT_GC_HORIZON,
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                .expect("cannot parse default gc period"),
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -7,7 +7,7 @@ use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
 use pageserver_api::shard::{
-    ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId,
+    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
 };
 use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
@@ -127,8 +127,6 @@ pub(crate) enum ShardSelector {
    First,
    /// Pick the shard that holds this key
    Page(Key),
-    /// The shard ID is known: pick the given shard
-    Known(ShardIndex),
 }

 /// A convenience for use with the re_attach ControlPlaneClient function: rather
@@ -2069,11 +2067,6 @@ impl TenantManager {
                                return ShardResolveResult::Found(tenant.clone());
                            }
                        }
-                        ShardSelector::Known(shard)
-                            if tenant.shard_identity.shard_index() == shard =>
-                        {
-                            return ShardResolveResult::Found(tenant.clone());
-                        }
                        _ => continue,
                    }
                }
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1192,7 +1192,7 @@ impl RemoteTimelineClient {
                    &self.storage_impl,
                    uploaded.local_path(),
                    &remote_path,
-                    uploaded.metadata().file_size,
+                    uploaded.metadata().file_size(),
                    cancel,
                )
                .await
@@ -1573,7 +1573,7 @@ impl RemoteTimelineClient {
                        &self.storage_impl,
                        local_path,
                        &remote_path,
-                        layer_metadata.file_size,
+                        layer_metadata.file_size(),
                        &self.cancel,
                    )
                    .measure_remote_op(
@@ -1768,7 +1768,7 @@ impl RemoteTimelineClient {
            UploadOp::UploadLayer(_, m) => (
                RemoteOpFileKind::Layer,
                RemoteOpKind::Upload,
-                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
            ),
            UploadOp::UploadMetadata(_, _) => (
                RemoteOpFileKind::Index,
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -84,7 +84,7 @@ pub async fn download_layer_file<'a>(
    )
    .await?;

-    let expected = layer_metadata.file_size;
+    let expected = layer_metadata.file_size();
    if expected != bytes_amount {
        return Err(DownloadError::Other(anyhow!(
            "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -17,6 +17,46 @@ use pageserver_api::shard::ShardIndex;

 use utils::lsn::Lsn;

+/// Metadata gathered for each of the layer files.
+///
+/// Fields have to be `Option`s because remote [`IndexPart`]s can be from different versions, which
+/// might have less or more metadata depending on whether an upgrade is applied or rolled back.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+//#[cfg_attr(test, derive(Default))]
+pub struct LayerFileMetadata {
+    file_size: u64,
+
+    pub(crate) generation: Generation,
+
+    pub(crate) shard: ShardIndex,
+}
+
+impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
+    fn from(other: &IndexLayerMetadata) -> Self {
+        LayerFileMetadata {
+            file_size: other.file_size,
+            generation: other.generation,
+            shard: other.shard,
+        }
+    }
+}
+
+impl LayerFileMetadata {
+    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
+        LayerFileMetadata {
+            file_size,
+            generation,
+            shard,
+        }
+    }
+
+    pub fn file_size(&self) -> u64 {
+        self.file_size
+    }
+}
+
+// TODO seems like another part of the remote storage file format
+// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
 /// In-memory representation of an `index_part.json` file
 ///
 /// Contains the data about all files in the timeline, present remotely and its metadata.
@@ -37,7 +77,7 @@ pub struct IndexPart {
    ///
    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
    /// that latest version stores.
-    pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
+    pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,

    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
    // It's duplicated for convenience when reading the serialized structure, but is
@@ -87,7 +127,10 @@ impl IndexPart {
        lineage: Lineage,
        last_aux_file_policy: Option<AuxFilePolicy>,
    ) -> Self {
-        let layer_metadata = layers_and_metadata.clone();
+        let layer_metadata = layers_and_metadata
+            .iter()
+            .map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
+            .collect();

        Self {
            version: Self::LATEST_VERSION,
@@ -151,12 +194,9 @@ impl From<&UploadQueueInitialized> for IndexPart {
    }
 }

-/// Metadata gathered for each of the layer files.
-///
-/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
-/// might have less or more metadata depending if upgrading or rolling back an upgrade.
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
-pub struct LayerFileMetadata {
+/// Serialized form of [`LayerFileMetadata`].
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
+pub struct IndexLayerMetadata {
    pub file_size: u64,

    #[serde(default = "Generation::none")]
@@ -168,12 +208,12 @@ pub struct LayerFileMetadata {
    pub shard: ShardIndex,
 }

-impl LayerFileMetadata {
-    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
-        LayerFileMetadata {
-            file_size,
-            generation,
-            shard,
+impl From<&LayerFileMetadata> for IndexLayerMetadata {
+    fn from(other: &LayerFileMetadata) -> Self {
+        IndexLayerMetadata {
+            file_size: other.file_size,
+            generation: other.generation,
+            shard: other.shard,
        }
    }
 }
@@ -267,12 +307,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -309,12 +349,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -352,12 +392,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 2,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -440,12 +480,12 @@ mod tests {
        let expected = IndexPart {
            version: 4,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -482,12 +522,12 @@ mod tests {
        let expected = IndexPart {
            version: 5,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
                    file_size: 23289856,
                    generation: Generation::new(1),
                    shard: ShardIndex::unsharded(),
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
                    file_size: 1015808,
                    generation: Generation::new(1),
                    shard: ShardIndex::unsharded(),
@@ -529,12 +569,12 @@ mod tests {
        let expected = IndexPart {
            version: 6,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -45,10 +45,10 @@ use crate::tenant::{

 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
-use futures::Future;
+use futures::{Future, StreamExt};
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
+use remote_storage::{DownloadError, Etag, GenericRemoteStorage, RemoteStorageActivity};

 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
@@ -67,6 +67,12 @@ use super::{
 /// download, if the uploader populated it.
 const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);

+/// Range of concurrency we may use when downloading layers within a timeline.  This is independent
+/// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
+/// `PageServerConf::secondary_download_concurrency`
+const MAX_LAYER_CONCURRENCY: usize = 16;
+const MIN_LAYER_CONCURRENCY: usize = 1;
+
 pub(super) async fn downloader_task(
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
@@ -75,18 +81,19 @@ pub(super) async fn downloader_task(
    cancel: CancellationToken,
    root_ctx: RequestContext,
 ) {
-    let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
+    // How many tenants' secondary download operations we will run concurrently
+    let tenant_concurrency = tenant_manager.get_conf().secondary_download_concurrency;

    let generator = SecondaryDownloader {
        tenant_manager,
        remote_storage,
        root_ctx,
    };
-    let mut scheduler = Scheduler::new(generator, concurrency);
+    let mut scheduler = Scheduler::new(generator, tenant_concurrency);

    scheduler
        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("secondary_download_scheduler"))
+        .instrument(info_span!("secondary_downloads"))
        .await
 }

@@ -407,7 +414,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                    tracing::warn!("Insufficient space while downloading.  Will retry later.");
                }
                Err(UpdateError::Cancelled) => {
-                    tracing::info!("Shut down while downloading");
+                    tracing::debug!("Shut down while downloading");
                },
                Err(UpdateError::Deserialize(e)) => {
                    tracing::error!("Corrupt content while downloading tenant: {e}");
@@ -562,39 +569,6 @@ impl<'a> TenantDownloader<'a> {
            heatmap.timelines.len()
        );

-        // Get or initialize the local disk state for the timelines we will update
-        let mut timeline_states = HashMap::new();
-        for timeline in &heatmap.timelines {
-            let timeline_state = self
-                .secondary_state
-                .detail
-                .lock()
-                .unwrap()
-                .timelines
-                .get(&timeline.timeline_id)
-                .cloned();
-
-            let timeline_state = match timeline_state {
-                Some(t) => t,
-                None => {
-                    // We have no existing state: need to scan local disk for layers first.
-                    let timeline_state =
-                        init_timeline_state(self.conf, tenant_shard_id, timeline).await;
-
-                    // Re-acquire detail lock now that we're done with async load from local FS
-                    self.secondary_state
-                        .detail
-                        .lock()
-                        .unwrap()
-                        .timelines
-                        .insert(timeline.timeline_id, timeline_state.clone());
-                    timeline_state
-                }
-            };
-
-            timeline_states.insert(timeline.timeline_id, timeline_state);
-        }
-
        // Clean up any local layers that aren't in the heatmap.  We do this first for all timelines, on the general
        // principle that deletions should be done before writes wherever possible, and so that we can use this
        // phase to initialize our SecondaryProgress.
@@ -605,10 +579,6 @@ impl<'a> TenantDownloader<'a> {

        // Download the layers in the heatmap
        for timeline in heatmap.timelines {
-            let timeline_state = timeline_states
-                .remove(&timeline.timeline_id)
-                .expect("Just populated above");
-
            if self.secondary_state.cancel.is_cancelled() {
                tracing::debug!(
                    "Cancelled before downloading timeline {}",
@@ -618,7 +588,7 @@ impl<'a> TenantDownloader<'a> {
            }

            let timeline_id = timeline.timeline_id;
-            self.download_timeline(timeline, timeline_state, ctx)
+            self.download_timeline(timeline, ctx)
                .instrument(tracing::info_span!(
                    "secondary_download_timeline",
                    tenant_id=%tenant_shard_id.tenant_id,
@@ -639,22 +609,6 @@ impl<'a> TenantDownloader<'a> {
                .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
        });

-        // Robustness: we should have updated progress properly, but in case we didn't, make sure
-        // we don't leave the tenant in a state where we claim to have successfully downloaded
-        // everything, but our progress is incomplete.  The invariant here should be that if
-        // we have set `last_download` to this heatmap's etag, then the next time we see that
-        // etag we can safely do no work (i.e. we must be complete).
-        let mut progress = self.secondary_state.progress.lock().unwrap();
-        debug_assert!(progress.layers_downloaded == progress.layers_total);
-        debug_assert!(progress.bytes_downloaded == progress.bytes_total);
-        if progress.layers_downloaded != progress.layers_total
-            || progress.bytes_downloaded != progress.bytes_total
-        {
-            tracing::warn!("Correcting drift in progress stats ({progress:?})");
-            progress.layers_downloaded = progress.layers_total;
-            progress.bytes_downloaded = progress.bytes_total;
-        }
-
        Ok(())
    }

@@ -709,7 +663,7 @@ impl<'a> TenantDownloader<'a> {
                let mut layer_byte_count: u64 = timeline_state
                    .on_disk_layers
                    .values()
-                    .map(|l| l.metadata.file_size)
+                    .map(|l| l.metadata.file_size())
                    .sum();

                // Remove on-disk layers that are no longer present in heatmap
@@ -720,7 +674,7 @@ impl<'a> TenantDownloader<'a> {
                        .get(layer_file_name)
                        .unwrap()
                        .metadata
-                        .file_size;
+                        .file_size();

                    let local_path = local_layer_path(
                        self.conf,
@@ -830,7 +784,6 @@ impl<'a> TenantDownloader<'a> {
    async fn download_timeline(
        &self,
        timeline: HeatMapTimeline,
-        timeline_state: SecondaryDetailTimeline,
        ctx: &RequestContext,
    ) -> Result<(), UpdateError> {
        debug_assert_current_span_has_tenant_and_timeline_id();
@@ -839,8 +792,38 @@ impl<'a> TenantDownloader<'a> {
        // Accumulate updates to the state
        let mut touched = Vec::new();

+        // Clone a view of what layers already exist on disk
+        let timeline_state = self
+            .secondary_state
+            .detail
+            .lock()
+            .unwrap()
+            .timelines
+            .get(&timeline.timeline_id)
+            .cloned();
+
+        let timeline_state = match timeline_state {
+            Some(t) => t,
+            None => {
+                // We have no existing state: need to scan local disk for layers first.
+                let timeline_state =
+                    init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
+
+                // Re-acquire detail lock now that we're done with async load from local FS
+                self.secondary_state
+                    .detail
+                    .lock()
+                    .unwrap()
+                    .timelines
+                    .insert(timeline.timeline_id, timeline_state.clone());
+                timeline_state
+            }
+        };
+
        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());

+        let mut download_futs = Vec::new();
+
        // Download heatmap layers that are not present on local disk, or update their
        // access time if they are already present.
        for layer in timeline.layers {
@@ -877,7 +860,9 @@ impl<'a> TenantDownloader<'a> {
                    }
                }

-                if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
+                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
+                    || on_disk.access_time != layer.access_time
+                {
                    // We already have this layer on disk.  Update its access time.
                    tracing::debug!(
                        "Access time updated for layer {}: {} -> {}",
@@ -913,14 +898,31 @@ impl<'a> TenantDownloader<'a> {
                }
            }

-            match self
-                .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
-                .await?
-            {
-                Some(layer) => touched.push(layer),
-                None => {
-                    // Not an error but we didn't download it: remote layer is missing.  Don't add it to the list of
-                    // things to consider touched.
+            download_futs.push(self.download_layer(
+                tenant_shard_id,
+                &timeline.timeline_id,
+                layer,
+                ctx,
+            ));
+        }
+
+        // Break up layer downloads into chunks, so that for each chunk we can re-check how much
+        // concurrency to use based on activity level of remote storage.
+        while !download_futs.is_empty() {
+            let chunk =
+                download_futs.split_off(download_futs.len().saturating_sub(MAX_LAYER_CONCURRENCY));
+
+            let concurrency = Self::layer_concurrency(self.remote_storage.activity());
+
+            let mut result_stream = futures::stream::iter(chunk).buffered(concurrency);
+            let mut result_stream = std::pin::pin!(result_stream);
+            while let Some(result) = result_stream.next().await {
+                match result {
+                    Err(e) => return Err(e),
+                    Ok(None) => {
+                        // No error, but we didn't download the layer.  Don't mark it touched
+                    }
+                    Ok(Some(layer)) => touched.push(layer),
                }
            }
        }
@@ -951,7 +953,7 @@ impl<'a> TenantDownloader<'a> {
                            tenant_shard_id,
                            &timeline.timeline_id,
                            t.name,
-                            t.metadata.clone(),
+                            LayerFileMetadata::from(&t.metadata),
                            t.access_time,
                            local_path,
                        ));
@@ -985,18 +987,13 @@ impl<'a> TenantDownloader<'a> {
        );

        // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
-        tracing::info!(
-            "Starting download of layer {}, size {}",
-            layer.name,
-            layer.metadata.file_size
-        );
        let downloaded_bytes = match download_layer_file(
            self.conf,
            self.remote_storage,
            *tenant_shard_id,
            *timeline_id,
            &layer.name,
-            &layer.metadata,
+            &LayerFileMetadata::from(&layer.metadata),
            &local_path,
            &self.secondary_state.cancel,
            ctx,
@@ -1012,14 +1009,6 @@ impl<'a> TenantDownloader<'a> {
                    "Skipped downloading missing layer {}, raced with compaction/gc?",
                    layer.name
                );
-
-                // If the layer is 404, adjust the progress statistics to reflect that we will not download it.
-                let mut progress = self.secondary_state.progress.lock().unwrap();
-                progress.layers_total = progress.layers_total.saturating_sub(1);
-                progress.bytes_total = progress
-                    .bytes_total
-                    .saturating_sub(layer.metadata.file_size);
-
                return Ok(None);
            }
            Err(e) => return Err(e.into()),
@@ -1055,6 +1044,19 @@ impl<'a> TenantDownloader<'a> {

        Ok(Some(layer))
    }
+
+    /// Calculate the currently allowed parallelism of layer download tasks, based on activity level of the remote storage
+    fn layer_concurrency(activity: RemoteStorageActivity) -> usize {
+        // When at most 75% of units are available, use minimum concurrency.  Else, do a linear mapping of our
+        // concurrency range to the units available within the remaining 25%.  The division rounds down and can
+        // yield zero (e.g. 76 of 100 units available), which would stall `buffered`, so enforce the minimum.
+        let clamp_at = (activity.read_total * 3) / 4;
+        if activity.read_available <= clamp_at {
+            return MIN_LAYER_CONCURRENCY;
+        }
+        let (spare, span) = (activity.read_available - clamp_at, activity.read_total - clamp_at);
+        (MAX_LAYER_CONCURRENCY * spare / span).max(MIN_LAYER_CONCURRENCY)
+    }
 }

 /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -1144,7 +1146,7 @@ async fn init_timeline_state(
                                    tenant_shard_id,
                                    &heatmap.timeline_id,
                                    name,
-                                    remote_meta.metadata.clone(),
+                                    LayerFileMetadata::from(&remote_meta.metadata),
                                    remote_meta.access_time,
                                    file_path,
                                ),
@@ -1178,3 +1180,58 @@ async fn init_timeline_state(

    detail
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn layer_concurrency() {
+        // Totally idle: all read units are available, so we expect the maximum concurrency
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 16,
+                read_total: 16,
+                write_available: 16,
+                write_total: 16
+            }),
+            MAX_LAYER_CONCURRENCY
+        );
+
+        // Totally busy: no read units are available, so we expect the minimum concurrency
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 0,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MIN_LAYER_CONCURRENCY
+        );
+
+        // Edge of the range at which we interpolate: exactly 75% available still gets the minimum
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 12,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MIN_LAYER_CONCURRENCY
+        );
+
+        // Midpoint of the range in which we interpolate: halfway between 75% and 100% available
+        assert_eq!(
+            TenantDownloader::layer_concurrency(RemoteStorageActivity {
+                read_available: 14,
+                read_total: 16,
+
+                write_available: 16,
+                write_total: 16
+            }),
+            MAX_LAYER_CONCURRENCY / 2
+        );
+    }
+}
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,6 +1,6 @@
 use std::time::SystemTime;

-use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
+use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};

 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -38,7 +38,7 @@ pub(crate) struct HeatMapTimeline {
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
    pub(super) name: LayerName,
-    pub(super) metadata: LayerFileMetadata,
+    pub(super) metadata: IndexLayerMetadata,

    #[serde_as(as = "TimestampSeconds<i64>")]
    pub(super) access_time: SystemTime,
@@ -49,7 +49,7 @@ pub(crate) struct HeatMapLayer {
 impl HeatMapLayer {
    pub(crate) fn new(
        name: LayerName,
-        metadata: LayerFileMetadata,
+        metadata: IndexLayerMetadata,
        access_time: SystemTime,
    ) -> Self {
        Self {
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(

    scheduler
        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("heatmap_upload_scheduler"))
+        .instrument(info_span!("heatmap_uploader"))
        .await
 }

--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -179,13 +179,6 @@ where
            // Schedule some work, if concurrency limit permits it
            self.spawn_pending();

-            // This message is printed every scheduling iteration as proof of liveness when looking at logs
-            tracing::info!(
-                "Status: {} tasks running, {} pending",
-                self.running.len(),
-                self.pending.len()
-            );
-
            // Between scheduling iterations, we will:
            //  - Drain any complete tasks and spawn pending tasks
            //  - Handle incoming administrative commands
@@ -265,11 +258,7 @@ where

        self.tasks.spawn(fut);

-        let replaced = self.running.insert(tenant_shard_id, in_progress);
-        debug_assert!(replaced.is_none());
-        if replaced.is_some() {
-            tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
-        }
+        self.running.insert(tenant_shard_id, in_progress);
    }

    /// For all pending tenants that are elegible for execution, spawn their task.
@@ -279,9 +268,7 @@ where
        while !self.pending.is_empty() && self.running.len() < self.concurrency {
            // unwrap: loop condition includes !is_empty()
            let pending = self.pending.pop_front().unwrap();
-            if !self.running.contains_key(pending.get_tenant_shard_id()) {
-                self.do_spawn(pending);
-            }
+            self.do_spawn(pending);
        }
    }

@@ -334,8 +321,7 @@ where

        let tenant_shard_id = job.get_tenant_shard_id();
        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                           "Command already running, waiting for it");
+            tracing::info!("Command already running, waiting for it");
            barrier
        } else {
            let running = self.spawn_now(job);
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -47,7 +47,7 @@ use hex;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
-use pageserver_api::shard::{ShardIdentity, TenantShardId};
+use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::File;
@@ -473,7 +473,7 @@ impl ImageLayerInner {
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        let reads = self
-            .plan_reads(keyspace, None, ctx)
+            .plan_reads(keyspace, ctx)
            .await
            .map_err(GetVectoredError::Other)?;

@@ -485,15 +485,9 @@ impl ImageLayerInner {
        Ok(())
    }

-    /// Traverse the layer's index to build read operations on the overlap of the input keyspace
-    /// and the keys in this layer.
-    ///
-    /// If shard_identity is provided, it will be used to filter keys down to those stored on
-    /// this shard.
    async fn plan_reads(
        &self,
        keyspace: KeySpace,
-        shard_identity: Option<&ShardIdentity>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<VectoredRead>> {
        let mut planner = VectoredReadPlanner::new(
@@ -513,6 +507,7 @@ impl ImageLayerInner {

        for range in keyspace.ranges.iter() {
            let mut range_end_handled = false;
+
            let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
            range.start.write_to_byte_slice(&mut search_key);

@@ -525,22 +520,12 @@ impl ImageLayerInner {
                let key = Key::from_slice(&raw_key[..KEY_SIZE]);
                assert!(key >= range.start);

-                let flag = if let Some(shard_identity) = shard_identity {
-                    if shard_identity.is_key_disposable(&key) {
-                        BlobFlag::Ignore
-                    } else {
-                        BlobFlag::None
-                    }
-                } else {
-                    BlobFlag::None
-                };
-
                if key >= range.end {
                    planner.handle_range_end(offset);
                    range_end_handled = true;
                    break;
                } else {
-                    planner.handle(key, self.lsn, offset, flag);
+                    planner.handle(key, self.lsn, offset, BlobFlag::None);
                }
            }

@@ -553,50 +538,6 @@ impl ImageLayerInner {
        Ok(planner.finish())
    }

-    /// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
-    /// then execute vectored GET operations, passing the results of all read keys into the writer.
-    pub(super) async fn filter(
-        &self,
-        shard_identity: &ShardIdentity,
-        writer: &mut ImageLayerWriter,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<usize> {
-        // Fragment the range into the regions owned by this ShardIdentity
-        let plan = self
-            .plan_reads(
-                KeySpace {
-                    // If asked for the total key space, plan_reads will give us all the keys in the layer
-                    ranges: vec![Key::MIN..Key::MAX],
-                },
-                Some(shard_identity),
-                ctx,
-            )
-            .await?;
-
-        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
-        let mut key_count = 0;
-        for read in plan.into_iter() {
-            let buf_size = read.size();
-
-            let buf = BytesMut::with_capacity(buf_size);
-            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
-
-            let frozen_buf = blobs_buf.buf.freeze();
-
-            for meta in blobs_buf.blobs.iter() {
-                let img_buf = frozen_buf.slice(meta.start..meta.end);
-
-                key_count += 1;
-                writer
-                    .put_image(meta.meta.key, img_buf, ctx)
-                    .await
-                    .context(format!("Storing key {}", meta.meta.key))?;
-            }
-        }
-
-        Ok(key_count)
-    }
-
    async fn do_reads_and_update_state(
        &self,
        reads: Vec<VectoredRead>,
@@ -709,7 +650,7 @@ impl ImageLayerWriterInner {
                lsn,
            },
        );
-        trace!("creating image layer {}", path);
+        info!("new image layer {path}");
        let mut file = {
            VirtualFile::open_with_options(
                &path,
@@ -829,7 +770,7 @@ impl ImageLayerWriterInner {
        // FIXME: why not carry the virtualfile here, it supports renaming?
        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        info!("created image layer {}", layer.local_path());
+        trace!("created image layer {}", layer.local_path());

        Ok(layer)
    }
@@ -914,136 +855,3 @@ impl Drop for ImageLayerWriter {
        }
    }
 }
-
-#[cfg(test)]
-mod test {
-    use bytes::Bytes;
-    use pageserver_api::{
-        key::Key,
-        shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
-    };
-    use utils::{id::TimelineId, lsn::Lsn};
-
-    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
-
-    use super::ImageLayerWriter;
-
-    #[tokio::test]
-    async fn image_layer_rewrite() {
-        let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        // The LSN at which we will create an image layer to filter
-        let lsn = Lsn(0xdeadbeef0000);
-
-        let timeline_id = TimelineId::generate();
-        let timeline = tenant
-            .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
-        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
-        let range = input_start..input_end;
-
-        // Build an image layer to filter
-        let resident = {
-            let mut writer = ImageLayerWriter::new(
-                harness.conf,
-                timeline_id,
-                harness.tenant_shard_id,
-                &range,
-                lsn,
-                &ctx,
-            )
-            .await
-            .unwrap();
-
-            let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
-            let mut key = range.start;
-            while key < range.end {
-                writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
-
-                key = key.next();
-            }
-            writer.finish(&timeline, &ctx).await.unwrap()
-        };
-        let original_size = resident.metadata().file_size;
-
-        // Filter for various shards: this exercises cases like values at start of key range, end of key
-        // range, middle of key range.
-        for shard_number in 0..4 {
-            let mut filtered_writer = ImageLayerWriter::new(
-                harness.conf,
-                timeline_id,
-                harness.tenant_shard_id,
-                &range,
-                lsn,
-                &ctx,
-            )
-            .await
-            .unwrap();
-
-            // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
-            // to exercise filter()
-            let shard_identity = ShardIdentity::new(
-                ShardNumber(shard_number),
-                ShardCount::new(4),
-                ShardStripeSize(0x8000),
-            )
-            .unwrap();
-
-            let wrote_keys = resident
-                .filter(&shard_identity, &mut filtered_writer, &ctx)
-                .await
-                .unwrap();
-            let replacement = if wrote_keys > 0 {
-                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
-            } else {
-                None
-            };
-
-            // This exact size and those below will need updating as/when the layer encoding changes, but
-            // should be deterministic for a given version of the format, as we used no randomness generating the input.
-            assert_eq!(original_size, 1597440);
-
-            match shard_number {
-                0 => {
-                    // We should have written out just one stripe for our shard identity
-                    assert_eq!(wrote_keys, 0x8000);
-                    let replacement = replacement.unwrap();
-
-                    // We should have dropped some of the data
-                    assert!(replacement.metadata().file_size < original_size);
-                    assert!(replacement.metadata().file_size > 0);
-
-                    // Assert that we dropped ~3/4 of the data.
-                    assert_eq!(replacement.metadata().file_size, 417792);
-                }
-                1 => {
-                    // Shard 1 has no keys in our input range
-                    assert_eq!(wrote_keys, 0x0);
-                    assert!(replacement.is_none());
-                }
-                2 => {
-                    // Shard 2 has one stripes in the input range
-                    assert_eq!(wrote_keys, 0x8000);
-                    let replacement = replacement.unwrap();
-                    assert!(replacement.metadata().file_size < original_size);
-                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 417792);
-                }
-                3 => {
-                    // Shard 3 has two stripes in the input range
-                    assert_eq!(wrote_keys, 0x10000);
-                    let replacement = replacement.unwrap();
-                    assert!(replacement.metadata().file_size < original_size);
-                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 811008);
-                }
-                _ => unreachable!(),
-            }
-        }
-    }
-}
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::{
    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
 };
-use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
+use pageserver_api::shard::{ShardIndex, TenantShardId};
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
@@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime};
 use tracing::Instrument;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
-use utils::sync::{gate, heavier_once_cell};
+use utils::sync::heavier_once_cell;

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

 use super::delta_layer::{self, DeltaEntry};
-use super::image_layer::{self};
+use super::image_layer;
 use super::{
-    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-    PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
+    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
+    ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };

 use utils::generation::Generation;
@@ -161,7 +161,7 @@ impl Layer {
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size,
+            metadata.file_size(),
        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
@@ -194,7 +194,7 @@ impl Layer {
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size,
+            metadata.file_size(),
        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
@@ -227,7 +227,7 @@ impl Layer {

        timeline
            .metrics
-            .resident_physical_size_add(metadata.file_size);
+            .resident_physical_size_add(metadata.file_size());

        ResidentLayer { downloaded, owner }
    }
@@ -1264,7 +1264,6 @@ impl LayerInner {
                lsn_end: lsn_range.end,
                remote: !resident,
                access_stats,
-                l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
            }
        } else {
            let lsn = self.desc.image_layer_lsn();
@@ -1333,7 +1332,7 @@ impl LayerInner {

        is_good_to_continue(&rx.borrow_and_update())?;

-        let Ok(gate) = timeline.gate.enter() else {
+        let Ok(_gate) = timeline.gate.enter() else {
            return Err(EvictionCancelled::TimelineGone);
        };

@@ -1421,7 +1420,7 @@ impl LayerInner {
        Self::spawn_blocking(move || {
            let _span = span.entered();

-            let res = self.evict_blocking(&timeline, &gate, &permit);
+            let res = self.evict_blocking(&timeline, &permit);

            let waiters = self.inner.initializer_count();

@@ -1447,7 +1446,6 @@ impl LayerInner {
    fn evict_blocking(
        &self,
        timeline: &Timeline,
-        _gate: &gate::GateGuard,
        _permit: &heavier_once_cell::InitPermit,
    ) -> Result<(), EvictionCancelled> {
        // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
@@ -1802,15 +1800,16 @@ impl ResidentLayer {
        use LayerKind::*;

        let owner = &self.owner.0;
+
        match self.downloaded.get(owner, ctx).await? {
            Delta(ref d) => {
-                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
-                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
-                // while it's being held.
                owner
                    .access_stats
                    .record_access(LayerAccessKind::KeyIter, ctx);

+                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
+                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
+                // while it's being held.
                delta_layer::DeltaLayerInner::load_keys(d, ctx)
                    .await
                    .with_context(|| format!("Layer index is corrupted for {self}"))
@@ -1819,23 +1818,6 @@ impl ResidentLayer {
        }
    }

-    /// Read all they keys in this layer which match the ShardIdentity, and write them all to
-    /// the provided writer.  Return the number of keys written.
-    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
-    pub(crate) async fn filter<'a>(
-        &'a self,
-        shard_identity: &ShardIdentity,
-        writer: &mut ImageLayerWriter,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<usize> {
-        use LayerKind::*;
-
-        match self.downloaded.get(&self.owner.0, ctx).await? {
-            Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")),
-            Image(i) => i.filter(shard_identity, writer, ctx).await,
-        }
-    }
-
    /// Returns the amount of keys and values written to the writer.
    pub(crate) async fn copy_delta_prefix(
        &self,
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -347,33 +347,37 @@ impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
 mod test {
    use super::*;
    #[test]
-    fn image_layer_parse() {
+    fn image_layer_parse() -> anyhow::Result<()> {
        let expected = LayerName::Image(ImageLayerName {
            key_range: Key::from_i128(0)
                ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
            lsn: Lsn::from_hex("00000000014FED58").unwrap(),
        });
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
        assert_eq!(parsed, expected,);

        // Omitting generation suffix is valid
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
        assert_eq!(parsed, expected,);
+
+        Ok(())
    }

    #[test]
-    fn delta_layer_parse() {
+    fn delta_layer_parse() -> anyhow::Result<()> {
        let expected = LayerName::Delta(DeltaLayerName {
            key_range: Key::from_i128(0)
                ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
            lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
                ..Lsn::from_hex("000000000154C481").unwrap(),
        });
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
        assert_eq!(parsed, expected);

        // Omitting generation suffix is valid
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
        assert_eq!(parsed, expected);
+
+        Ok(())
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,9 +23,9 @@ use pageserver_api::{
    },
    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
    models::{
-        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings,
-        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
-        InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState,
+        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
+        DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
+        TimelineState,
    },
    reltag::BlockNumber,
    shard::{ShardIdentity, ShardNumber, TenantShardId},
@@ -41,7 +41,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
-    fs_ext,
    sync::gate::{Gate, GateGuard},
    vec_map::VecMap,
 };
@@ -61,7 +60,6 @@ use std::{
    ops::ControlFlow,
 };

-use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
 use crate::{
    aux_file::AuxFileSizeEstimator,
    tenant::{
@@ -90,6 +88,9 @@ use crate::{
    metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
+use crate::{
+    pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata,
+};
 use crate::{
    pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
    virtual_file::{MaybeFatalIo, VirtualFile},
@@ -268,8 +269,6 @@ pub struct Timeline {
    // Atomic would be more appropriate here.
    last_freeze_ts: RwLock<Instant>,

-    pub(crate) standby_horizon: AtomicLsn,
-
    // WAL redo manager. `None` only for broken tenants.
    walredo_mgr: Option<Arc<super::WalRedoManager>>,

@@ -1423,7 +1422,7 @@ impl Timeline {
        let layer_map = guard.layer_map();
        let mut size = 0;
        for l in layer_map.iter_historic_layers() {
-            size += l.file_size;
+            size += l.file_size();
        }
        size
    }
@@ -1531,20 +1530,6 @@ impl Timeline {
        Ok(())
    }

-    /// Obtains a temporary lease blocking garbage collection for the given LSN
-    pub(crate) fn make_lsn_lease(
-        &self,
-        _lsn: Lsn,
-        _ctx: &RequestContext,
-    ) -> anyhow::Result<LsnLease> {
-        const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60);
-        let lease = LsnLease {
-            valid_until: SystemTime::now() + LEASE_LENGTH,
-        };
-        // TODO: dummy implementation
-        Ok(lease)
-    }
-
    /// Flush to disk all data that was written with the put_* functions
    #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
    pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
@@ -1699,7 +1684,7 @@ impl Timeline {
            return Ok(());
        }

-        match self.get_compaction_algorithm_settings().kind {
+        match self.get_compaction_algorithm() {
            CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
            CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await,
        }
@@ -2095,14 +2080,12 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

-    fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
+    fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
        let tenant_conf = &self.tenant_conf.load();
        tenant_conf
            .tenant_conf
            .compaction_algorithm
-            .as_ref()
-            .unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm)
-            .clone()
+            .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
    }

    fn get_eviction_policy(&self) -> EvictionPolicy {
@@ -2296,8 +2279,6 @@ impl Timeline {
                compaction_lock: tokio::sync::Mutex::default(),
                gc_lock: tokio::sync::Mutex::default(),

-                standby_horizon: AtomicLsn::new(0),
-
                timeline_get_throttle: resources.timeline_get_throttle,

                aux_files: tokio::sync::Mutex::new(AuxFilesState {
@@ -2453,6 +2434,8 @@ impl Timeline {
        let span = tracing::Span::current();

        // Copy to move into the task we're about to spawn
+        let generation = self.generation;
+        let shard = self.get_shard_index();
        let this = self.myself.upgrade().expect("&self method holds the arc");

        let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
@@ -2466,14 +2449,11 @@ impl Timeline {

                for discovered in discovered {
                    let (name, kind) = match discovered {
-                        Discovered::Layer(layer_file_name, local_metadata) => {
-                            discovered_layers.push((layer_file_name, local_metadata));
+                        Discovered::Layer(layer_file_name, local_path, file_size) => {
+                            discovered_layers.push((layer_file_name, local_path, file_size));
                            continue;
                        }
-                        Discovered::IgnoredBackup(path) => {
-                            std::fs::remove_file(path)
-                                .or_else(fs_ext::ignore_not_found)
-                                .fatal_err("Removing .old file");
+                        Discovered::IgnoredBackup => {
                            continue;
                        }
                        Discovered::Unknown(file_name) => {
@@ -2499,8 +2479,13 @@ impl Timeline {
                    );
                }

-                let decided =
-                    init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn);
+                let decided = init::reconcile(
+                    discovered_layers,
+                    index_part.as_ref(),
+                    disk_consistent_lsn,
+                    generation,
+                    shard,
+                );

                let mut loaded_layers = Vec::new();
                let mut needs_cleanup = Vec::new();
@@ -2508,6 +2493,21 @@ impl Timeline {

                for (name, decision) in decided {
                    let decision = match decision {
+                        Ok(UseRemote { local, remote }) => {
+                            // Remote is authoritative, but we may still choose to retain
+                            // the local file if the contents appear to match
+                            if local.metadata.file_size() == remote.file_size() {
+                                // Use the local file, but take the remote metadata so that we pick up
+                                // the correct generation.
+                                UseLocal(LocalLayerFileMetadata {
+                                    metadata: remote,
+                                    local_path: local.local_path,
+                                })
+                            } else {
+                                init::cleanup_local_file_for_remote(&local, &remote)?;
+                                UseRemote { local, remote }
+                            }
+                        }
                        Ok(decision) => decision,
                        Err(DismissedLayer::Future { local }) => {
                            if let Some(local) = local {
@@ -2525,11 +2525,6 @@ impl Timeline {
                            // this file never existed remotely, we will have to do rework
                            continue;
                        }
-                        Err(DismissedLayer::BadMetadata(local)) => {
-                            init::cleanup_local_file_for_remote(&local)?;
-                            // this file never existed remotely, we will have to do rework
-                            continue;
-                        }
                    };

                    match &name {
@@ -2540,12 +2535,14 @@ impl Timeline {
                    tracing::debug!(layer=%name, ?decision, "applied");

                    let layer = match decision {
-                        Resident { local, remote } => {
-                            total_physical_size += local.file_size;
-                            Layer::for_resident(conf, &this, local.local_path, name, remote)
+                        UseLocal(local) => {
+                            total_physical_size += local.metadata.file_size();
+                            Layer::for_resident(conf, &this, local.local_path, name, local.metadata)
                                .drop_eviction_guard()
                        }
-                        Evicted(remote) => Layer::for_evicted(conf, &this, name, remote),
+                        Evicted(remote) | UseRemote { remote, .. } => {
+                            Layer::for_evicted(conf, &this, name, remote)
+                        }
                    };

                    loaded_layers.push(layer);
@@ -3054,7 +3051,7 @@ impl Timeline {

            HeatMapLayer::new(
                layer.layer_desc().layer_name(),
-                layer.metadata(),
+                (&layer.metadata()).into(),
                last_activity_ts,
            )
        });
@@ -4330,7 +4327,7 @@ impl Timeline {
        let delta_file_accessed = reconstruct_state.get_delta_layers_visited();

        let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
-        debug!(
+        info!(
            "generate image layers for metadata keys: trigger_generation={trigger_generation}, \
                delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
                total_key_retrieved={total_key_retrieved}"
@@ -4591,14 +4588,6 @@ impl Timeline {
    ) -> Result<Vec<TimelineId>, anyhow::Error> {
        detach_ancestor::complete(self, tenant, prepared, ctx).await
    }
-
-    /// Switch aux file policy and schedule upload to the index part.
-    pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> {
-        self.last_aux_file_policy.store(Some(policy));
-        self.remote_client
-            .schedule_index_upload_for_aux_file_policy_update(Some(policy))?;
-        Ok(())
-    }
 }

 /// Top-level failure to compact.
@@ -4708,16 +4697,11 @@ impl Timeline {

    async fn rewrite_layers(
        self: &Arc<Self>,
-        mut replace_layers: Vec<(Layer, ResidentLayer)>,
-        mut drop_layers: Vec<Layer>,
+        replace_layers: Vec<(Layer, ResidentLayer)>,
+        drop_layers: Vec<Layer>,
    ) -> anyhow::Result<()> {
        let mut guard = self.layers.write().await;

-        // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
-        // to avoid double-removing, and avoid rewriting something that was removed.
-        replace_layers.retain(|(l, _)| guard.contains(l));
-        drop_layers.retain(|l| guard.contains(l));
-
        guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics);

        let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect();
@@ -4860,32 +4844,7 @@ impl Timeline {
            (horizon_cutoff, pitr_cutoff, retain_lsns)
        };

-        let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
-        let standby_horizon = self.standby_horizon.load();
-        // Hold GC for the standby, but as a safety guard do it only within some
-        // reasonable lag.
-        if standby_horizon != Lsn::INVALID {
-            if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) {
-                const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB
-                if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG {
-                    new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff);
-                    trace!("holding off GC for standby apply LSN {}", standby_horizon);
-                } else {
-                    warn!(
-                        "standby is lagging for more than {}MB, not holding gc for it",
-                        MAX_ALLOWED_STANDBY_LAG / 1024 / 1024
-                    )
-                }
-            }
-        }
-
-        // Reset standby horizon to ignore it if it is not updated till next GC.
-        // It is an easy way to unset it when standby disappears without adding
-        // more conf options.
-        self.standby_horizon.store(Lsn::INVALID);
-        self.metrics
-            .standby_horizon_gauge
-            .set(Lsn::INVALID.0 as i64);
+        let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);

        let res = self
            .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff)
@@ -5592,6 +5551,26 @@ fn is_send() {
    _assert_send::<TimelineWriter<'_>>();
 }

+/// Add a suffix to a layer file's name: .{num}.old
+/// Uses the first available num (starts at 0)
+fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> {
+    let filename = path
+        .file_name()
+        .ok_or_else(|| anyhow!("Path {path} don't have a file name"))?;
+    let mut new_path = path.to_owned();
+
+    for i in 0u32.. {
+        new_path.set_file_name(format!("{filename}.{i}.old"));
+        if !new_path.exists() {
+            std::fs::rename(path, &new_path)
+                .with_context(|| format!("rename {path:?} to {new_path:?}"))?;
+            return Ok(());
+        }
+    }
+
+    bail!("couldn't find an unused backup number for {:?}", path)
+}
+
 #[cfg(test)]
 mod tests {
    use utils::{id::TimelineId, lsn::Lsn};
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -9,10 +9,7 @@ use std::ops::{Deref, Range};
 use std::sync::Arc;

 use super::layer_manager::LayerManager;
-use super::{
-    CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
-    RecordedDuration, Timeline,
-};
+use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};

 use anyhow::{anyhow, Context};
 use enumset::EnumSet;
@@ -25,13 +22,14 @@ use tracing::{debug, info, info_span, trace, warn, Instrument};
 use utils::id::TimelineId;

 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
-use crate::page_cache;
 use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
-use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
+use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
 use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
+use crate::{page_cache, ZERO_PAGE};

 use crate::keyspace::KeySpace;
 use crate::repository::Key;
@@ -176,24 +174,13 @@ impl Timeline {
    async fn compact_shard_ancestors(
        self: &Arc<Self>,
        rewrite_max: usize,
-        ctx: &RequestContext,
+        _ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut drop_layers = Vec::new();
-        let mut layers_to_rewrite: Vec<Layer> = Vec::new();
+        let layers_to_rewrite: Vec<Layer> = Vec::new();

-        // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
-        // layer is behind this Lsn, it indicates that the layer is being retained beyond the
-        // pitr_interval, for example because a branchpoint references it.
-        //
-        // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
-        // are rewriting layers.
-        let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
-
-        tracing::info!(
-            "latest_gc_cutoff: {}, pitr cutoff {}",
-            *latest_gc_cutoff,
-            self.gc_info.read().unwrap().cutoffs.pitr
-        );
+        // We will use the PITR cutoff as a condition for rewriting layers.
+        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;

        let layers = self.layers.read().await;
        for layer_desc in layers.layer_map().iter_historic_layers() {
@@ -252,9 +239,9 @@ impl Timeline {

            // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
            // without incurring the I/O cost of a rewrite.
-            if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
-                debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
-                    layer_desc.get_lsn_range().end, *latest_gc_cutoff);
+            if layer_desc.get_lsn_range().end >= pitr_cutoff {
+                debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
+                    layer_desc.get_lsn_range().end, pitr_cutoff);
                continue;
            }

@@ -264,10 +251,13 @@ impl Timeline {
                continue;
            }

-            // Only rewrite layers if their generations differ.  This guarantees:
-            //  - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
-            //  - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
-            if layer.metadata().generation == self.generation {
+            // Only rewrite layers if they would have different remote paths: either they belong to this
+            // shard but an old generation, or they belonged to another shard.  This also implicitly
+            // guarantees that the layer is persistent in remote storage (as only remote persistent
+            // layers are carried across shard splits, any local-only layer would be in the current generation)
+            if layer.metadata().generation == self.generation
+                && layer.metadata().shard.shard_count == self.shard_identity.count
+            {
                debug!(%layer, "Skipping rewrite, is not from old generation");
                continue;
            }
@@ -280,69 +270,18 @@ impl Timeline {
            }

            // Fall through: all our conditions for doing a rewrite passed.
-            layers_to_rewrite.push(layer);
+            // TODO: implement rewriting
+            tracing::debug!(%layer, "Would rewrite layer");
        }

-        // Drop read lock on layer map before we start doing time-consuming I/O
+        // Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
        drop(layers);

-        let mut replace_image_layers = Vec::new();
-
-        for layer in layers_to_rewrite {
-            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
-            let mut image_layer_writer = ImageLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_shard_id,
-                &layer.layer_desc().key_range,
-                layer.layer_desc().image_layer_lsn(),
-                ctx,
-            )
-            .await?;
-
-            // Safety of layer rewrites:
-            // - We are writing to a different local file path than we are reading from, so the old Layer
-            //   cannot interfere with the new one.
-            // - In the page cache, contents for a particular VirtualFile are stored with a file_id that
-            //   is different for two layers with the same name (in `ImageLayerInner::new` we always
-            //   acquire a fresh id from [`crate::page_cache::next_file_id`].  So readers do not risk
-            //   reading the index from one layer file, and then data blocks from the rewritten layer file.
-            // - Any readers that have a reference to the old layer will keep it alive until they are done
-            //   with it. If they are trying to promote from remote storage, that will fail, but this is the same
-            //   as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
-            // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
-            //    - GC, which at worst witnesses us "undelete" a layer that they just deleted.
-            //    - ingestion, which only inserts layers, therefore cannot collide with us.
-            let resident = layer.download_and_keep_resident().await?;
-
-            let keys_written = resident
-                .filter(&self.shard_identity, &mut image_layer_writer, ctx)
-                .await?;
-
-            if keys_written > 0 {
-                let new_layer = image_layer_writer.finish(self, ctx).await?;
-                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
-                    layer.metadata().file_size,
-                    new_layer.metadata().file_size);
-
-                replace_image_layers.push((layer, new_layer));
-            } else {
-                // Drop the old layer.  Usually for this case we would already have noticed that
-                // the layer has no data for us with the ShardedRange check above, but
-                drop_layers.push(layer);
-            }
-        }
-
-        // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
-        // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
-        // to remote index) and be removed. This is inefficient but safe.
-        fail::fail_point!("compact-shard-ancestors-localonly");
+        // TODO: collect layers to rewrite
+        let replace_layers = Vec::new();

        // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
-        self.rewrite_layers(replace_image_layers, drop_layers)
-            .await?;
-
-        fail::fail_point!("compact-shard-ancestors-enqueued");
+        self.rewrite_layers(replace_layers, drop_layers).await?;

        // We wait for all uploads to complete before finishing this compaction stage.  This is not
        // necessary for correctness, but it simplifies testing, and avoids proceeding with another
@@ -350,8 +289,6 @@ impl Timeline {
        // load.
        self.remote_client.wait_completion().await?;

-        fail::fail_point!("compact-shard-ancestors-persistent");
-
        Ok(())
    }

@@ -1213,10 +1150,10 @@ impl TimelineAdaptor {
        lsn: Lsn,
        key_range: &Range<Key>,
        ctx: &RequestContext,
-    ) -> Result<(), CreateImageLayersError> {
+    ) -> Result<(), PageReconstructError> {
        let timer = self.timeline.metrics.create_images_time_histo.start_timer();

-        let image_layer_writer = ImageLayerWriter::new(
+        let mut image_layer_writer = ImageLayerWriter::new(
            self.timeline.conf,
            self.timeline.timeline_id,
            self.timeline.tenant_shard_id,
@@ -1227,34 +1164,47 @@ impl TimelineAdaptor {
        .await?;

        fail_point!("image-layer-writer-fail-before-finish", |_| {
-            Err(CreateImageLayersError::Other(anyhow::anyhow!(
+            Err(PageReconstructError::Other(anyhow::anyhow!(
                "failpoint image-layer-writer-fail-before-finish"
            )))
        });
-
-        let keyspace = KeySpace {
-            ranges: self.get_keyspace(key_range, lsn, ctx).await?,
-        };
-        // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
-        let start = Key::MIN;
-        let ImageLayerCreationOutcome {
-            image,
-            next_start_key: _,
-        } = self
-            .timeline
-            .create_image_layer_for_rel_blocks(
-                &keyspace,
-                image_layer_writer,
-                lsn,
-                ctx,
-                key_range.clone(),
-                start,
-            )
-            .await?;
-
-        if let Some(image_layer) = image {
-            self.new_images.push(image_layer);
+        let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
+        for range in &keyspace_ranges {
+            let mut key = range.start;
+            while key < range.end {
+                let img = match self.timeline.get(key, lsn, ctx).await {
+                    Ok(img) => img,
+                    Err(err) => {
+                        // If we fail to reconstruct a VM or FSM page, we can zero the
+                        // page without losing any actual user data. That seems better
+                        // than failing repeatedly and getting stuck.
+                        //
+                        // We had a bug at one point, where we truncated the FSM and VM
+                        // in the pageserver, but Postgres didn't know about that
+                        // and continued to generate incremental WAL records for pages
+                        // that didn't exist in the pageserver. Trying to replay those
+                        // WAL records failed to find the previous image of the page.
+                        // This special case allows us to recover from that situation.
+                        // See https://github.com/neondatabase/neon/issues/2601.
+                        //
+                        // Unfortunately we cannot do this for the main fork, or for
+                        // any metadata keys, as that would lead to actual data
+                        // loss.
+                        if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
+                            warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
+                            ZERO_PAGE.clone()
+                        } else {
+                            return Err(err);
+                        }
+                    }
+                };
+                image_layer_writer.put_image(key, img, ctx).await?;
+                key = key.next();
+            }
        }
+        let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;
+
+        self.new_images.push(image_layer);

        timer.stop_and_record();

--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -12,7 +12,7 @@ use crate::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
-use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
+use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};

 #[derive(Debug, thiserror::Error)]
 pub(crate) enum Error {
@@ -41,27 +41,6 @@ pub(crate) enum Error {
    Unexpected(#[source] anyhow::Error),
 }

-impl From<Error> for ApiError {
-    fn from(value: Error) -> Self {
-        match value {
-            e @ Error::NoAncestor => ApiError::Conflict(e.to_string()),
-            // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
-            e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)),
-            Error::ShuttingDown => ApiError::ShuttingDown,
-            Error::OtherTimelineDetachOngoing(_) => {
-                ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
-            }
-            // All of these contain shutdown errors, in fact, it's the most common
-            e @ Error::FlushAncestor(_)
-            | e @ Error::RewrittenDeltaDownloadFailed(_)
-            | e @ Error::CopyDeltaPrefix(_)
-            | e @ Error::UploadRewritten(_)
-            | e @ Error::CopyFailed(_)
-            | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
-        }
-    }
-}
-
 pub(crate) struct PreparedTimelineDetach {
    layers: Vec<Layer>,
 }
@@ -96,11 +75,6 @@ pub(super) async fn prepare(
        .as_ref()
        .map(|tl| (tl.clone(), detached.ancestor_lsn))
    else {
-        // TODO: check if we have already been detached; for this we need to read the stored data
-        // on remote client, for that we need a follow-up which makes uploads cheaper and maintains
-        // a projection of the commited data.
-        //
-        // the error is wrong per openapi
        return Err(NoAncestor);
    };

@@ -110,7 +84,7 @@ pub(super) async fn prepare(

    if ancestor.ancestor_timeline.is_some() {
        // non-technical requirement; we could flatten N ancestors just as easily but we chose
-        // not to, at least initially
+        // not to
        return Err(TooManyAncestors);
    }

--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -7,20 +7,19 @@ use crate::{
            index::{IndexPart, LayerFileMetadata},
        },
        storage_layer::LayerName,
+        Generation,
    },
 };
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
-use std::{
-    collections::{hash_map, HashMap},
-    str::FromStr,
-};
+use pageserver_api::shard::ShardIndex;
+use std::{collections::HashMap, str::FromStr};
 use utils::lsn::Lsn;

 /// Identified files in the timeline directory.
 pub(super) enum Discovered {
    /// The only one we care about
-    Layer(LayerName, LocalLayerFileMetadata),
+    Layer(LayerName, Utf8PathBuf, u64),
    /// Old ephmeral files from previous launches, should be removed
    Ephemeral(String),
    /// Old temporary timeline files, unsure what these really are, should be removed
@@ -28,7 +27,7 @@ pub(super) enum Discovered {
    /// Temporary on-demand download files, should be removed
    TemporaryDownload(String),
    /// Backup file from previously future layers
-    IgnoredBackup(Utf8PathBuf),
+    IgnoredBackup,
    /// Unrecognized, warn about these
    Unknown(String),
 }
@@ -44,15 +43,12 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
        let discovered = match LayerName::from_str(&file_name) {
            Ok(file_name) => {
                let file_size = direntry.metadata()?.len();
-                Discovered::Layer(
-                    file_name,
-                    LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size),
-                )
+                Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
            }
            Err(_) => {
                if file_name.ends_with(".old") {
                    // ignore these
-                    Discovered::IgnoredBackup(direntry.path().to_owned())
+                    Discovered::IgnoredBackup
                } else if remote_timeline_client::is_temp_download_file(direntry.path()) {
                    Discovered::TemporaryDownload(file_name)
                } else if is_ephemeral_file(&file_name) {
@@ -75,32 +71,37 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
 /// this structure extends it with metadata describing the layer's presence in local storage.
 #[derive(Clone, Debug)]
 pub(super) struct LocalLayerFileMetadata {
-    pub(super) file_size: u64,
+    pub(super) metadata: LayerFileMetadata,
    pub(super) local_path: Utf8PathBuf,
 }

 impl LocalLayerFileMetadata {
-    pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self {
+    pub fn new(
+        local_path: Utf8PathBuf,
+        file_size: u64,
+        generation: Generation,
+        shard: ShardIndex,
+    ) -> Self {
        Self {
            local_path,
-            file_size,
+            metadata: LayerFileMetadata::new(file_size, generation, shard),
        }
    }
 }

-/// For a layer that is present in remote metadata, this type describes how to handle
-/// it during startup: it is either Resident (and we have some metadata about a local file),
-/// or it is Evicted (and we only have remote metadata).
+/// Decision on what to do with a layer file after considering its local and remote metadata.
 #[derive(Clone, Debug)]
 pub(super) enum Decision {
    /// The layer is not present locally.
    Evicted(LayerFileMetadata),
-    /// The layer is present locally, and metadata matches: we may hook up this layer to the
-    /// existing file in local storage.
-    Resident {
+    /// The layer is present locally, but local metadata does not match remote; we must
+    /// delete it and treat it as evicted.
+    UseRemote {
        local: LocalLayerFileMetadata,
        remote: LayerFileMetadata,
    },
+    /// The layer is present locally, and metadata matches.
+    UseLocal(LocalLayerFileMetadata),
 }

 /// A layer needs to be left out of the layer map.
@@ -116,81 +117,77 @@ pub(super) enum DismissedLayer {
    /// In order to make crash safe updates to layer map, we must dismiss layers which are only
    /// found locally or not yet included in the remote `index_part.json`.
    LocalOnly(LocalLayerFileMetadata),
-
-    /// The layer exists in remote storage but the local layer's metadata (e.g. file size)
-    /// does not match it
-    BadMetadata(LocalLayerFileMetadata),
 }

 /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
 pub(super) fn reconcile(
-    local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
+    discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
    index_part: Option<&IndexPart>,
    disk_consistent_lsn: Lsn,
+    generation: Generation,
+    shard: ShardIndex,
 ) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
-    let Some(index_part) = index_part else {
-        // If we have no remote metadata, no local layer files are considered valid to load
-        return local_layers
-            .into_iter()
-            .map(|(layer_name, local_metadata)| {
-                (layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
-            })
-            .collect();
-    };
+    use Decision::*;

-    let mut result = Vec::new();
+    // name => (local_metadata, remote_metadata)
+    type Collected =
+        HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;

-    let mut remote_layers = HashMap::new();
+    let mut discovered = discovered
+        .into_iter()
+        .map(|(layer_name, local_path, file_size)| {
+            (
+                layer_name,
+                // The generation and shard here will be corrected to match IndexPart in the merge below, unless
+                // it is not in IndexPart, in which case using our current generation makes sense
+                // because it will be uploaded in this generation.
+                (
+                    Some(LocalLayerFileMetadata::new(
+                        local_path, file_size, generation, shard,
+                    )),
+                    None,
+                ),
+            )
+        })
+        .collect::<Collected>();

-    // Construct Decisions for layers that are found locally, if they're in remote metadata.  Otherwise
-    // construct DismissedLayers to get rid of them.
-    for (layer_name, local_metadata) in local_layers {
-        let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
-            result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
-            continue;
-        };
-
-        if remote_metadata.file_size != local_metadata.file_size {
-            result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata))));
-            continue;
-        }
-
-        remote_layers.insert(
-            layer_name,
-            Decision::Resident {
-                local: local_metadata,
-                remote: remote_metadata.clone(),
-            },
-        );
-    }
-
-    // Construct Decision for layers that were not found locally
+    // merge any index_part information, when available
    index_part
-        .layer_metadata
-        .iter()
+        .as_ref()
+        .map(|ip| ip.layer_metadata.iter())
+        .into_iter()
+        .flatten()
+        .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
        .for_each(|(name, metadata)| {
-            if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) {
-                entry.insert(Decision::Evicted(metadata.clone()));
+            if let Some(existing) = discovered.get_mut(name) {
+                existing.1 = Some(metadata);
+            } else {
+                discovered.insert(name.to_owned(), (None, Some(metadata)));
            }
        });

-    // For layers that were found in authoritative remote metadata, apply a final check that they are within
-    // the disk_consistent_lsn.
-    result.extend(remote_layers.into_iter().map(|(name, decision)| {
-        if name.is_in_future(disk_consistent_lsn) {
-            match decision {
-                Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })),
-                Decision::Resident {
-                    local,
-                    remote: _remote,
-                } => (name, Err(DismissedLayer::Future { local: Some(local) })),
-            }
-        } else {
-            (name, Ok(decision))
-        }
-    }));
+    discovered
+        .into_iter()
+        .map(|(name, (local, remote))| {
+            let decision = if name.is_in_future(disk_consistent_lsn) {
+                Err(DismissedLayer::Future { local })
+            } else {
+                match (local, remote) {
+                    (Some(local), Some(remote)) if local.metadata != remote => {
+                        Ok(UseRemote { local, remote })
+                    }
+                    (Some(x), Some(_)) => Ok(UseLocal(x)),
+                    (None, Some(x)) => Ok(Evicted(x)),
+                    (Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
+                    (None, None) => {
+                        unreachable!("there must not be any non-local non-remote files")
+                    }
+                }
+            };

-    result
+            (name, decision)
+        })
+        .collect::<Vec<_>>()
 }

 pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
@@ -199,15 +196,25 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
    std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
 }

-pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> {
-    let local_size = local.file_size;
+pub(super) fn cleanup_local_file_for_remote(
+    local: &LocalLayerFileMetadata,
+    remote: &LayerFileMetadata,
+) -> anyhow::Result<()> {
+    let local_size = local.metadata.file_size();
+    let remote_size = remote.file_size();
    let path = &local.local_path;
-    let file_name = path.file_name().expect("must be file path");
-    tracing::warn!(
-        "removing local file {file_name:?} because it has unexpected length {local_size};"
-    );

-    std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}"))
+    let file_name = path.file_name().expect("must be file path");
+    tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
+    if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
+        assert!(
+            path.exists(),
+            "we would leave the local_layer without a file if this does not hold: {path}",
+        );
+        Err(err)
+    } else {
+        Ok(())
+    }
 }

 pub(super) fn cleanup_future_layer(
@@ -229,8 +236,8 @@ pub(super) fn cleanup_local_only_file(
 ) -> anyhow::Result<()> {
    let kind = name.kind();
    tracing::info!(
-        "found local-only {kind} layer {name} size {}",
-        local.file_size
+        "found local-only {kind} layer {name}, metadata {:?}",
+        local.metadata
    );
    std::fs::remove_file(&local.local_path)?;
    Ok(())
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -212,34 +212,13 @@ impl LayerManager {
        &mut self,
        rewrite_layers: &[(Layer, ResidentLayer)],
        drop_layers: &[Layer],
-        metrics: &TimelineMetrics,
+        _metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
-        for (old_layer, new_layer) in rewrite_layers {
-            debug_assert_eq!(
-                old_layer.layer_desc().key_range,
-                new_layer.layer_desc().key_range
-            );
-            debug_assert_eq!(
-                old_layer.layer_desc().lsn_range,
-                new_layer.layer_desc().lsn_range
-            );

-            // Safety: we may never rewrite the same file in-place.  Callers are responsible
-            // for ensuring that they only rewrite layers after something changes the path,
-            // such as an increment in the generation number.
-            assert_ne!(old_layer.local_path(), new_layer.local_path());
+        // TODO: implement rewrites (currently this code path is only used for drops)
+        assert!(rewrite_layers.is_empty());

-            Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr);
-
-            Self::insert_historic_layer(
-                new_layer.as_ref().clone(),
-                &mut updates,
-                &mut self.layer_fmgr,
-            );
-
-            metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
-        }
        for l in drop_layers {
            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
        }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -705,7 +705,6 @@ impl ConnectionManagerState {
                    commit_lsn: info.commit_lsn,
                    safekeeper_connstr: info.safekeeper_connstr,
                    availability_zone: info.availability_zone,
-                    standby_horizon: info.standby_horizon,
                }
            }
            MessageType::SafekeeperDiscoveryResponse => {
@@ -726,21 +725,6 @@ impl ConnectionManagerState {

        WALRECEIVER_BROKER_UPDATES.inc();

-        trace!(
-            "safekeeper info update: standby_horizon(cutoff)={}",
-            timeline_update.standby_horizon
-        );
-        if timeline_update.standby_horizon != 0 {
-            // ignore reports from safekeepers not connected to replicas
-            self.timeline
-                .standby_horizon
-                .store(Lsn(timeline_update.standby_horizon));
-            self.timeline
-                .metrics
-                .standby_horizon_gauge
-                .set(timeline_update.standby_horizon as i64);
-        }
-
        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
        let old_entry = self.wal_stream_candidates.insert(
            new_safekeeper_id,
@@ -1110,7 +1094,6 @@ mod tests {
                commit_lsn,
                safekeeper_connstr: safekeeper_connstr.to_owned(),
                availability_zone: None,
-                standby_horizon: 0,
            },
            latest_update,
        }
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -213,7 +213,10 @@ impl UploadQueue {

        let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
        for (layer_name, layer_metadata) in &index_part.layer_metadata {
-            files.insert(layer_name.to_owned(), layer_metadata.clone());
+            files.insert(
+                layer_name.to_owned(),
+                LayerFileMetadata::from(layer_metadata),
+            );
        }

        info!(
@@ -319,7 +322,9 @@ impl std::fmt::Display for UploadOp {
                write!(
                    f,
                    "UploadLayer({}, size={:?}, gen={:?})",
-                    layer, metadata.file_size, metadata.generation
+                    layer,
+                    metadata.file_size(),
+                    metadata.generation
                )
            }
            UploadOp::UploadMetadata(_, lsn) => {
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -49,8 +49,9 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;

-int         neon_protocol_version = 2;
+int         neon_protocol_version = 1;

+static int	n_reconnect_attempts = 0;
 static int	max_reconnect_attempts = 60;
 static int	stripe_size;

@@ -94,44 +95,18 @@ static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;

-typedef enum PSConnectionState {
-	PS_Disconnected,			/* no connection yet */
-	PS_Connecting_Startup,		/* connection starting up */
-	PS_Connecting_PageStream,	/* negotiating pagestream */ 
-	PS_Connected,				/* connected, pagestream established */
-} PSConnectionState;
-
 /* This backend's per-shard connections */
 typedef struct
 {
-	TimestampTz		last_connect_time; /* read-only debug value */
-	TimestampTz		last_reconnect_time;
-	uint32			delay_us;
-	int				n_reconnect_attempts;
+	PGconn	   *conn;

-	/*---
-	 * Pageserver connection state, i.e.
-	 *	disconnected: conn == NULL, wes == NULL;
-	 *	conn_startup: connection initiated, waiting for connection establishing
-	 *	conn_ps:      PageStream query sent, waiting for confirmation
-	 *	connected:    PageStream established
-	 */
-	PSConnectionState state;
-	PGconn		   *conn;
 	/*---
 	 * WaitEventSet containing:
-	 *	- WL_SOCKET_READABLE on 'conn'
-	 *	- WL_LATCH_SET on MyLatch, and
-	 *	- WL_EXIT_ON_PM_DEATH.
+	 * - WL_SOCKET_READABLE on 'conn'
+	 * - WL_LATCH_SET on MyLatch, and
+	 * - WL_EXIT_ON_PM_DEATH.
 	 */
-	WaitEventSet   *wes_read;
-	/*---
-	 * WaitEventSet containing:
-	 *	- WL_SOCKET_WRITABLE on 'conn'
-	 *	- WL_LATCH_SET on MyLatch, and
-	 *	- WL_EXIT_ON_PM_DEATH.
-	 */
-	WaitEventSet   *wes_write;
+	WaitEventSet *wes;
 } PageServer;

 static PageServer page_servers[MAX_SHARDS];
@@ -328,269 +303,119 @@ get_shard_number(BufferTag *tag)
 	return hash % n_shards;
 }

-static inline void
-CLEANUP_AND_DISCONNECT(PageServer *shard) 
-{
-	if (shard->wes_read)
-	{
-		FreeWaitEventSet(shard->wes_read);
-		shard->wes_read = NULL;
-	}
-	if (shard->wes_write)
-	{
-		FreeWaitEventSet(shard->wes_write);
-		shard->wes_write = NULL;
-	}
-	if (shard->conn)
-	{
-		PQfinish(shard->conn);
-		shard->conn = NULL;
-	}
-
-	shard->state = PS_Disconnected;
-}
-
-/*
- * Connect to a pageserver, or continue to try to connect if we're yet to
- * complete the connection (e.g. due to receiving an earlier cancellation
- * during connection start).
- * Returns true if successfully connected; false if the connection failed.
- * 
- * Throws errors in unrecoverable situations, or when this backend's query
- * is canceled.
- */
 static bool
 pageserver_connect(shardno_t shard_no, int elevel)
 {
-	PageServer *shard = &page_servers[shard_no];
+	char	   *query;
+	int			ret;
+	const char *keywords[3];
+	const char *values[3];
+	int			n;
+	PGconn	   *conn;
+	WaitEventSet *wes;
 	char		connstr[MAX_PAGESERVER_CONNSTRING_SIZE];

+	static TimestampTz last_connect_time = 0;
+	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
+	TimestampTz now;
+	uint64_t	us_since_last_connect;
+	bool	broke_from_loop = false;
+
+	Assert(page_servers[shard_no].conn == NULL);
+
 	/*
 	 * Get the connection string for this shard. If the shard map has been
 	 * updated since we last looked, this will also disconnect any existing
 	 * pageserver connections as a side effect.
-	 * Note that connstr is used both during connection start, and when we
-	 * log the successful connection.
 	 */
 	load_shard_map(shard_no, connstr, NULL);

-	switch (shard->state)
+	now = GetCurrentTimestamp();
+	us_since_last_connect = now - last_connect_time;
+	if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
 	{
-	case PS_Disconnected:
-	{
-		const char *keywords[3];
-		const char *values[3];
-		int			n_pgsql_params;
-		TimestampTz	now;
-		int64		us_since_last_attempt;
-
-		/* Make sure we start with a clean slate */
-		CLEANUP_AND_DISCONNECT(shard);
-
-		neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected");
-
-		now = GetCurrentTimestamp();
-		us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
-		shard->last_reconnect_time = now;
-
-		/*
-		 * If we did other tasks between reconnect attempts, then we won't
-		 * need to wait as long as a full delay.
-		 */
-		if (us_since_last_attempt < shard->delay_us)
-		{
-			pg_usleep(shard->delay_us - us_since_last_attempt);
-		}
-
-		/* update the delay metric */
-		shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC);
-
-		/*
-		 * Connect using the connection string we got from the
-		 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
-		 * variable was set, use that as the password.
-		 *
-		 * The connection options are parsed in the order they're given, so when
-		 * we set the password before the connection string, the connection string
-		 * can override the password from the env variable. Seems useful, although
-		 * we don't currently use that capability anywhere.
-		 */
-		keywords[0] = "dbname";
-		values[0] = connstr;
-		n_pgsql_params = 1;
-
-		if (neon_auth_token)
-		{
-			keywords[1] = "password";
-			values[1] = neon_auth_token;
-			n_pgsql_params++;
-		}
-
-		keywords[n_pgsql_params] = NULL;
-		values[n_pgsql_params] = NULL;
-
-		shard->conn = PQconnectStartParams(keywords, values, 1);
-		if (!shard->conn)
-		{
-			neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
-			return false;
-		}
-
-		shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
-		AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
-						  MyLatch, NULL);
-		AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-						  NULL, NULL);
-		AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL);
-
-		shard->wes_write = CreateWaitEventSet(TopMemoryContext, 3);
-		AddWaitEventToSet(shard->wes_write, WL_LATCH_SET, PGINVALID_SOCKET,
-						  MyLatch, NULL);
-		AddWaitEventToSet(shard->wes_write, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-						  NULL, NULL);
-		AddWaitEventToSet(shard->wes_write, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE,
-						  PQsocket(shard->conn),
-						  NULL, NULL);
-
-		shard->state = PS_Connecting_Startup;
-		/* fallthrough */
+		pg_usleep(delay_us);
+		delay_us *= 2;
 	}
-	case PS_Connecting_Startup:
+	else
 	{
-		char	   *pagestream_query;
-		int			ps_send_query_ret;
-		bool		connected = false;
+		delay_us = MIN_RECONNECT_INTERVAL_USEC;
+	}

-		neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup");
+	/*
+	 * Connect using the connection string we got from the
+	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
+	 * variable was set, use that as the password.
+	 *
+	 * The connection options are parsed in the order they're given, so when
+	 * we set the password before the connection string, the connection string
+	 * can override the password from the env variable. Seems useful, although
+	 * we don't currently use that capability anywhere.
+	 */
+	n = 0;
+	if (neon_auth_token)
+	{
+		keywords[n] = "password";
+		values[n] = neon_auth_token;
+		n++;
+	}
+	keywords[n] = "dbname";
+	values[n] = connstr;
+	n++;
+	keywords[n] = NULL;
+	values[n] = NULL;
+	n++;
+	conn = PQconnectdbParams(keywords, values, 1);
+	last_connect_time = GetCurrentTimestamp();

-		do
-		{
-			WaitEvent	event;
-			int			poll_result = PQconnectPoll(shard->conn);
+	if (PQstatus(conn) == CONNECTION_BAD)
+	{
+		char	   *msg = pchomp(PQerrorMessage(conn));

-			switch (poll_result)
-			{
-			default: /* unknown/unused states are handled as a failed connection */
-			case PGRES_POLLING_FAILED:
-				{
-					char	   *pqerr = PQerrorMessage(shard->conn);
-					char	   *msg = NULL;
-					neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED");
+		PQfinish(conn);

-					if (pqerr)
-						msg = pchomp(pqerr);
-
-					CLEANUP_AND_DISCONNECT(shard);
-
-					if (msg)
-					{
-						neon_shard_log(shard_no, elevel,
-									   "could not connect to pageserver: %s",
-									   msg);
-						pfree(msg);
-					}
-					else
-						neon_shard_log(shard_no, elevel,
-									   "could not connect to pageserver");
-
-					return false;
-				}
-			case PGRES_POLLING_READING:
-				/* Sleep until there's something to do */
-				(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
-										PG_WAIT_EXTENSION);
-				ResetLatch(MyLatch);
-
-				/* query cancellation, backend shutdown */
-				CHECK_FOR_INTERRUPTS();
-
-				/* PQconnectPoll() handles the socket polling state updates */
-
-				break;
-			case PGRES_POLLING_WRITING:
-				/* Sleep until there's something to do */
-				(void) WaitEventSetWait(shard->wes_write, -1L, &event, 1,
-										PG_WAIT_EXTENSION);
-				ResetLatch(MyLatch);
-
-				/* query cancellation, backend shutdown */
-				CHECK_FOR_INTERRUPTS();
-
-				/* PQconnectPoll() handles the socket polling state updates */
-
-				break;
-			case PGRES_POLLING_OK:
-				neon_shard_log(shard_no, DEBUG5, "POLLING_OK");
-				connected = true;
-				break;
-			}
-		}
-		while (!connected);
-
-		/* No more polling needed; connection succeeded */
-		shard->last_connect_time = GetCurrentTimestamp();
-
-		switch (neon_protocol_version)
-		{
+		ereport(elevel,
+				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+				 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
+				 errdetail_internal("%s", msg)));
+		pfree(msg);
+		return false;
+	}
+	switch (neon_protocol_version)
+	{
 		case 2:
-			pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
+			query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
 			break;
 		case 1:
-			pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
+			query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
 			break;
 		default:
 			elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
-		}
-
-		if (PQstatus(shard->conn) == CONNECTION_BAD)
-		{
-			char	   *msg = pchomp(PQerrorMessage(shard->conn));
-
-			CLEANUP_AND_DISCONNECT(shard);
-
-			ereport(elevel,
-					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-						errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
-						errdetail_internal("%s", msg)));
-			pfree(msg);
-			return false;
-		}
-
-		ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query);
-		pfree(pagestream_query);
-		if (ps_send_query_ret != 1)
-		{
-			CLEANUP_AND_DISCONNECT(shard);
-
-			neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
-			return false;
-		}
-
-		shard->state = PS_Connecting_PageStream;
-		/* fallthrough */
 	}
-	case PS_Connecting_PageStream:
+	ret = PQsendQuery(conn, query);
+	pfree(query);
+	if (ret != 1)
 	{
-		neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream");
+		PQfinish(conn);
+		neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
+		return false;
+	}

-		if (PQstatus(shard->conn) == CONNECTION_BAD)
-		{
-			char	   *msg = pchomp(PQerrorMessage(shard->conn));
-			CLEANUP_AND_DISCONNECT(shard);
-			ereport(elevel,
-					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-						errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
-						errdetail_internal("%s", msg)));
-			pfree(msg);
-			return false;
-		}
+	wes = CreateWaitEventSet(TopMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
+					  MyLatch, NULL);
+	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+					  NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);

-		while (PQisBusy(shard->conn))
+	PG_TRY();
+	{
+		while (PQisBusy(conn))
 		{
 			WaitEvent	event;

 			/* Sleep until there's something to do */
-			(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
+			(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 			ResetLatch(MyLatch);

 			CHECK_FOR_INTERRUPTS();
@@ -598,37 +423,40 @@ pageserver_connect(shardno_t shard_no, int elevel)
 			/* Data available in socket? */
 			if (event.events & WL_SOCKET_READABLE)
 			{
-				if (!PQconsumeInput(shard->conn))
+				if (!PQconsumeInput(conn))
 				{
-					char	   *msg = pchomp(PQerrorMessage(shard->conn));
+					char	   *msg = pchomp(PQerrorMessage(conn));
+
+					PQfinish(conn);
+					FreeWaitEventSet(wes);

-					CLEANUP_AND_DISCONNECT(shard);
 					neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
 								   msg);
-					pfree(msg);
-					return false;
+					/* Returning from inside PG_TRY is bad, so we break/return later */
+					broke_from_loop = true;
+					break;
 				}
 			}
 		}
-
-		shard->state = PS_Connected;
-		/* fallthrough */
 	}
-	case PS_Connected:
-		/*
-		 * We successfully connected. Future connections to this PageServer
-		 * will do fast retries again, with exponential backoff.
-		 */
-		shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;
-
-		neon_shard_log(shard_no, DEBUG5, "Connection state: Connected");
-		neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
-		return true;
-	default:
-		neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state);
+	PG_CATCH();
+	{
+		PQfinish(conn);
+		FreeWaitEventSet(wes);
+		PG_RE_THROW();
 	}
-	/* This shouldn't be hit */
-	Assert(false);
+	PG_END_TRY();
+
+	if (broke_from_loop)
+	{
+		return false;
+	}
+
+	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
+	page_servers[shard_no].conn = conn;
+	page_servers[shard_no].wes = wes;
+
+	return true;
 }

 /*
@@ -648,7 +476,7 @@ retry:
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -674,8 +502,7 @@ retry:

 /*
 * Reset prefetch and drop connection to the shard.
- * It also drops connection to all other shards involved in prefetch, through
- * prefetch_on_ps_disconnect().
+ * It also drops connection to all other shards involved in prefetch.
 */
 static void
 pageserver_disconnect(shardno_t shard_no)
@@ -685,6 +512,9 @@ pageserver_disconnect(shardno_t shard_no)
 	 * whole prefetch queue, even for other pageservers. It should not
 	 * cause big problems, because connection loss is supposed to be a
 	 * rare event.
+	 *
+	 * Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
+	 * because a prefetch request may be registered before the connection is established.
 	 */
 	prefetch_on_ps_disconnect();

@@ -697,36 +527,37 @@ pageserver_disconnect(shardno_t shard_no)
 static void
 pageserver_disconnect_shard(shardno_t shard_no)
 {
-	PageServer *shard = &page_servers[shard_no];
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
 	 * what state the connection is in. For example, if we sent the request
 	 * but didn't receive a response yet, we might receive the response some
 	 * time later after we have already sent a new unrelated request. Close
 	 * the connection to avoid getting confused.
-	 * Similarly, even when we're in PS_DISCONNECTED, we may have junk to
-	 * clean up: It is possible that we encountered an error allocating any
-	 * of the wait event sets or the psql connection, or failed when we tried
-	 * to attach wait events to the WaitEventSets.
 	 */
-	CLEANUP_AND_DISCONNECT(shard);
-
-	shard->state = PS_Disconnected;
+	if (page_servers[shard_no].conn)
+	{
+		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
+		PQfinish(page_servers[shard_no].conn);
+		page_servers[shard_no].conn = NULL;
+	}
+	if (page_servers[shard_no].wes != NULL)
+	{
+		FreeWaitEventSet(page_servers[shard_no].wes);
+		page_servers[shard_no].wes = NULL;
+	}
 }

 static bool
 pageserver_send(shardno_t shard_no, NeonRequest *request)
 {
 	StringInfoData req_buff;
-	PageServer *shard = &page_servers[shard_no];
-	PGconn	   *pageserver_conn;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

 	/* If the connection was lost for some reason, reconnect */
-	if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
+	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
 		neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
 		pageserver_disconnect(shard_no);
-		pageserver_conn = NULL;
 	}

 	req_buff = nm_pack_request(request);
@@ -740,19 +571,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
 	 * connection in case of failure.
 	 */
-	if (shard->state != PS_Connected)
+	if (!page_servers[shard_no].conn)
 	{
-		while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
 			HandleMainLoopInterrupts();
-			shard->n_reconnect_attempts += 1;
+			n_reconnect_attempts += 1;
 		}
-		shard->n_reconnect_attempts = 0;
-	} else {
-		Assert(shard->conn != NULL);
+		n_reconnect_attempts = 0;
 	}

-	pageserver_conn = shard->conn;
+	pageserver_conn = page_servers[shard_no].conn;

 	/*
 	 * Send request.
@@ -761,17 +590,13 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * should use async mode and check for interrupts while waiting. In
 	 * practice, our requests are small enough to always fit in the output and
 	 * TCP buffer.
-	 *
-	 * Note that this also will fail when the connection is in the
-	 * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
-	 * point, but on the grand scheme of things it's only a small issue.
 	 */
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

 		pageserver_disconnect(shard_no);
-		neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -786,7 +611,6 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
-
 	return true;
 }

@@ -795,68 +619,58 @@ pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
-	PageServer *shard = &page_servers[shard_no];
-	PGconn	   *pageserver_conn = shard->conn;
-	/* read response */
-	int			rc;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

-	if (shard->state != PS_Connected)
-	{
-		neon_shard_log(shard_no, LOG,
-					   "pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x",
-					   shard->state);
+	if (!pageserver_conn)
 		return NULL;
-	}

-	Assert(pageserver_conn);
-
-	rc = call_PQgetCopyData(shard_no, &resp_buff.data);
-	if (rc >= 0)
+	PG_TRY();
 	{
-		/* call_PQgetCopyData handles rc == 0 */
-		Assert(rc > 0);
+		/* read response */
+		int			rc;

-		PG_TRY();
+		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
+		if (rc >= 0)
 		{
 			resp_buff.len = rc;
 			resp_buff.cursor = 0;
 			resp = nm_unpack_response(&resp_buff);
 			PQfreemem(resp_buff.data);
+
+			if (message_level_is_interesting(PageStoreTrace))
+			{
+				char	   *msg = nm_to_string((NeonMessage *) resp);
+
+				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
+				pfree(msg);
+			}
 		}
-		PG_CATCH();
+		else if (rc == -1)
 		{
-			neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
+			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
 			pageserver_disconnect(shard_no);
-			PG_RE_THROW();
+			resp = NULL;
 		}
-		PG_END_TRY();
-
-		if (message_level_is_interesting(PageStoreTrace))
+		else if (rc == -2)
 		{
-			char	   *msg = nm_to_string((NeonMessage *) resp);
+			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-			neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
-			pfree(msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
+		}
+		else
+		{
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
 		}
 	}
-	else if (rc == -1)
+	PG_CATCH();
 	{
-		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
 		pageserver_disconnect(shard_no);
-		resp = NULL;
-	}
-	else if (rc == -2)
-	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
-		pageserver_disconnect(shard_no);
-		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
-	}
-	else
-	{
-		pageserver_disconnect(shard_no);
-		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
+		PG_RE_THROW();
 	}
+	PG_END_TRY();

 	return (NeonResponse *) resp;
 }
@@ -867,7 +681,7 @@ pageserver_flush(shardno_t shard_no)
 {
 	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

-	if (page_servers[shard_no].state != PS_Connected)
+	if (!pageserver_conn)
 	{
 		neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
 	}
@@ -883,7 +697,6 @@ pageserver_flush(shardno_t shard_no)
 			return false;
 		}
 	}
-
 	return true;
 }

@@ -1047,7 +860,7 @@ pg_init_libpagestore(void)
 							"Version of compute<->page server protocol",
 							NULL,
 							&neon_protocol_version,
-							2, /* use protocol version 2 */
+							1, /* default to old protocol for now */
 							1, /* min */
 							2, /* max */
 							PGC_SU_BACKEND,
@@ -1078,7 +891,5 @@ pg_init_libpagestore(void)
 		dbsize_hook = neon_dbsize;
 	}

-	memset(page_servers, 0, sizeof(page_servers));
-
 	lfc_init();
 }
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -94,10 +94,6 @@ static char *hexdump_page(char *page);

 const int	SmgrTrace = DEBUG5;

-#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
-	neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
-				   ##__VA_ARGS__)
-
 page_server_api *page_server;

 /* unlogged relation build states */
@@ -530,8 +526,6 @@ prefetch_flush_requests(void)
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
- * NOTE: callers should make sure they can handle query cancellations in this
- * function's call path.
 */
 static bool
 prefetch_wait_for(uint64 ring_index)
@@ -567,8 +561,6 @@ prefetch_wait_for(uint64 ring_index)
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
- *
- * NOTE: this does IO, and can get canceled out-of-line.
 */
 static bool
 prefetch_read(PrefetchRequest *slot)
@@ -580,14 +572,6 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_receive);

-	if (slot->status != PRFS_REQUESTED ||
-		slot->response != NULL ||
-		slot->my_ring_index != MyPState->ring_receive)
-		neon_shard_log(slot->shard_no, ERROR,
-					   "Incorrect prefetch read: status=%d response=%llx my=%llu receive=%llu",
-					   slot->status, (size_t) (void *) slot->response,
-					   slot->my_ring_index, MyPState->ring_receive);
-
 	old = MemoryContextSwitchTo(MyPState->errctx);
 	response = (NeonResponse *) page_server->receive(slot->shard_no);
 	MemoryContextSwitchTo(old);
@@ -605,11 +589,6 @@ prefetch_read(PrefetchRequest *slot)
 	}
 	else
 	{
-		neon_shard_log(slot->shard_no, WARNING,
-					   "No response from reading prefetch entry %llu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
-					   slot->my_ring_index,
-					   RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
-					   slot->buftag.forkNum, slot->buftag.blockNum);
 		return false;
 	}
 }
@@ -624,7 +603,6 @@ void
 prefetch_on_ps_disconnect(void)
 {
 	MyPState->ring_flush = MyPState->ring_unused;
-
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
@@ -647,7 +625,6 @@ prefetch_on_ps_disconnect(void)
 		slot->status = PRFS_TAG_REMAINS;
 		MyPState->n_requests_inflight -= 1;
 		MyPState->ring_receive += 1;
-
 		prefetch_set_unused(ring_index);
 	}
 }
@@ -714,8 +691,6 @@ static void
 prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
 {
 	bool		found;
-	uint64		mySlotNo = slot->my_ring_index;
-
 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
 		/* lsn and not_modified_since are filled in below */
@@ -724,8 +699,6 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 		.blkno = slot->buftag.blockNum,
 	};

-	Assert(mySlotNo == MyPState->ring_unused);
-
 	if (force_request_lsns)
 		slot->request_lsns = *force_request_lsns;
 	else
@@ -738,11 +711,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);

-	while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
-	{
-		Assert(mySlotNo == MyPState->ring_unused);
-		/* loop */
-	}
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request));

 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
@@ -753,6 +722,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns

 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
+
 	prfh_insert(MyPState->prf_hash, slot, &found);
 	Assert(!found);
 }
@@ -924,10 +894,6 @@ Retry:
 	return ring_index;
 }

-/*
- * Note: this function can get canceled and use a long jump to the next catch
- * context. Take care.
- */
 static NeonResponse *
 page_server_request(void const *req)
 {
@@ -959,38 +925,19 @@ page_server_request(void const *req)
 	 * Current sharding model assumes that all metadata is present only at shard 0.
 	 * We still need to call get_shard_no() to check if shard map is up-to-date.
 	 */
-	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest ||
-		((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
 	{
 		shard_no = 0;
 	}

 	do
 	{
-		PG_TRY();
-		{
-			while (!page_server->send(shard_no, (NeonRequest *) req)
-				   || !page_server->flush(shard_no))
-			{
-				/* do nothing */
-			}
-			consume_prefetch_responses();
-			resp = page_server->receive(shard_no);
-		}
-		PG_CATCH();
-		{
-			/*
-			 * Cancellation in this code needs to be handled better at some
-			 * point, but this currently seems fine for now.
-			 */
-			page_server->disconnect(shard_no);
-			PG_RE_THROW();
-		}
-		PG_END_TRY();
-
+		while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
+		consume_prefetch_responses();
+		resp = page_server->receive(shard_no);
 	} while (resp == NULL);
-
 	return resp;
+
 }


@@ -1402,10 +1349,6 @@ PageIsEmptyHeapPage(char *buffer)
 	return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
 }

-/*
- * A page is being evicted from the shared buffer cache. Update the
- * last-written LSN of the page, and WAL-log it if needed.
- */
 static void
 #if PG_MAJORVERSION_NUM < 16
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
@@ -1414,7 +1357,12 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 #endif
 {
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
-	bool		log_page;
+
+	if (ShutdownRequestPending)
+		return;
+	/* Don't log any pages if we're not allowed to do so. */
+	if (!XLogInsertAllowed())
+		return;

 	/*
 	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
@@ -1423,21 +1371,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 	 * correctness, the non-logged updates are not critical. But we want to
 	 * have a reasonably up-to-date VM and FSM in the page server.
 	 */
-	log_page = false;
-	if (force)
-	{
-		Assert(XLogInsertAllowed());
-		log_page = true;
-	}
-	else if (XLogInsertAllowed() &&
-			 !ShutdownRequestPending &&
-			 (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
-	{
-		log_page = true;
-	}
-
-	if (log_page)
+	if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
 	{
+		/* FSM is never WAL-logged and we don't care. */
 		XLogRecPtr	recptr;

 		recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
@@ -1450,8 +1386,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
 	}
-
-	if (lsn == InvalidXLogRecPtr)
+	else if (lsn == InvalidXLogRecPtr)
 	{
 		/*
 		 * When PostgreSQL extends a relation, it calls smgrextend() with an
@@ -1487,31 +1422,19 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
 		}
-		else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
+		else
 		{
-			/*
-			 * Its a bad sign if there is a page with zero LSN in the buffer
-			 * cache in a standby, too. However, PANICing seems like a cure
-			 * worse than the disease, as the damage has likely already been
-			 * done in the primary. So in a standby, make this an assertion,
-			 * and in a release build just LOG the error and soldier on. We
-			 * update the last-written LSN of the page with a conservative
-			 * value in that case, which is the last replayed LSN.
-			 */
-			ereport(RecoveryInProgress() ? LOG : PANIC,
+			ereport(PANIC,
 					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
 							blocknum,
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
-			Assert(false);
-
-			lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
 		}
 	}
 	else
 	{
 		ereport(SmgrTrace,
-				(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
+				(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
 						blocknum,
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1604,92 +1527,8 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)

 	if (RecoveryInProgress())
 	{
-		/*---
-		 * In broad strokes, a replica always requests the page at the current
-		 * replay LSN. But looking closer, what exactly is the replay LSN? Is
-		 * it the last replayed record, or the record being replayed? And does
-		 * the startup process performing the replay need to do something
-		 * differently than backends running queries? Let's take a closer look
-		 * at the different scenarios:
-		 *
-		 * 1. Startup process reads a page, last_written_lsn is old.
-		 *
-		 * Read the old version of the page. We will apply the WAL record on
-		 * it to bring it up-to-date.
-		 *
-		 * We could read the new version, with the changes from this WAL
-		 * record already applied, to offload the work of replaying the record
-		 * to the pageserver. The pageserver might not have received the WAL
-		 * record yet, though, so a read of the old page version and applying
-		 * the record ourselves is likely faster. Also, the redo function
-		 * might be surprised if the changes have already applied. That's
-		 * normal during crash recovery, but not in hot standby.
-		 *
-		 * 2. Startup process reads a page, last_written_lsn == record we're
-		 *    replaying.
-		 *
-		 * Can this happen? There are a few theoretical cases when it might:
-		 *
-		 * A) The redo function reads the same page twice. We had already read
-		 *    and applied the changes once, and now we're reading it for the
-		 *    second time.  That would be a rather silly thing for a redo
-		 *    function to do, and I'm not aware of any that would do it.
-		 *
-		 * B) The redo function modifies multiple pages, and it already
-		 *    applied the changes to one of the pages, released the lock on
-		 *    it, and is now reading a second page.  Furthermore, the first
-		 *    page was already evicted from the buffer cache, and also from
-		 *    the last-written LSN cache, so that the per-relation or global
-		 *    last-written LSN was already updated. All the WAL redo functions
-		 *    hold the locks on pages that they modify, until all the changes
-		 *    have been modified (?), which would make that impossible.
-		 *    However, we skip the locking, if the page isn't currently in the
-		 *    page cache (see neon_redo_read_buffer_filter below).
-		 *
-		 * Even if the one of the above cases were possible in theory, they
-		 * would also require the pages being modified by the redo function to
-		 * be immediately evicted from the page cache.
-		 *
-		 * So this probably does not happen in practice. But if it does, we
-		 * request the new version, including the changes from the record
-		 * being replayed. That seems like the correct behavior in any case.
-		 *
-		 * 3. Backend process reads a page with old last-written LSN
-		 *
-		 * Nothing special here. Read the old version.
-		 *
-		 * 4. Backend process reads a page with last_written_lsn == record being replayed
-		 *
-		 * This can happen, if the redo function has started to run, and saw
-		 * that the page isn't present in the page cache (see
-		 * neon_redo_read_buffer_filter below).  Normally, in a normal
-		 * Postgres server, the redo function would hold a lock on the page,
-		 * so we would get blocked waiting the redo function to release the
-		 * lock. To emulate that, wait for the WAL replay of the record to
-		 * finish.
-		 */
-		/* Request the page at the end of the last fully replayed LSN. */
-		XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
-
-		if (last_written_lsn > replay_lsn)
-		{
-			/* GetCurrentReplayRecPtr was introduced in v15 */
-#if PG_VERSION_NUM >= 150000
-			Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
-#endif
-
-			/*
-			 * Cases 2 and 4. If this is a backend (case 4), the
-			 * neon_read_at_lsn() call later will wait for the WAL record to be
-			 * fully replayed.
-			 */
-			result.request_lsn = last_written_lsn;
-		}
-		else
-		{
-			/* cases 1 and 3 */
-			result.request_lsn = replay_lsn;
-		}
+		/* Request the page at the last replayed LSN. */
+		result.request_lsn = GetXLogReplayRecPtr(NULL);
 		result.not_modified_since = last_written_lsn;
 		result.effective_request_lsn = result.request_lsn;
 		Assert(last_written_lsn <= result.request_lsn);
@@ -1958,9 +1797,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			break;

 		default:
-			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-										"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
-										T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
 	}
 	pfree(resp);
 	return exists;
@@ -2412,7 +2249,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	/*
 	 * Try to find prefetched page in the list of received pages.
 	 */
-Retry:
+  Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);

 	if (entry != NULL)
@@ -2498,9 +2335,7 @@ Retry:
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
 		default:
-			NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
-										"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
-										T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
 	}

 	/* buffer was used, clean up for later reuse */
@@ -2771,9 +2606,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;

 		default:
-			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-										"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
-										T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
 	}
 	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);

@@ -2826,9 +2659,7 @@ neon_dbsize(Oid dbNode)
 			break;

 		default:
-			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-										"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
-										T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
 	}

 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
@@ -3167,9 +2998,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 			break;

 		default:
-			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-										"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
-										T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
 	}
 	pfree(resp);

@@ -3387,7 +3216,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	BufferTag	tag;
 	uint32		hash;
 	LWLock	   *partitionLock;
-	int			buf_id;
+	Buffer		buffer;
 	bool		no_redo_needed;

 	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
@@ -3425,20 +3254,20 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	else
 	{
 		/* Try to find the relevant buffer */
-		buf_id = BufTableLookup(&tag, hash);
+		buffer = BufTableLookup(&tag, hash);

-		no_redo_needed = buf_id < 0;
+		no_redo_needed = buffer < 0;
 	}
+	/* In both cases, set lwLsn past this WAL record */
+	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);

 	/*
 	 * we don't have the buffer in memory, update lwLsn past this record, also
 	 * evict page from file cache
 	 */
 	if (no_redo_needed)
-	{
-		SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
 		lfc_evict(rinfo, forknum, blkno);
-	}
+

 	LWLockRelease(partitionLock);

--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1852,30 +1852,34 @@ static void
 CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
 {
 	hs->ts = 0;
-	hs->xmin = InvalidFullTransactionId;
-	hs->catalog_xmin = InvalidFullTransactionId;
+	hs->xmin.value = ~0;		/* sentinel (largest unsigned value); reset to invalid below if never lowered */
+	hs->catalog_xmin.value = ~0;	/* sentinel (largest unsigned value); reset to invalid below if never lowered */
 
 	for (int i = 0; i < wp->n_safekeepers; i++)
 	{
-
-		if (wp->safekeeper[i].state == SS_ACTIVE)
+		if (wp->safekeeper[i].appendResponse.hs.ts != 0)
 		{
 			HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs;
 
 			if (FullTransactionIdIsNormal(skhs->xmin)
-				&& (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin)))
+				&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
 			{
 				hs->xmin = skhs->xmin;
 				hs->ts = skhs->ts;
 			}
 			if (FullTransactionIdIsNormal(skhs->catalog_xmin)
-				&& (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)))
+				&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin))
 			{
 				hs->catalog_xmin = skhs->catalog_xmin;
 				hs->ts = skhs->ts;
 			}
 		}
 	}
+	/* A sentinel still in place means no safekeeper supplied a normal xid: report invalid. */
+	if (hs->xmin.value == ~0)
+		hs->xmin = InvalidFullTransactionId;
+	if (hs->catalog_xmin.value == ~0)
+		hs->catalog_xmin = InvalidFullTransactionId;
 }

 /*
@@ -1942,28 +1946,14 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
 	}

 	CombineHotStanbyFeedbacks(&hsFeedback, wp);
-	if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
+	if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
 	{
-		FullTransactionId xmin = hsFeedback.xmin;
-		FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
-		FullTransactionId next_xid = ReadNextFullTransactionId();
-		/*
-		 * Page server is updating nextXid in checkpoint each 1024 transactions,
-		 * so feedback xmin can be actually larger then nextXid and
-		 * function TransactionIdInRecentPast return false in this case,
-		 * preventing update of slot's xmin.
-		 */
-		if (FullTransactionIdPrecedes(next_xid, xmin))
-			xmin = next_xid;
-		if (FullTransactionIdPrecedes(next_xid, catalog_xmin))
-			catalog_xmin = next_xid;
 		agg_hs_feedback = hsFeedback;
-		elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin));
 		ProcessStandbyHSFeedback(hsFeedback.ts,
-								 XidFromFullTransactionId(xmin),
-								 EpochFromFullTransactionId(xmin),
-								 XidFromFullTransactionId(catalog_xmin),
-								 EpochFromFullTransactionId(catalog_xmin));
+								 XidFromFullTransactionId(hsFeedback.xmin),
+								 EpochFromFullTransactionId(hsFeedback.xmin),
+								 XidFromFullTransactionId(hsFeedback.catalog_xmin),
+								 EpochFromFullTransactionId(hsFeedback.catalog_xmin));
 	}

 	CheckGracefulShutdown(wp);
--- a/poetry.lock
+++ b/poetry.lock
@@ -2405,7 +2405,6 @@ files = [
    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
-    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2530,13 +2529,13 @@ files = [

 [[package]]
 name = "requests"
-version = "2.32.0"
+version = "2.31.0"
 description = "Python HTTP for Humans."
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.7"
 files = [
-    {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"},
-    {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"},
+    {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
+    {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
 ]

 [package.dependencies]
@@ -2960,16 +2959,6 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -3207,4 +3196,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
+content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -9,7 +9,6 @@ default = []
 testing = []

 [dependencies]
-ahash.workspace = true
 anyhow.workspace = true
 async-compression.workspace = true
 async-trait.workspace = true
@@ -25,7 +24,6 @@ camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
-crossbeam-deque.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
 framed-websockets.workspace = true
@@ -54,6 +52,7 @@ opentelemetry.workspace = true
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
+pbkdf2 = { workspace = true, features = ["simple", "std"] }
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
 pq_proto.workspace = true
@@ -82,7 +81,6 @@ thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
 tokio-postgres.workspace = true
-tokio-postgres-rustls.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
@@ -97,6 +95,8 @@ utils.workspace = true
 uuid.workspace = true
 webpki-roots.workspace = true
 x509-parser.workspace = true
+native-tls.workspace = true
+postgres-native-tls.workspace = true
 postgres-protocol.workspace = true
 redis.workspace = true

@@ -106,7 +106,6 @@ workspace_hack.workspace = true
 camino-tempfile.workspace = true
 fallible-iterator.workspace = true
 tokio-tungstenite.workspace = true
-pbkdf2 = { workspace = true, features = ["simple", "std"] }
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -365,10 +365,7 @@ async fn authenticate_with_secret(
    config: &'static AuthenticationConfig,
 ) -> auth::Result<ComputeCredentials> {
    if let Some(password) = unauthenticated_password {
-        let ep = EndpointIdInt::from(&info.endpoint);
-
-        let auth_outcome =
-            validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?;
+        let auth_outcome = validate_password_and_exchange(&password, secret).await?;
        let keys = match auth_outcome {
            crate::sasl::Outcome::Success(key) => key,
            crate::sasl::Outcome::Failure(reason) => {
@@ -389,7 +386,7 @@ async fn authenticate_with_secret(
    // Currently, we use it for websocket connections (latency).
    if allow_cleartext {
        ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
-        return hacks::authenticate_cleartext(ctx, info, client, secret, config).await;
+        return hacks::authenticate_cleartext(ctx, info, client, secret).await;
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
@@ -557,7 +554,7 @@ mod tests {
        context::RequestMonitoring,
        proxy::NeonOptions,
        rate_limiter::{EndpointRateLimiter, RateBucketInfo},
-        scram::{threadpool::ThreadPool, ServerSecret},
+        scram::ServerSecret,
        stream::{PqStream, Stream},
    };

@@ -599,7 +596,6 @@ mod tests {
    }

    static CONFIG: Lazy<AuthenticationConfig> = Lazy::new(|| AuthenticationConfig {
-        thread_pool: ThreadPool::new(1),
        scram_protocol_timeout: std::time::Duration::from_secs(5),
        rate_limiter_enabled: true,
        rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -3,10 +3,8 @@ use super::{
 };
 use crate::{
    auth::{self, AuthFlow},
-    config::AuthenticationConfig,
    console::AuthSecret,
    context::RequestMonitoring,
-    intern::EndpointIdInt,
    sasl,
    stream::{self, Stream},
 };
@@ -22,7 +20,6 @@ pub async fn authenticate_cleartext(
    info: ComputeUserInfo,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    secret: AuthSecret,
-    config: &'static AuthenticationConfig,
 ) -> auth::Result<ComputeCredentials> {
    warn!("cleartext auth flow override is enabled, proceeding");
    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
@@ -30,14 +27,8 @@ pub async fn authenticate_cleartext(
    // pause the timer while we communicate with the client
    let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);

-    let ep = EndpointIdInt::from(&info.endpoint);
-
    let auth_flow = AuthFlow::new(client)
-        .begin(auth::CleartextPassword {
-            secret,
-            endpoint: ep,
-            pool: config.thread_pool.clone(),
-        })
+        .begin(auth::CleartextPassword(secret))
        .await?;
    drop(paused);
    // cleartext auth is only allowed to the ws/http protocol.
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -5,14 +5,12 @@ use crate::{
    config::TlsServerEndPoint,
    console::AuthSecret,
    context::RequestMonitoring,
-    intern::EndpointIdInt,
-    sasl,
-    scram::{self, threadpool::ThreadPool},
+    sasl, scram,
    stream::{PqStream, Stream},
 };
 use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS};
 use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
-use std::{io, sync::Arc};
+use std::io;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;

@@ -55,11 +53,7 @@ impl AuthMethod for PasswordHack {

 /// Use clear-text password auth called `password` in docs
 /// <https://www.postgresql.org/docs/current/auth-password.html>
-pub struct CleartextPassword {
-    pub pool: Arc<ThreadPool>,
-    pub endpoint: EndpointIdInt,
-    pub secret: AuthSecret,
-}
+pub struct CleartextPassword(pub AuthSecret);

 impl AuthMethod for CleartextPassword {
    #[inline(always)]
@@ -132,13 +126,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
            .strip_suffix(&[0])
            .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;

-        let outcome = validate_password_and_exchange(
-            &self.state.pool,
-            self.state.endpoint,
-            password,
-            self.state.secret,
-        )
-        .await?;
+        let outcome = validate_password_and_exchange(password, self.state.0).await?;

        if let sasl::Outcome::Success(_) = &outcome {
            self.stream.write_message_noflush(&Be::AuthenticationOk)?;
@@ -193,8 +181,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
 }

 pub(crate) async fn validate_password_and_exchange(
-    pool: &ThreadPool,
-    endpoint: EndpointIdInt,
    password: &[u8],
    secret: AuthSecret,
 ) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
@@ -208,7 +194,7 @@ pub(crate) async fn validate_password_and_exchange(
        }
        // perform scram authentication as both client and server to validate the keys
        AuthSecret::Scram(scram_secret) => {
-            let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?;
+            let outcome = crate::scram::exchange(&scram_secret, password).await?;

            let client_key = match outcome {
                sasl::Outcome::Success(client_key) => client_key,
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -27,7 +27,6 @@ use proxy::redis::cancellation_publisher::RedisPublisherClient;
 use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use proxy::redis::elasticache;
 use proxy::redis::notifications;
-use proxy::scram::threadpool::ThreadPool;
 use proxy::serverless::cancel_set::CancelSet;
 use proxy::serverless::GlobalConnPoolOptions;
 use proxy::usage_metrics;
@@ -133,9 +132,6 @@ struct ProxyCliArgs {
    /// timeout for scram authentication protocol
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    scram_protocol_timeout: tokio::time::Duration,
-    /// size of the threadpool for password hashing
-    #[clap(long, default_value_t = 4)]
-    scram_thread_pool_size: u8,
    /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    require_client_ip: bool,
@@ -493,9 +489,6 @@ async fn main() -> anyhow::Result<()> {

 /// ProxyConfig is created at proxy startup, and lives forever.
 fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
-    let thread_pool = ThreadPool::new(args.scram_thread_pool_size);
-    Metrics::install(thread_pool.metrics.clone());
-
    let tls_config = match (&args.tls_key, &args.tls_cert) {
        (Some(key_path), Some(cert_path)) => Some(config::configure_tls(
            key_path,
@@ -631,7 +624,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold,
    };
    let authentication_config = AuthenticationConfig {
-        thread_pool,
        scram_protocol_timeout: args.scram_protocol_timeout,
        rate_limiter_enabled: args.auth_rate_limit_enabled,
        rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -11,12 +11,10 @@ use crate::{
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
-use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError};
-use std::{io, net::SocketAddr, sync::Arc, time::Duration};
+use std::{io, net::SocketAddr, time::Duration};
 use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio_postgres::tls::MakeTlsConnect;
-use tokio_postgres_rustls::MakeRustlsConnect;
 use tracing::{error, info, warn};

 const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
@@ -32,7 +30,7 @@ pub enum ConnectionError {
    CouldNotConnect(#[from] io::Error),

    #[error("{COULD_NOT_CONNECT}: {0}")]
-    TlsError(#[from] InvalidDnsNameError),
+    TlsError(#[from] native_tls::Error),

    #[error("{COULD_NOT_CONNECT}: {0}")]
    WakeComputeError(#[from] WakeComputeError),
@@ -259,7 +257,7 @@ pub struct PostgresConnection {
    /// Socket connected to a compute node.
    pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
        tokio::net::TcpStream,
-        tokio_postgres_rustls::RustlsStream<tokio::net::TcpStream>,
+        postgres_native_tls::TlsStream<tokio::net::TcpStream>,
    >,
    /// PostgreSQL connection parameters.
    pub params: std::collections::HashMap<String, String>,
@@ -284,24 +282,12 @@ impl ConnCfg {
        let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
        drop(pause);

-        let client_config = if allow_self_signed_compute {
-            let verifier = Arc::new(AcceptEverythingVerifier) as Arc<dyn ServerCertVerifier>;
-            rustls::ClientConfig::builder()
-                .dangerous()
-                .with_custom_certificate_verifier(verifier)
-        } else {
-            let root_store = rustls::RootCertStore {
-                roots: webpki_roots::TLS_SERVER_ROOTS.to_vec(),
-            };
-            rustls::ClientConfig::builder().with_root_certificates(root_store)
-        };
-        let client_config = client_config.with_no_client_auth();
-
-        let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
-        let tls = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>::make_tls_connect(
-            &mut mk_tls,
-            host,
-        )?;
+        // propagate a connector build failure with `?`, like the `make_tls_connect` error below, instead of panicking
+        let tls_connector = native_tls::TlsConnector::builder()
+            .danger_accept_invalid_certs(allow_self_signed_compute)
+            .build()?;
+        let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
+        let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;

        // connect_raw() will not use TLS if sslmode is "disable"
        let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
@@ -354,50 +340,6 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
    Some(options)
 }

-#[derive(Debug)]
-struct AcceptEverythingVerifier;
-impl ServerCertVerifier for AcceptEverythingVerifier {
-    fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
-        use rustls::SignatureScheme::*;
-        // The schemes for which `SignatureScheme::supported_in_tls13` returns true.
-        vec![
-            ECDSA_NISTP521_SHA512,
-            ECDSA_NISTP384_SHA384,
-            ECDSA_NISTP256_SHA256,
-            RSA_PSS_SHA512,
-            RSA_PSS_SHA384,
-            RSA_PSS_SHA256,
-            ED25519,
-        ]
-    }
-    fn verify_server_cert(
-        &self,
-        _end_entity: &rustls::pki_types::CertificateDer<'_>,
-        _intermediates: &[rustls::pki_types::CertificateDer<'_>],
-        _server_name: &rustls::pki_types::ServerName<'_>,
-        _ocsp_response: &[u8],
-        _now: rustls::pki_types::UnixTime,
-    ) -> Result<rustls::client::danger::ServerCertVerified, rustls::Error> {
-        Ok(rustls::client::danger::ServerCertVerified::assertion())
-    }
-    fn verify_tls12_signature(
-        &self,
-        _message: &[u8],
-        _cert: &rustls::pki_types::CertificateDer<'_>,
-        _dss: &rustls::DigitallySignedStruct,
-    ) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
-        Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
-    }
-    fn verify_tls13_signature(
-        &self,
-        _message: &[u8],
-        _cert: &rustls::pki_types::CertificateDer<'_>,
-        _dss: &rustls::DigitallySignedStruct,
-    ) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
-        Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -2,7 +2,6 @@ use crate::{
    auth::{self, backend::AuthRateLimiter},
    console::locks::ApiLocks,
    rate_limiter::RateBucketInfo,
-    scram::threadpool::ThreadPool,
    serverless::{cancel_set::CancelSet, GlobalConnPoolOptions},
    Host,
 };
@@ -62,7 +61,6 @@ pub struct HttpConfig {
 }

 pub struct AuthenticationConfig {
-    pub thread_pool: Arc<ThreadPool>,
    pub scram_protocol_timeout: tokio::time::Duration,
    pub rate_limiter_enabled: bool,
    pub rate_limiter: AuthRateLimiter,
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -355,7 +355,7 @@ async fn upload_parquet(
        "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
    ))?;
    let cancel = CancellationToken::new();
-    let maybe_err = backoff::retry(
+    backoff::retry(
        || async {
            let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
            storage
@@ -372,12 +372,7 @@ async fn upload_parquet(
    .await
    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
    .and_then(|x| x)
-    .context("request_data_upload")
-    .err();
-
-    if let Some(err) = maybe_err {
-        tracing::warn!(%id, %err, "failed to upload request data");
-    }
+    .context("request_data_upload")?;

    Ok(buffer.writer())
 }
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,11 +1,11 @@
-use std::sync::{Arc, OnceLock};
+use std::sync::OnceLock;

 use lasso::ThreadedRodeo;
 use measured::{
-    label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet},
+    label::StaticLabelSet,
    metric::{histogram::Thresholds, name::MetricName},
-    Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
-    LabelGroup, MetricGroup,
+    Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
+    MetricGroup,
 };
 use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};

@@ -14,36 +14,26 @@ use tokio::time::{self, Instant};
 use crate::console::messages::ColdStartInfo;

 #[derive(MetricGroup)]
-#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
 pub struct Metrics {
    #[metric(namespace = "proxy")]
-    #[metric(init = ProxyMetrics::new(thread_pool))]
    pub proxy: ProxyMetrics,

    #[metric(namespace = "wake_compute_lock")]
    pub wake_compute_lock: ApiLockMetrics,
 }

-static SELF: OnceLock<Metrics> = OnceLock::new();
 impl Metrics {
-    pub fn install(thread_pool: Arc<ThreadPoolMetrics>) {
-        SELF.set(Metrics::new(thread_pool))
-            .ok()
-            .expect("proxy metrics must not be installed more than once");
-    }
-
    pub fn get() -> &'static Self {
-        #[cfg(test)]
-        return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0))));
-
-        #[cfg(not(test))]
-        SELF.get()
-            .expect("proxy metrics must be installed by the main() function")
+        static SELF: OnceLock<Metrics> = OnceLock::new();
+        SELF.get_or_init(|| Metrics {
+            proxy: ProxyMetrics::default(),
+            wake_compute_lock: ApiLockMetrics::new(),
+        })
    }
 }

 #[derive(MetricGroup)]
-#[metric(new(thread_pool: Arc<ThreadPoolMetrics>))]
+#[metric(new())]
 pub struct ProxyMetrics {
    #[metric(flatten)]
    pub db_connections: CounterPairVec<NumDbConnectionsGauge>,
@@ -139,10 +129,6 @@ pub struct ProxyMetrics {

    #[metric(namespace = "connect_compute_lock")]
    pub connect_compute_lock: ApiLockMetrics,
-
-    #[metric(namespace = "scram_pool")]
-    #[metric(init = thread_pool)]
-    pub scram_pool: Arc<ThreadPoolMetrics>,
 }

 #[derive(MetricGroup)]
@@ -160,6 +146,12 @@ pub struct ApiLockMetrics {
    pub semaphore_acquire_seconds: Histogram<16>,
 }

+impl Default for ProxyMetrics {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl Default for ApiLockMetrics {
    fn default() -> Self {
        Self::new()
@@ -561,52 +553,3 @@ pub enum RedisEventsCount {
    PasswordUpdate,
    AllowedIpsUpdate,
 }
-
-pub struct ThreadPoolWorkers(usize);
-pub struct ThreadPoolWorkerId(pub usize);
-
-impl LabelValue for ThreadPoolWorkerId {
-    fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
-        v.write_int(self.0 as i64)
-    }
-}
-
-impl LabelGroup for ThreadPoolWorkerId {
-    fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
-        v.write_value(LabelName::from_str("worker"), self);
-    }
-}
-
-impl LabelSet for ThreadPoolWorkers {
-    type Value<'a> = ThreadPoolWorkerId;
-
-    fn dynamic_cardinality(&self) -> Option<usize> {
-        Some(self.0)
-    }
-
-    fn encode(&self, value: Self::Value<'_>) -> Option<usize> {
-        (value.0 < self.0).then_some(value.0)
-    }
-
-    fn decode(&self, value: usize) -> Self::Value<'_> {
-        ThreadPoolWorkerId(value)
-    }
-}
-
-impl FixedCardinalitySet for ThreadPoolWorkers {
-    fn cardinality(&self) -> usize {
-        self.0
-    }
-}
-
-#[derive(MetricGroup)]
-#[metric(new(workers: usize))]
-pub struct ThreadPoolMetrics {
-    pub injector_queue_depth: Gauge,
-    #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))]
-    pub worker_queue_depth: GaugeVec<ThreadPoolWorkers>,
-    #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
-    pub worker_task_turns_total: CounterVec<ThreadPoolWorkers>,
-    #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))]
-    pub worker_task_skips_total: CounterVec<ThreadPoolWorkers>,
-}
--- a/proxy/src/scram.rs
+++ b/proxy/src/scram.rs
@@ -6,14 +6,11 @@
 //! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/backend/libpq/auth-scram.c>
 //! * <https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf4b391f1e7393c1962841/src/interfaces/libpq/fe-auth-scram.c>

-mod countmin;
 mod exchange;
 mod key;
 mod messages;
-mod pbkdf2;
 mod secret;
 mod signature;
-pub mod threadpool;

 pub use exchange::{exchange, Exchange};
 pub use key::ScramKey;
@@ -59,13 +56,9 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {

 #[cfg(test)]
 mod tests {
-    use crate::{
-        intern::EndpointIdInt,
-        sasl::{Mechanism, Step},
-        EndpointId,
-    };
+    use crate::sasl::{Mechanism, Step};

-    use super::{threadpool::ThreadPool, Exchange, ServerSecret};
+    use super::{Exchange, ServerSecret};

    #[test]
    fn snapshot() {
@@ -119,13 +112,8 @@ mod tests {
    }

    async fn run_round_trip_test(server_password: &str, client_password: &str) {
-        let pool = ThreadPool::new(1);
-
-        let ep = EndpointId::from("foo");
-        let ep = EndpointIdInt::from(ep);
-
        let scram_secret = ServerSecret::build(server_password).await.unwrap();
-        let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes())
+        let outcome = super::exchange(&scram_secret, client_password.as_bytes())
            .await
            .unwrap();

--- a/proxy/src/scram/countmin.rs
+++ b/proxy/src/scram/countmin.rs
@@ -1,173 +0,0 @@
-use std::hash::Hash;
-
-/// estimator of hash jobs per second.
-/// <https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch>
-pub struct CountMinSketch {
-    // one for each depth
-    hashers: Vec<ahash::RandomState>,
-    width: usize,
-    depth: usize,
-    // buckets, width*depth
-    buckets: Vec<u32>,
-}
-
-impl CountMinSketch {
-    /// Given parameters (ε, δ),
-    ///   set width = ceil(e/ε)
-    ///   set depth = ceil(ln(1/δ))
-    ///
-    /// guarantees:
-    /// actual <= estimate
-    /// estimate <= actual + ε * N with probability 1 - δ
-    /// where N is the cardinality of the stream
-    pub fn with_params(epsilon: f64, delta: f64) -> Self {
-        CountMinSketch::new(
-            (std::f64::consts::E / epsilon).ceil() as usize,
-            (1.0_f64 / delta).ln().ceil() as usize,
-        )
-    }
-
-    fn new(width: usize, depth: usize) -> Self {
-        Self {
-            #[cfg(test)]
-            hashers: (0..depth)
-                .map(|i| {
-                    // digits of pi for good randomness
-                    ahash::RandomState::with_seeds(
-                        314159265358979323,
-                        84626433832795028,
-                        84197169399375105,
-                        82097494459230781 + i as u64,
-                    )
-                })
-                .collect(),
-            #[cfg(not(test))]
-            hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(),
-            width,
-            depth,
-            buckets: vec![0; width * depth],
-        }
-    }
-
-    pub fn inc_and_return<T: Hash>(&mut self, t: &T, x: u32) -> u32 {
-        let mut min = u32::MAX;
-        for row in 0..self.depth {
-            let col = (self.hashers[row].hash_one(t) as usize) % self.width;
-
-            let row = &mut self.buckets[row * self.width..][..self.width];
-            row[col] = row[col].saturating_add(x);
-            min = std::cmp::min(min, row[col]);
-        }
-        min
-    }
-
-    pub fn reset(&mut self) {
-        self.buckets.clear();
-        self.buckets.resize(self.width * self.depth, 0);
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};
-
-    use super::CountMinSketch;
-
-    fn eval_precision(n: usize, p: f64, q: f64) -> usize {
-        // fixed value of phi for consistent test
-        let mut rng = StdRng::seed_from_u64(16180339887498948482);
-
-        #[allow(non_snake_case)]
-        let mut N = 0;
-
-        let mut ids = vec![];
-
-        for _ in 0..n {
-            // number of insert operations
-            let n = rng.gen_range(1..100);
-            // number to insert at once
-            let m = rng.gen_range(1..4096);
-
-            let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid();
-            ids.push((id, n, m));
-
-            // N = sum(actual)
-            N += n * m;
-        }
-
-        // q% of counts will be within p of the actual value
-        let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
-
-        dbg!(sketch.buckets.len());
-
-        // insert a bunch of entries in a random order
-        let mut ids2 = ids.clone();
-        while !ids2.is_empty() {
-            ids2.shuffle(&mut rng);
-
-            let mut i = 0;
-            while i < ids2.len() {
-                sketch.inc_and_return(&ids2[i].0, ids2[i].1);
-                ids2[i].2 -= 1;
-                if ids2[i].2 == 0 {
-                    ids2.remove(i);
-                } else {
-                    i += 1;
-                }
-            }
-        }
-
-        let mut within_p = 0;
-        for (id, n, m) in ids {
-            let actual = n * m;
-            let estimate = sketch.inc_and_return(&id, 0);
-
-            // This estimate has the guarantee that actual <= estimate
-            assert!(actual <= estimate);
-
-            // This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ.
-            // ε = p / N, δ = 1 - q;
-            // therefore, estimate <= actual + p with probability q.
-            if estimate as f64 <= actual as f64 + p {
-                within_p += 1;
-            }
-        }
-        within_p
-    }
-
-    #[test]
-    fn precision() {
-        assert_eq!(eval_precision(100, 100.0, 0.99), 100);
-        assert_eq!(eval_precision(1000, 100.0, 0.99), 1000);
-        assert_eq!(eval_precision(100, 4096.0, 0.99), 100);
-        assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000);
-
-        // seems to be more precise than the literature indicates?
-        // probably numbers are too small to truly represent the probabilities.
-        assert_eq!(eval_precision(100, 4096.0, 0.90), 100);
-        assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000);
-        assert_eq!(eval_precision(100, 4096.0, 0.1), 98);
-        assert_eq!(eval_precision(1000, 4096.0, 0.1), 991);
-    }
-
-    // returns memory usage in bytes, and the time complexity per insert.
-    fn eval_cost(p: f64, q: f64) -> (usize, usize) {
-        #[allow(non_snake_case)]
-        // N = sum(actual)
-        // Let's assume 1021 samples, all of 4096
-        let N = 1021 * 4096;
-        let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q);
-
-        let memory = std::mem::size_of::<u32>() * sketch.buckets.len();
-        let time = sketch.depth;
-        (memory, time)
-    }
-
-    #[test]
-    fn memory_usage() {
-        assert_eq!(eval_cost(100.0, 0.99), (2273580, 5));
-        assert_eq!(eval_cost(4096.0, 0.99), (55520, 5));
-        assert_eq!(eval_cost(4096.0, 0.90), (33312, 3));
-        assert_eq!(eval_cost(4096.0, 0.1), (11104, 1));
-    }
-}
--- a/proxy/src/scram/exchange.rs
+++ b/proxy/src/scram/exchange.rs
@@ -4,17 +4,15 @@ use std::convert::Infallible;

 use hmac::{Hmac, Mac};
 use sha2::Sha256;
+use tokio::task::yield_now;

 use super::messages::{
    ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
 };
-use super::pbkdf2::Pbkdf2;
 use super::secret::ServerSecret;
 use super::signature::SignatureBuilder;
-use super::threadpool::ThreadPool;
 use super::ScramKey;
 use crate::config;
-use crate::intern::EndpointIdInt;
 use crate::sasl::{self, ChannelBinding, Error as SaslError};

 /// The only channel binding mode we currently support.
@@ -76,18 +74,37 @@ impl<'a> Exchange<'a> {
    }
 }

+/// PBKDF2 with HMAC-SHA-256 for a single 32-byte block, yielding to the scheduler every 1024 rounds.
+/// Adapted from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
+async fn pbkdf2(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
+    let keyed = Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
+    // first round: HMAC over the salt followed by the big-endian block index 1
+    let mut block = keyed
+        .clone()
+        .chain_update(salt)
+        .chain_update(1u32.to_be_bytes())
+        .finalize()
+        .into_bytes();
+    let mut acc = block;
+
+    for round in 1..iterations {
+        // each further round re-hashes the previous output and XORs it into the accumulator
+        block = keyed.clone().chain_update(block).finalize().into_bytes();
+        acc.iter_mut().zip(block).for_each(|(a, b)| *a ^= b);
+
+        // yield every ~250us
+        // hopefully reduces tail latencies
+        if round % 1024 == 0 {
+            yield_now().await
+        }
+    }
+
+    acc.into()
+}
+
 // copied from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L236-L248>
-async fn derive_client_key(
-    pool: &ThreadPool,
-    endpoint: EndpointIdInt,
-    password: &[u8],
-    salt: &[u8],
-    iterations: u32,
-) -> ScramKey {
-    let salted_password = pool
-        .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations))
-        .await
-        .expect("job should not be cancelled");
+async fn derive_client_key(password: &[u8], salt: &[u8], iterations: u32) -> ScramKey {
+    let salted_password = pbkdf2(password, salt, iterations).await;

    let make_key = |name| {
        let key = Hmac::<Sha256>::new_from_slice(&salted_password)
@@ -102,13 +119,11 @@ async fn derive_client_key(
 }

 pub async fn exchange(
-    pool: &ThreadPool,
-    endpoint: EndpointIdInt,
    secret: &ServerSecret,
    password: &[u8],
 ) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
    let salt = base64::decode(&secret.salt_base64)?;
-    let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await;
+    let client_key = derive_client_key(password, &salt, secret.iterations).await;

    if secret.is_password_invalid(&client_key).into() {
        Ok(sasl::Outcome::Failure("password doesn't match"))
--- a/proxy/src/scram/pbkdf2.rs
+++ b/proxy/src/scram/pbkdf2.rs
@@ -1,89 +0,0 @@
-use hmac::{
-    digest::{consts::U32, generic_array::GenericArray},
-    Hmac, Mac,
-};
-use sha2::Sha256;
-
-pub struct Pbkdf2 {
-    hmac: Hmac<Sha256>,
-    prev: GenericArray<u8, U32>,
-    hi: GenericArray<u8, U32>,
-    iterations: u32,
-}
-
-// inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
-impl Pbkdf2 {
-    pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
-        let hmac =
-            Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
-
-        let prev = hmac
-            .clone()
-            .chain_update(salt)
-            .chain_update(1u32.to_be_bytes())
-            .finalize()
-            .into_bytes();
-
-        Self {
-            hmac,
-            // one consumed for the hash above
-            iterations: iterations - 1,
-            hi: prev,
-            prev,
-        }
-    }
-
-    pub fn cost(&self) -> u32 {
-        (self.iterations).clamp(0, 4096)
-    }
-
-    pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> {
-        let Self {
-            hmac,
-            prev,
-            hi,
-            iterations,
-        } = self;
-
-        // only do 4096 iterations per turn before sharing the thread for fairness
-        let n = (*iterations).clamp(0, 4096);
-        for _ in 0..n {
-            *prev = hmac.clone().chain_update(*prev).finalize().into_bytes();
-
-            for (hi, prev) in hi.iter_mut().zip(*prev) {
-                *hi ^= prev;
-            }
-        }
-
-        *iterations -= n;
-        if *iterations == 0 {
-            std::task::Poll::Ready((*hi).into())
-        } else {
-            std::task::Poll::Pending
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::Pbkdf2;
-    use pbkdf2::pbkdf2_hmac_array;
-    use sha2::Sha256;
-
-    #[test]
-    fn works() {
-        let salt = b"sodium chloride";
-        let pass = b"Ne0n_!5_50_C007";
-
-        let mut job = Pbkdf2::start(pass, salt, 600000);
-        let hash = loop {
-            let std::task::Poll::Ready(hash) = job.turn() else {
-                continue;
-            };
-            break hash;
-        };
-
-        let expected = pbkdf2_hmac_array::<Sha256, 32>(pass, salt, 600000);
-        assert_eq!(hash, expected)
-    }
-}
--- a/proxy/src/scram/threadpool.rs
+++ b/proxy/src/scram/threadpool.rs
@@ -1,321 +0,0 @@
-//! Custom threadpool implementation for password hashing.
-//!
-//! Requirements:
-//! 1. Fairness per endpoint.
-//! 2. Yield support for high iteration counts.
-
-use std::sync::{
-    atomic::{AtomicU64, Ordering},
-    Arc,
-};
-
-use crossbeam_deque::{Injector, Stealer, Worker};
-use itertools::Itertools;
-use parking_lot::{Condvar, Mutex};
-use rand::Rng;
-use rand::{rngs::SmallRng, SeedableRng};
-use tokio::sync::oneshot;
-
-use crate::{
-    intern::EndpointIdInt,
-    metrics::{ThreadPoolMetrics, ThreadPoolWorkerId},
-    scram::countmin::CountMinSketch,
-};
-
-use super::pbkdf2::Pbkdf2;
-
-pub struct ThreadPool {
-    queue: Injector<JobSpec>,
-    stealers: Vec<Stealer<JobSpec>>,
-    parkers: Vec<(Condvar, Mutex<ThreadState>)>,
-    /// bitpacked representation.
-    /// lower 8 bits = number of sleeping threads
-    /// next 8 bits = number of idle threads (searching for work)
-    counters: AtomicU64,
-
-    pub metrics: Arc<ThreadPoolMetrics>,
-}
-
-#[derive(PartialEq)]
-enum ThreadState {
-    Parked,
-    Active,
-}
-
-impl ThreadPool {
-    pub fn new(n_workers: u8) -> Arc<Self> {
-        let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec();
-        let stealers = workers.iter().map(|w| w.stealer()).collect_vec();
-
-        let parkers = (0..n_workers)
-            .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active)))
-            .collect_vec();
-
-        let pool = Arc::new(Self {
-            queue: Injector::new(),
-            stealers,
-            parkers,
-            // threads start searching for work
-            counters: AtomicU64::new((n_workers as u64) << 8),
-            metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)),
-        });
-
-        for (i, worker) in workers.into_iter().enumerate() {
-            let pool = Arc::clone(&pool);
-            std::thread::spawn(move || thread_rt(pool, worker, i));
-        }
-
-        pool
-    }
-
-    pub fn spawn_job(
-        &self,
-        endpoint: EndpointIdInt,
-        pbkdf2: Pbkdf2,
-    ) -> oneshot::Receiver<[u8; 32]> {
-        let (tx, rx) = oneshot::channel();
-
-        let queue_was_empty = self.queue.is_empty();
-
-        self.metrics.injector_queue_depth.inc();
-        self.queue.push(JobSpec {
-            response: tx,
-            pbkdf2,
-            endpoint,
-        });
-
-        // inspired from <https://github.com/rayon-rs/rayon/blob/3e3962cb8f7b50773bcc360b48a7a674a53a2c77/rayon-core/src/sleep/mod.rs#L242>
-        let counts = self.counters.load(Ordering::SeqCst);
-        let num_awake_but_idle = (counts >> 8) & 0xff;
-        let num_sleepers = counts & 0xff;
-
-        // If the queue is non-empty, then we always wake up a worker
-        // -- clearly the existing idle jobs aren't enough. Otherwise,
-        // check to see if we have enough idle workers.
-        if !queue_was_empty || num_awake_but_idle == 0 {
-            let num_to_wake = Ord::min(1, num_sleepers);
-            self.wake_any_threads(num_to_wake);
-        }
-
-        rx
-    }
-
-    #[cold]
-    fn wake_any_threads(&self, mut num_to_wake: u64) {
-        if num_to_wake > 0 {
-            for i in 0..self.parkers.len() {
-                if self.wake_specific_thread(i) {
-                    num_to_wake -= 1;
-                    if num_to_wake == 0 {
-                        return;
-                    }
-                }
-            }
-        }
-    }
-
-    fn wake_specific_thread(&self, index: usize) -> bool {
-        let (condvar, lock) = &self.parkers[index];
-
-        let mut state = lock.lock();
-        if *state == ThreadState::Parked {
-            condvar.notify_one();
-
-            // When the thread went to sleep, it will have incremented
-            // this value. When we wake it, its our job to decrement
-            // it. We could have the thread do it, but that would
-            // introduce a delay between when the thread was
-            // *notified* and when this counter was decremented. That
-            // might mislead people with new work into thinking that
-            // there are sleeping threads that they should try to
-            // wake, when in fact there is nothing left for them to
-            // do.
-            self.counters.fetch_sub(1, Ordering::SeqCst);
-            *state = ThreadState::Active;
-
-            true
-        } else {
-            false
-        }
-    }
-
-    fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker<JobSpec>) -> Option<JobSpec> {
-        // announce thread as idle
-        self.counters.fetch_add(256, Ordering::SeqCst);
-
-        // try steal from the global queue
-        loop {
-            match self.queue.steal_batch_and_pop(worker) {
-                crossbeam_deque::Steal::Success(job) => {
-                    self.metrics
-                        .injector_queue_depth
-                        .set(self.queue.len() as i64);
-                    // no longer idle
-                    self.counters.fetch_sub(256, Ordering::SeqCst);
-                    return Some(job);
-                }
-                crossbeam_deque::Steal::Retry => continue,
-                crossbeam_deque::Steal::Empty => break,
-            }
-        }
-
-        // try steal from our neighbours
-        loop {
-            let mut retry = false;
-            let start = rng.gen_range(0..self.stealers.len());
-            let job = (start..self.stealers.len())
-                .chain(0..start)
-                .filter(|i| *i != skip)
-                .find_map(
-                    |victim| match self.stealers[victim].steal_batch_and_pop(worker) {
-                        crossbeam_deque::Steal::Success(job) => Some(job),
-                        crossbeam_deque::Steal::Empty => None,
-                        crossbeam_deque::Steal::Retry => {
-                            retry = true;
-                            None
-                        }
-                    },
-                );
-            if job.is_some() {
-                // no longer idle
-                self.counters.fetch_sub(256, Ordering::SeqCst);
-                return job;
-            }
-            if !retry {
-                return None;
-            }
-        }
-    }
-}
-
-fn thread_rt(pool: Arc<ThreadPool>, worker: Worker<JobSpec>, index: usize) {
-    /// interval when we should steal from the global queue
-    /// so that tail latencies are managed appropriately
-    const STEAL_INTERVAL: usize = 61;
-
-    /// How often to reset the sketch values
-    const SKETCH_RESET_INTERVAL: usize = 1021;
-
-    let mut rng = SmallRng::from_entropy();
-
-    // used to determine whether we should temporarily skip tasks for fairness.
-    // 99% of estimates will overcount by no more than 4096 samples
-    let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01);
-
-    let (condvar, lock) = &pool.parkers[index];
-
-    'wait: loop {
-        // wait for notification of work
-        {
-            let mut lock = lock.lock();
-
-            // queue is empty
-            pool.metrics
-                .worker_queue_depth
-                .set(ThreadPoolWorkerId(index), 0);
-
-            // subtract 1 from idle count, add 1 to sleeping count.
-            pool.counters.fetch_sub(255, Ordering::SeqCst);
-
-            *lock = ThreadState::Parked;
-            condvar.wait(&mut lock);
-        }
-
-        for i in 0.. {
-            let mut job = match worker
-                .pop()
-                .or_else(|| pool.steal(&mut rng, index, &worker))
-            {
-                Some(job) => job,
-                None => continue 'wait,
-            };
-
-            pool.metrics
-                .worker_queue_depth
-                .set(ThreadPoolWorkerId(index), worker.len() as i64);
-
-            // receiver is closed, cancel the task
-            if !job.response.is_closed() {
-                let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost());
-
-                const P: f64 = 2000.0;
-                // probability decreases as rate increases.
-                // lower probability, higher chance of being skipped
-                //
-                // estimates (rate in terms of 4096 rounds):
-                // rate = 0    => probability = 100%
-                // rate = 10   => probability = 71.3%
-                // rate = 50   => probability = 62.1%
-                // rate = 500  => probability = 52.3%
-                // rate = 1021 => probability = 49.8%
-                //
-                // My expectation is that the pool queue will only begin backing up at ~1000rps
-                // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above
-                // are in requests per second.
-                let probability = P.ln() / (P + rate as f64).ln();
-                if pool.queue.len() > 32 || rng.gen_bool(probability) {
-                    pool.metrics
-                        .worker_task_turns_total
-                        .inc(ThreadPoolWorkerId(index));
-
-                    match job.pbkdf2.turn() {
-                        std::task::Poll::Ready(result) => {
-                            let _ = job.response.send(result);
-                        }
-                        std::task::Poll::Pending => worker.push(job),
-                    }
-                } else {
-                    pool.metrics
-                        .worker_task_skips_total
-                        .inc(ThreadPoolWorkerId(index));
-
-                    // skip for now
-                    worker.push(job)
-                }
-            }
-
-            // if we get stuck with a few long lived jobs in the queue
-            // it's better to try and steal from the queue too for fairness
-            if i % STEAL_INTERVAL == 0 {
-                let _ = pool.queue.steal_batch(&worker);
-            }
-
-            if i % SKETCH_RESET_INTERVAL == 0 {
-                sketch.reset();
-            }
-        }
-    }
-}
-
-struct JobSpec {
-    response: oneshot::Sender<[u8; 32]>,
-    pbkdf2: Pbkdf2,
-    endpoint: EndpointIdInt,
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::EndpointId;
-
-    use super::*;
-
-    #[tokio::test]
-    async fn hash_is_correct() {
-        let pool = ThreadPool::new(1);
-
-        let ep = EndpointId::from("foo");
-        let ep = EndpointIdInt::from(ep);
-
-        let salt = [0x55; 32];
-        let actual = pool
-            .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096))
-            .await
-            .unwrap();
-
-        let expected = [
-            10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242,
-            178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140,
-        ];
-        assert_eq!(actual, expected)
-    }
-}
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -15,7 +15,6 @@ use crate::{
    },
    context::RequestMonitoring,
    error::{ErrorKind, ReportableError, UserFacingError},
-    intern::EndpointIdInt,
    proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry},
    rate_limiter::EndpointRateLimiter,
    Host,
@@ -67,14 +66,8 @@ impl PoolingBackend {
                return Err(AuthError::auth_failed(&*user_info.user));
            }
        };
-        let ep = EndpointIdInt::from(&conn_info.user_info.endpoint);
-        let auth_outcome = crate::auth::validate_password_and_exchange(
-            &config.thread_pool,
-            ep,
-            &conn_info.password,
-            secret,
-        )
-        .await?;
+        let auth_outcome =
+            crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
        let res = match auth_outcome {
            crate::sasl::Outcome::Success(key) => {
                info!("user successfully authenticated");
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ pytest = "^7.4.4"
 psycopg2-binary = "^2.9.6"
 typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
-requests = "^2.32.0"
+requests = "^2.31.0"
 pytest-xdist = "^3.3.1"
 asyncpg = "^0.29.0"
 aiopg = "^1.4.0"
--- a/s3_scrubber/Cargo.toml
+++ b/s3_scrubber/Cargo.toml
@@ -22,7 +22,8 @@ serde_with.workspace = true
 workspace_hack.workspace = true
 utils.workspace = true
 async-stream.workspace = true
-tokio-postgres-rustls.workspace = true
+native-tls.workspace = true
+postgres-native-tls.workspace = true
 postgres_ffi.workspace = true
 tokio-stream.workspace = true
 tokio-postgres.workspace = true
@@ -30,8 +31,6 @@ tokio-util = { workspace = true }
 futures-util.workspace = true
 itertools.workspace = true
 camino.workspace = true
-rustls.workspace = true
-webpki-roots.workspace = true

 tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
 chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }
--- a/s3_scrubber/src/checks.rs
+++ b/s3_scrubber/src/checks.rs
@@ -2,7 +2,7 @@ use std::collections::{HashMap, HashSet};

 use anyhow::Context;
 use aws_sdk_s3::{types::ObjectIdentifier, Client};
-use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
 use pageserver_api::shard::ShardIndex;
 use tracing::{error, info, warn};
 use utils::generation::Generation;
@@ -208,7 +208,7 @@ impl TenantObjectListing {
        &mut self,
        timeline_id: TimelineId,
        layer_file: &LayerName,
-        metadata: &LayerFileMetadata,
+        metadata: &IndexLayerMetadata,
    ) -> bool {
        let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else {
            return false;
--- a/s3_scrubber/src/scan_safekeeper_metadata.rs
+++ b/s3_scrubber/src/scan_safekeeper_metadata.rs
@@ -71,13 +71,8 @@ pub async fn scan_safekeeper_metadata(
        bucket_config.bucket, bucket_config.region, dump_db_table
    );
    // Use the native TLS implementation (Neon requires TLS)
-    let root_store = rustls::RootCertStore {
-        roots: webpki_roots::TLS_SERVER_ROOTS.to_vec(),
-    };
-    let client_config = rustls::ClientConfig::builder()
-        .with_root_certificates(root_store)
-        .with_no_client_auth();
-    let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
+    let tls_connector =
+        postgres_native_tls::MakeTlsConnector::new(native_tls::TlsConnector::new().unwrap());
    let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?;
    // The connection object performs the actual communication with the database,
    // so spawn it off to run on its own.
--- a/s3_scrubber/src/tenant_snapshot.rs
+++ b/s3_scrubber/src/tenant_snapshot.rs
@@ -11,7 +11,7 @@ use async_stream::stream;
 use aws_sdk_s3::Client;
 use camino::Utf8PathBuf;
 use futures::{StreamExt, TryStreamExt};
-use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use pageserver_api::shard::TenantShardId;
@@ -49,8 +49,8 @@ impl SnapshotDownloader {
        &self,
        ttid: TenantShardTimelineId,
        layer_name: LayerName,
-        layer_metadata: LayerFileMetadata,
-    ) -> anyhow::Result<(LayerName, LayerFileMetadata)> {
+        layer_metadata: IndexLayerMetadata,
+    ) -> anyhow::Result<(LayerName, IndexLayerMetadata)> {
        // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format.  They use
        // different layer names (remote-style has the generation suffix)
        let local_path = self.output_path.join(format!(
@@ -110,7 +110,7 @@ impl SnapshotDownloader {
    async fn download_layers(
        &self,
        ttid: TenantShardTimelineId,
-        layers: Vec<(LayerName, LayerFileMetadata)>,
+        layers: Vec<(LayerName, IndexLayerMetadata)>,
    ) -> anyhow::Result<()> {
        let layer_count = layers.len();
        tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count);
@@ -161,7 +161,10 @@ impl SnapshotDownloader {
        ttid: TenantShardTimelineId,
        index_part: Box<IndexPart>,
        index_part_generation: Generation,
-        ancestor_layers: &mut HashMap<TenantShardTimelineId, HashMap<LayerName, LayerFileMetadata>>,
+        ancestor_layers: &mut HashMap<
+            TenantShardTimelineId,
+            HashMap<LayerName, IndexLayerMetadata>,
+        >,
    ) -> anyhow::Result<()> {
        let index_bytes = serde_json::to_string(&index_part).unwrap();

@@ -231,7 +234,7 @@ impl SnapshotDownloader {
        // happen if this tenant has been split at some point)
        let mut ancestor_layers: HashMap<
            TenantShardTimelineId,
-            HashMap<LayerName, LayerFileMetadata>,
+            HashMap<LayerName, IndexLayerMetadata>,
        > = Default::default();

        for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) {
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -20,6 +20,7 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::Uri;
+use tokio::sync::mpsc;

 use tracing::*;
 use utils::pid_file;
@@ -29,13 +30,13 @@ use safekeeper::defaults::{
    DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
    DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
 };
-use safekeeper::remove_wal;
 use safekeeper::wal_service;
 use safekeeper::GlobalTimelines;
 use safekeeper::SafeKeeperConf;
 use safekeeper::{broker, WAL_SERVICE_RUNTIME};
 use safekeeper::{control_file, BROKER_RUNTIME};
 use safekeeper::{http, WAL_REMOVER_RUNTIME};
+use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
 use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
@@ -376,6 +377,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    let timeline_collector = safekeeper::metrics::TimelineCollector::new();
    metrics::register_internal(Box::new(timeline_collector))?;

+    let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
+
    wal_backup::init_remote_storage(&conf);

    // Keep handles to main tasks to die if any of them disappears.
@@ -388,9 +391,19 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
    let current_thread_rt = conf
        .current_thread_runtime
        .then(|| Handle::try_current().expect("no runtime in main"));
+    let conf_ = conf.clone();
+    let wal_backup_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
+        .spawn(wal_backup::wal_backup_launcher_task_main(
+            conf_,
+            wal_backup_launcher_rx,
+        ))
+        .map(|res| ("WAL backup launcher".to_owned(), res));
+    tasks_handles.push(Box::pin(wal_backup_handle));

    // Load all timelines from disk to memory.
-    GlobalTimelines::init(conf.clone()).await?;
+    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?;

    let conf_ = conf.clone();
    // Run everything in current thread rt, if asked.
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -46,8 +46,6 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
        return Ok(());
    }

-    let active_timelines_set = GlobalTimelines::get_global_broker_active_set();
-
    let mut client =
        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
    let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
@@ -59,9 +57,15 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
            // sensitive and there is no risk of deadlock as we don't await while
            // lock is held.
            let now = Instant::now();
-            let all_tlis = active_timelines_set.get_all();
+            let all_tlis = GlobalTimelines::get_all();
            let mut n_pushed_tlis = 0;
            for tli in &all_tlis {
+                // An alternative is to filter up front with futures::stream::iter(all_tlis)
+                //   .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::<Vec<_>>().await;
+                // but that doesn't look better, and I'm not sure how to do it without collect.
+                if !tli.is_active().await {
+                    continue;
+                }
                let sk_info = tli.get_safekeeper_info(&conf).await;
                yield sk_info;
                BROKER_PUSHED_UPDATES.inc();
@@ -86,7 +90,6 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
 }

 /// Subscribe and fetch all the interesting data from the broker.
-#[instrument(name = "broker pull", skip_all)]
 async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
    let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;

@@ -183,7 +186,6 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
                        commit_lsn: sk_info.commit_lsn,
                        safekeeper_connstr: sk_info.safekeeper_connstr,
                        availability_zone: sk_info.availability_zone,
-                        standby_horizon: 0,
                    };

                    // note this is a blocking call
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -350,7 +350,6 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
        backup_lsn: sk_info.backup_lsn.0,
        local_start_lsn: sk_info.local_start_lsn.0,
        availability_zone: None,
-        standby_horizon: sk_info.standby_horizon.0,
    };

    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -31,8 +31,6 @@ pub mod safekeeper;
 pub mod send_wal;
 pub mod state;
 pub mod timeline;
-pub mod timeline_manager;
-pub mod timelines_set;
 pub mod wal_backup;
 pub mod wal_backup_partial;
 pub mod wal_service;
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -11,9 +11,8 @@ use futures::Future;
 use metrics::{
    core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
    proto::MetricFamily,
-    register_int_counter, register_int_counter_pair, register_int_counter_pair_vec,
-    register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec,
-    IntGaugeVec,
+    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge,
+    IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec,
 };
 use once_cell::sync::Lazy;

@@ -163,29 +162,6 @@ pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
    )
    .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter")
 });
-pub static MANAGER_ITERATIONS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "safekeeper_manager_iterations_total",
-        "Number of iterations of the timeline manager task"
-    )
-    .expect("Failed to register safekeeper_manager_iterations_total counter")
-});
-pub static MANAGER_ACTIVE_CHANGES: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "safekeeper_manager_active_changes_total",
-        "Number of timeline active status changes in the timeline manager task"
-    )
-    .expect("Failed to register safekeeper_manager_active_changes_total counter")
-});
-pub static WAL_BACKUP_TASKS: Lazy<IntCounterPair> = Lazy::new(|| {
-    register_int_counter_pair!(
-        "safekeeper_wal_backup_tasks_started_total",
-        "Number of active WAL backup tasks",
-        "safekeeper_wal_backup_tasks_finished_total",
-        "Number of finished WAL backup tasks",
-    )
-    .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter")
-});

 pub const LABEL_UNKNOWN: &str = "unknown";

@@ -638,7 +614,8 @@ impl Collector for TimelineCollector {
        self.written_wal_seconds.reset();
        self.flushed_wal_seconds.reset();

-        let timelines_count = GlobalTimelines::get_all().len();
+        let timelines = GlobalTimelines::get_all();
+        let timelines_count = timelines.len();
        let mut active_timelines_count = 0;

        // Prometheus Collector is sync, and data is stored under async lock. To
@@ -769,9 +746,9 @@ impl Collector for TimelineCollector {

 async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
    let mut res = vec![];
-    let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all();
+    let timelines = GlobalTimelines::get_all();

-    for tli in active_timelines {
+    for tli in timelines {
        if let Some(info) = tli.info_for_metrics().await {
            res.push(info);
        }
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -45,9 +45,6 @@ const DEFAULT_FEEDBACK_CAPACITY: usize = 8;
 pub struct WalReceivers {
    mutex: Mutex<WalReceiversShared>,
    pageserver_feedback_tx: tokio::sync::broadcast::Sender<PageserverFeedback>,
-
-    num_computes_tx: tokio::sync::watch::Sender<usize>,
-    num_computes_rx: tokio::sync::watch::Receiver<usize>,
 }

 /// Id under which walreceiver is registered in shmem.
@@ -58,21 +55,16 @@ impl WalReceivers {
        let (pageserver_feedback_tx, _) =
            tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY);

-        let (num_computes_tx, num_computes_rx) = tokio::sync::watch::channel(0usize);
-
        Arc::new(WalReceivers {
            mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }),
            pageserver_feedback_tx,
-            num_computes_tx,
-            num_computes_rx,
        })
    }

    /// Register new walreceiver. Returned guard provides access to the slot and
    /// automatically deregisters in Drop.
    pub fn register(self: &Arc<WalReceivers>, conn_id: Option<ConnectionId>) -> WalReceiverGuard {
-        let mut shared = self.mutex.lock();
-        let slots = &mut shared.slots;
+        let slots = &mut self.mutex.lock().slots;
        let walreceiver = WalReceiverState {
            conn_id,
            status: WalReceiverStatus::Voting,
@@ -86,9 +78,6 @@ impl WalReceivers {
            slots.push(Some(walreceiver));
            pos
        };
-
-        self.update_num(&shared);
-
        WalReceiverGuard {
            id: pos,
            walreceivers: self.clone(),
@@ -110,18 +99,7 @@ impl WalReceivers {

    /// Get number of walreceivers (compute connections).
    pub fn get_num(self: &Arc<WalReceivers>) -> usize {
-        self.mutex.lock().get_num()
-    }
-
-    /// Get channel for number of walreceivers.
-    pub fn get_num_rx(self: &Arc<WalReceivers>) -> tokio::sync::watch::Receiver<usize> {
-        self.num_computes_rx.clone()
-    }
-
-    /// Should get called after every update of slots.
-    fn update_num(self: &Arc<WalReceivers>, shared: &MutexGuard<WalReceiversShared>) {
-        let num = shared.get_num();
-        self.num_computes_tx.send_replace(num);
+        self.mutex.lock().slots.iter().flatten().count()
    }

    /// Get state of all walreceivers.
@@ -145,7 +123,6 @@ impl WalReceivers {
    fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
        let mut shared = self.mutex.lock();
        shared.slots[id] = None;
-        self.update_num(&shared);
    }

    /// Broadcast pageserver feedback to connected walproposers.
@@ -160,13 +137,6 @@ struct WalReceiversShared {
    slots: Vec<Option<WalReceiverState>>,
 }

-impl WalReceiversShared {
-    /// Get number of walreceivers (compute connections).
-    fn get_num(&self) -> usize {
-        self.slots.iter().flatten().count()
-    }
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalReceiverState {
    /// None means it is recovery initiated by us (this safekeeper).
@@ -486,7 +456,14 @@ impl WalAcceptor {
    /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
    /// it must mean that network thread terminated.
    async fn run(&mut self) -> anyhow::Result<()> {
+        // Register the connection and defer unregistering it.
+        // Declaration order of the two guards matters: locals are dropped in reverse order, so
+        // our entry is removed first, and only then is the status (which depends on it) updated.
+        let _compute_conn_guard = ComputeConnectionGuard {
+            timeline: Arc::clone(&self.tli),
+        };
        let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
+        self.tli.update_status_notify().await?;

        // After this timestamp we will stop processing AppendRequests and send a response
        // to the walproposer. walproposer sends at least one AppendRequest per second,
@@ -552,3 +529,19 @@ impl WalAcceptor {
        }
    }
 }
+
+/// On drop, spawns a task that calls update_status_notify to update timeline status.
+struct ComputeConnectionGuard {
+    timeline: Arc<Timeline>,
+}
+
+impl Drop for ComputeConnectionGuard {
+    fn drop(&mut self) {
+        let tli = self.timeline.clone();
+        tokio::spawn(async move {
+            if let Err(e) = tli.update_status_notify().await {
+                error!("failed to update timeline status: {}", e);
+            }
+        });
+    }
+}
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -37,11 +37,17 @@ use crate::{
 #[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
 pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
    info!("started");
+    let mut cancellation_rx = match tli.get_cancellation_rx() {
+        Ok(rx) => rx,
+        Err(_) => {
+            info!("timeline canceled during task start");
+            return;
+        }
+    };

-    let cancel = tli.cancel.clone();
    select! {
        _ = recovery_main_loop(tli, conf) => { unreachable!() }
-        _ = cancel.cancelled() => {
+        _ = cancellation_rx.changed() => {
            info!("stopped");
        }
    }
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -7,18 +7,29 @@ use tracing::*;

 use crate::{GlobalTimelines, SafeKeeperConf};

-pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {
+const ALLOW_INACTIVE_TIMELINES: bool = true;
+
+pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
    let wal_removal_interval = Duration::from_millis(5000);
    loop {
        let now = tokio::time::Instant::now();
+        let mut active_timelines = 0;
+
        let tlis = GlobalTimelines::get_all();
        for tli in &tlis {
+            let is_active = tli.is_active().await;
+            if is_active {
+                active_timelines += 1;
+            }
+            if !ALLOW_INACTIVE_TIMELINES && !is_active {
+                continue;
+            }
            let ttid = tli.ttid;
            async {
                if let Err(e) = tli.maybe_persist_control_file().await {
                    warn!("failed to persist control file: {e}");
                }
-                if let Err(e) = tli.remove_old_wal().await {
+                if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled).await {
                    error!("failed to remove WAL: {}", e);
                }
            }
@@ -31,8 +42,8 @@ pub async fn task_main(_conf: SafeKeeperConf) -> anyhow::Result<()> {

        if elapsed > wal_removal_interval {
            info!(
-                "WAL removal is too long, processed {} timelines in {:?}",
-                total_timelines, elapsed
+                "WAL removal is too long, processed {} active timelines ({} total) in {:?}",
+                active_timelines, total_timelines, elapsed
            );
        }

--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -827,10 +827,10 @@ where

    /// Persist control file if there is something to save and enough time
    /// passed after the last save.
-    pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<bool> {
+    pub async fn maybe_persist_inmem_control_file(&mut self) -> Result<()> {
        const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
        if self.state.pers.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
-            return Ok(false);
+            return Ok(());
        }
        let need_persist = self.state.inmem.commit_lsn > self.state.commit_lsn
            || self.state.inmem.backup_lsn > self.state.backup_lsn
@@ -840,7 +840,7 @@ where
            self.state.flush().await?;
            trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
        }
-        Ok(need_persist)
+        Ok(())
    }

    /// Handle request to append WAL.
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -23,7 +23,7 @@ use utils::failpoint_support;
 use utils::id::TenantTimelineId;
 use utils::pageserver_feedback::PageserverFeedback;

-use std::cmp::{max, min};
+use std::cmp::min;
 use std::net::SocketAddr;
 use std::str;
 use std::sync::Arc;
@@ -85,17 +85,8 @@ impl StandbyReply {

 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub struct StandbyFeedback {
-    pub reply: StandbyReply,
-    pub hs_feedback: HotStandbyFeedback,
-}
-
-impl StandbyFeedback {
-    pub fn empty() -> Self {
-        StandbyFeedback {
-            reply: StandbyReply::empty(),
-            hs_feedback: HotStandbyFeedback::empty(),
-        }
-    }
+    reply: StandbyReply,
+    hs_feedback: HotStandbyFeedback,
 }

 /// WalSenders registry. Timeline holds it (wrapped in Arc).
@@ -171,8 +162,8 @@ impl WalSenders {
    }

    /// Get aggregated hot standby feedback (we send it to compute).
-    pub fn get_hotstandby(self: &Arc<WalSenders>) -> StandbyFeedback {
-        self.mutex.lock().agg_standby_feedback
+    pub fn get_hotstandby(self: &Arc<WalSenders>) -> HotStandbyFeedback {
+        self.mutex.lock().agg_hs_feedback
    }

    /// Record new pageserver feedback, update aggregated values.
@@ -193,10 +184,6 @@ impl WalSenders {
    fn record_standby_reply(self: &Arc<WalSenders>, id: WalSenderId, reply: &StandbyReply) {
        let mut shared = self.mutex.lock();
        let slot = shared.get_slot_mut(id);
-        debug!(
-            "Record standby reply: ts={} apply_lsn={}",
-            reply.reply_ts, reply.apply_lsn
-        );
        match &mut slot.feedback {
            ReplicationFeedback::Standby(sf) => sf.reply = *reply,
            ReplicationFeedback::Pageserver(_) => {
@@ -221,7 +208,7 @@ impl WalSenders {
                })
            }
        }
-        shared.update_reply_feedback();
+        shared.update_hs_feedback();
    }

    /// Get remote_consistent_lsn reported by the pageserver. Returns None if
@@ -239,13 +226,13 @@ impl WalSenders {
    fn unregister(self: &Arc<WalSenders>, id: WalSenderId) {
        let mut shared = self.mutex.lock();
        shared.slots[id] = None;
-        shared.update_reply_feedback();
+        shared.update_hs_feedback();
    }
 }

 struct WalSendersShared {
    // aggregated over all walsenders value
-    agg_standby_feedback: StandbyFeedback,
+    agg_hs_feedback: HotStandbyFeedback,
    // last feedback ever received from any pageserver, empty if none
    last_ps_feedback: PageserverFeedback,
    // total counter of pageserver feedbacks received
@@ -256,7 +243,7 @@ struct WalSendersShared {
 impl WalSendersShared {
    fn new() -> Self {
        WalSendersShared {
-            agg_standby_feedback: StandbyFeedback::empty(),
+            agg_hs_feedback: HotStandbyFeedback::empty(),
            last_ps_feedback: PageserverFeedback::empty(),
            ps_feedback_counter: 0,
            slots: Vec::new(),
@@ -273,11 +260,10 @@ impl WalSendersShared {
        self.slots[id].as_mut().expect("walsender doesn't exist")
    }

-    /// Update aggregated hot standy and normal reply feedbacks. We just take min of valid xmins
+    /// Update aggregated hot standby feedback. We just take min of valid xmins
    /// and ts.
-    fn update_reply_feedback(&mut self) {
+    fn update_hs_feedback(&mut self) {
        let mut agg = HotStandbyFeedback::empty();
-        let mut reply_agg = StandbyReply::empty();
        for ws_state in self.slots.iter().flatten() {
            if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
                let hs_feedback = standby_feedback.hs_feedback;
@@ -290,7 +276,7 @@ impl WalSendersShared {
                    } else {
                        agg.xmin = hs_feedback.xmin;
                    }
-                    agg.ts = max(agg.ts, hs_feedback.ts);
+                    agg.ts = min(agg.ts, hs_feedback.ts);
                }
                if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
                    if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
@@ -298,43 +284,11 @@ impl WalSendersShared {
                    } else {
                        agg.catalog_xmin = hs_feedback.catalog_xmin;
                    }
-                    agg.ts = max(agg.ts, hs_feedback.ts);
-                }
-                let reply = standby_feedback.reply;
-                if reply.write_lsn != Lsn::INVALID {
-                    if reply_agg.write_lsn != Lsn::INVALID {
-                        reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn);
-                    } else {
-                        reply_agg.write_lsn = reply.write_lsn;
-                    }
-                }
-                if reply.flush_lsn != Lsn::INVALID {
-                    if reply_agg.flush_lsn != Lsn::INVALID {
-                        reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn);
-                    } else {
-                        reply_agg.flush_lsn = reply.flush_lsn;
-                    }
-                }
-                if reply.apply_lsn != Lsn::INVALID {
-                    if reply_agg.apply_lsn != Lsn::INVALID {
-                        reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn);
-                    } else {
-                        reply_agg.apply_lsn = reply.apply_lsn;
-                    }
-                }
-                if reply.reply_ts != 0 {
-                    if reply_agg.reply_ts != 0 {
-                        reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts);
-                    } else {
-                        reply_agg.reply_ts = reply.reply_ts;
-                    }
+                    agg.ts = min(agg.ts, hs_feedback.ts);
                }
            }
        }
-        self.agg_standby_feedback = StandbyFeedback {
-            reply: reply_agg,
-            hs_feedback: agg,
-        };
+        self.agg_hs_feedback = agg;
    }
 }

@@ -756,15 +710,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
        match msg.first().cloned() {
            Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
                // Note: deserializing is on m[1..] because we skip the tag byte.
-                let mut hs_feedback = HotStandbyFeedback::des(&msg[1..])
+                let hs_feedback = HotStandbyFeedback::des(&msg[1..])
                    .context("failed to deserialize HotStandbyFeedback")?;
-                // TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way:
-                // pq_sendint32(&reply_message, xmin);
-                // pq_sendint32(&reply_message, xmin_epoch);
-                // So it is two big endian 32-bit words in low endian order!
-                hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32);
-                hs_feedback.catalog_xmin =
-                    (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32);
                self.ws_guard
                    .walsenders
                    .record_hs_feedback(self.ws_guard.id, &hs_feedback);
@@ -846,11 +793,8 @@ mod tests {
    fn test_hs_feedback_no_valid() {
        let mut wss = WalSendersShared::new();
        push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
-        wss.update_reply_feedback();
-        assert_eq!(
-            wss.agg_standby_feedback.hs_feedback.xmin,
-            INVALID_FULL_TRANSACTION_ID
-        );
+        wss.update_hs_feedback();
+        assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID);
    }

    #[test]
@@ -859,7 +803,7 @@ mod tests {
        push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
        push_feedback(&mut wss, hs_feedback(1, 42));
        push_feedback(&mut wss, hs_feedback(1, 64));
-        wss.update_reply_feedback();
-        assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42);
+        wss.update_hs_feedback();
+        assert_eq!(wss.agg_hs_feedback.xmin, 42);
    }
 }
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -6,15 +6,15 @@ use camino::Utf8PathBuf;
 use postgres_ffi::XLogSegNo;
 use serde::{Deserialize, Serialize};
 use tokio::fs;
-use tokio_util::sync::CancellationToken;

 use std::cmp::max;
-use std::ops::{Deref, DerefMut};
-use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 use std::time::Duration;
-use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
-use tokio::{sync::watch, time::Instant};
+use tokio::sync::{Mutex, MutexGuard};
+use tokio::{
+    sync::{mpsc::Sender, watch},
+    time::Instant,
+};
 use tracing::*;
 use utils::http::error::ApiError;
 use utils::{
@@ -33,13 +33,12 @@ use crate::safekeeper::{
 };
 use crate::send_wal::WalSenders;
 use crate::state::{TimelineMemState, TimelinePersistentState};
-use crate::timelines_set::TimelinesSet;
 use crate::wal_backup::{self};
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};

 use crate::metrics::FullTimelineInfo;
 use crate::wal_storage::Storage as wal_storage_iface;
-use crate::{debug_dump, timeline_manager, wal_backup_partial, wal_storage};
+use crate::{debug_dump, wal_backup_partial, wal_storage};
 use crate::{GlobalTimelines, SafeKeeperConf};

 /// Things safekeeper should know about timeline state on peers.
@@ -52,7 +51,8 @@ pub struct PeerInfo {
    /// LSN of the last record.
    pub flush_lsn: Lsn,
    pub commit_lsn: Lsn,
-    /// Since which LSN safekeeper has WAL.
+    /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
+    /// sk since backup_lsn.
    pub local_start_lsn: Lsn,
    /// When info was received. Serde annotations are not very useful but make
    /// the code compile -- we don't rely on this field externally.
@@ -97,79 +97,25 @@ impl PeersInfo {
    }
 }

-pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>;
-
-/// WriteGuardSharedState is a wrapper around `RwLockWriteGuard<SharedState>` that
-/// automatically updates `watch::Sender` channels with state on drop.
-pub struct WriteGuardSharedState<'a> {
-    tli: Arc<Timeline>,
-    guard: RwLockWriteGuard<'a, SharedState>,
-    skip_update: bool,
-}
-
-impl<'a> WriteGuardSharedState<'a> {
-    fn new(tli: Arc<Timeline>, guard: RwLockWriteGuard<'a, SharedState>) -> Self {
-        WriteGuardSharedState {
-            tli,
-            guard,
-            skip_update: false,
-        }
-    }
-}
-
-impl<'a> Deref for WriteGuardSharedState<'a> {
-    type Target = SharedState;
-
-    fn deref(&self) -> &Self::Target {
-        &self.guard
-    }
-}
-
-impl<'a> DerefMut for WriteGuardSharedState<'a> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.guard
-    }
-}
-
-impl<'a> Drop for WriteGuardSharedState<'a> {
-    fn drop(&mut self) {
-        let term_flush_lsn = TermLsn::from((self.guard.sk.get_term(), self.guard.sk.flush_lsn()));
-        let commit_lsn = self.guard.sk.state.inmem.commit_lsn;
-
-        let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| {
-            if *old != term_flush_lsn {
-                *old = term_flush_lsn;
-                true
-            } else {
-                false
-            }
-        });
-
-        let _ = self.tli.commit_lsn_watch_tx.send_if_modified(|old| {
-            if *old != commit_lsn {
-                *old = commit_lsn;
-                true
-            } else {
-                false
-            }
-        });
-
-        if !self.skip_update {
-            // send notification about shared state update
-            self.tli.shared_state_version_tx.send_modify(|old| {
-                *old += 1;
-            });
-        }
-    }
-}
-
 /// Shared state associated with database instance
 pub struct SharedState {
    /// Safekeeper object
-    pub(crate) sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
+    sk: SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>,
    /// In memory list containing state of peers sent in latest messages from them.
-    pub(crate) peers_info: PeersInfo,
-    pub(crate) last_removed_segno: XLogSegNo,
+    peers_info: PeersInfo,
+    /// True when WAL backup launcher oversees the timeline, making sure WAL is
+    /// offloaded; tracking this lets us bother the launcher less.
+    wal_backup_active: bool,
+    /// True whenever there is at least some pending activity on timeline: live
+    /// compute connection, pageserver is not caught up (it must have latest WAL
+    /// for new compute start) or WAL backup is not finished. Practically it
+    /// means safekeepers broadcast info to peers about the timeline, old WAL is
+    /// trimmed.
+    ///
+    /// TODO: it might be better to remove tli completely from GlobalTimelines
+    /// when tli is inactive instead of having this flag.
+    active: bool,
+    last_removed_segno: XLogSegNo,
 }

 impl SharedState {
@@ -206,6 +152,8 @@ impl SharedState {
        Ok(Self {
            sk,
            peers_info: PeersInfo(vec![]),
+            wal_backup_active: false,
+            active: false,
            last_removed_segno: 0,
        })
    }
@@ -223,10 +171,75 @@ impl SharedState {
        Ok(Self {
            sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?,
            peers_info: PeersInfo(vec![]),
+            wal_backup_active: false,
+            active: false,
            last_removed_segno: 0,
        })
    }

+    fn is_active(&self, num_computes: usize) -> bool {
+        self.is_wal_backup_required(num_computes)
+            // FIXME: add tracking of relevant pageservers and check them here individually,
+            // otherwise migration won't work (we suspend too early).
+            || self.sk.state.inmem.remote_consistent_lsn < self.sk.state.inmem.commit_lsn
+    }
+
+    /// Mark timeline active/inactive and return whether s3 offloading requires
+    /// start/stop action. If timeline is deactivated, control file is persisted
+    /// as maintenance task does that only for active timelines.
+    async fn update_status(&mut self, num_computes: usize, ttid: TenantTimelineId) -> bool {
+        let is_active = self.is_active(num_computes);
+        if self.active != is_active {
+            info!(
+                "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}",
+                ttid,
+                is_active,
+                self.sk.state.inmem.remote_consistent_lsn,
+                self.sk.state.inmem.commit_lsn
+            );
+            if !is_active {
+                if let Err(e) = self.sk.state.flush().await {
+                    warn!("control file save in update_status failed: {:?}", e);
+                }
+            }
+        }
+        self.active = is_active;
+        self.is_wal_backup_action_pending(num_computes)
+    }
+
+    /// Should we run s3 offloading in current state?
+    fn is_wal_backup_required(&self, num_computes: usize) -> bool {
+        let seg_size = self.get_wal_seg_size();
+        num_computes > 0 ||
+        // Currently only the whole segment is offloaded, so compare segment numbers.
+            (self.sk.state.inmem.commit_lsn.segment_number(seg_size) >
+             self.sk.state.inmem.backup_lsn.segment_number(seg_size))
+    }
+
+    /// Is the current state of s3 offloading different from what it ought to be?
+    fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool {
+        let res = self.wal_backup_active != self.is_wal_backup_required(num_computes);
+        if res {
+            let action_pending = if self.is_wal_backup_required(num_computes) {
+                "start"
+            } else {
+                "stop"
+            };
+            trace!(
+                "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}",
+                self.sk.state.timeline_id, action_pending, num_computes, self.sk.state.inmem.commit_lsn, self.sk.state.inmem.backup_lsn
+            );
+        }
+        res
+    }
+
+    /// Returns whether s3 offloading is required and sets current status as
+    /// matching.
+    fn wal_backup_attend(&mut self, num_computes: usize) -> bool {
+        self.wal_backup_active = self.is_wal_backup_required(num_computes);
+        self.wal_backup_active
+    }
+
    fn get_wal_seg_size(&self) -> usize {
        self.sk.state.server.wal_seg_size as usize
    }
@@ -235,7 +248,6 @@ impl SharedState {
        &self,
        ttid: &TenantTimelineId,
        conf: &SafeKeeperConf,
-        standby_apply_lsn: Lsn,
    ) -> SafekeeperTimelineInfo {
        SafekeeperTimelineInfo {
            safekeeper_id: conf.my_id.0,
@@ -258,14 +270,13 @@ impl SharedState {
            backup_lsn: self.sk.state.inmem.backup_lsn.0,
            local_start_lsn: self.sk.state.local_start_lsn.0,
            availability_zone: conf.availability_zone.clone(),
-            standby_horizon: standby_apply_lsn.0,
        }
    }

    /// Get our latest view of alive peers status on the timeline.
    /// We pass our own info through the broker as well, so when we don't have connection
    /// to the broker returned vec is empty.
-    pub(crate) fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
+    fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
        let now = Instant::now();
        self.peers_info
            .0
@@ -281,13 +292,18 @@ impl SharedState {
    /// offloading.
    /// While it is safe to use inmem values for determining horizon,
    /// we use persistent to make possible normal states less surprising.
-    fn get_horizon_segno(&self, extra_horizon_lsn: Option<Lsn>) -> XLogSegNo {
+    fn get_horizon_segno(
+        &self,
+        wal_backup_enabled: bool,
+        extra_horizon_lsn: Option<Lsn>,
+    ) -> XLogSegNo {
        let state = &self.sk.state;

        use std::cmp::min;
        let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
-        // we don't want to remove WAL that is not yet offloaded to s3
-        horizon_lsn = min(horizon_lsn, state.backup_lsn);
+        if wal_backup_enabled {
+            horizon_lsn = min(horizon_lsn, state.backup_lsn);
+        }
        if let Some(extra_horizon_lsn) = extra_horizon_lsn {
            horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
        }
@@ -328,6 +344,11 @@ impl From<TimelineError> for ApiError {
 pub struct Timeline {
    pub ttid: TenantTimelineId,

+    /// Sending here asks for wal backup launcher attention (start/stop
+    /// offloading). Sending ttid instead of a concrete command allows
+    /// sending without holding the timeline lock.
+    pub wal_backup_launcher_tx: Sender<TenantTimelineId>,
+
    /// Used to broadcast commit_lsn updates to all background jobs.
    commit_lsn_watch_tx: watch::Sender<Lsn>,
    commit_lsn_watch_rx: watch::Receiver<Lsn>,
@@ -339,19 +360,19 @@ pub struct Timeline {
    term_flush_lsn_watch_tx: watch::Sender<TermLsn>,
    term_flush_lsn_watch_rx: watch::Receiver<TermLsn>,

-    /// Broadcasts shared state updates.
-    shared_state_version_tx: watch::Sender<usize>,
-    shared_state_version_rx: watch::Receiver<usize>,
-
    /// Safekeeper and other state, that should remain consistent and
    /// synchronized with the disk. This is tokio mutex as we write WAL to disk
    /// while holding it, ensuring that consensus checks are in order.
-    mutex: RwLock<SharedState>,
+    mutex: Mutex<SharedState>,
    walsenders: Arc<WalSenders>,
    walreceivers: Arc<WalReceivers>,

-    /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires
-    pub(crate) cancel: CancellationToken,
+    /// Cancellation channel. Delete/cancel will send `true` here as a cancellation signal.
+    cancellation_tx: watch::Sender<bool>,
+
+    /// Timeline should not be used after cancellation. Background tasks should
+    /// monitor this channel and stop eventually after receiving `true` from this channel.
+    cancellation_rx: watch::Receiver<bool>,

    /// Directory where timeline state is stored.
    pub timeline_dir: Utf8PathBuf,
@@ -361,15 +382,15 @@ pub struct Timeline {
    /// with different speed.
    // TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
    walsenders_keep_horizon: bool,
-
-    // timeline_manager controlled state
-    pub(crate) broker_active: AtomicBool,
-    pub(crate) wal_backup_active: AtomicBool,
 }

 impl Timeline {
    /// Load existing timeline from disk.
-    pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Timeline> {
+    pub fn load_timeline(
+        conf: &SafeKeeperConf,
+        ttid: TenantTimelineId,
+        wal_backup_launcher_tx: Sender<TenantTimelineId>,
+    ) -> Result<Timeline> {
        let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();

        let shared_state = SharedState::restore(conf, &ttid)?;
@@ -379,25 +400,23 @@ impl Timeline {
            shared_state.sk.get_term(),
            shared_state.sk.flush_lsn(),
        )));
-        let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
+        let (cancellation_tx, cancellation_rx) = watch::channel(false);

        let walreceivers = WalReceivers::new();
        Ok(Timeline {
            ttid,
+            wal_backup_launcher_tx,
            commit_lsn_watch_tx,
            commit_lsn_watch_rx,
            term_flush_lsn_watch_tx,
            term_flush_lsn_watch_rx,
-            shared_state_version_tx,
-            shared_state_version_rx,
-            mutex: RwLock::new(shared_state),
+            mutex: Mutex::new(shared_state),
            walsenders: WalSenders::new(walreceivers.clone()),
            walreceivers,
-            cancel: CancellationToken::default(),
+            cancellation_rx,
+            cancellation_tx,
            timeline_dir: conf.timeline_dir(&ttid),
            walsenders_keep_horizon: conf.walsenders_keep_horizon,
-            broker_active: AtomicBool::new(false),
-            wal_backup_active: AtomicBool::new(false),
        })
    }

@@ -405,6 +424,7 @@ impl Timeline {
    pub fn create_empty(
        conf: &SafeKeeperConf,
        ttid: TenantTimelineId,
+        wal_backup_launcher_tx: Sender<TenantTimelineId>,
        server_info: ServerInfo,
        commit_lsn: Lsn,
        local_start_lsn: Lsn,
@@ -412,28 +432,25 @@ impl Timeline {
        let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
        let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
            watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
-        let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0);
-
+        let (cancellation_tx, cancellation_rx) = watch::channel(false);
        let state =
            TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);

        let walreceivers = WalReceivers::new();
        Ok(Timeline {
            ttid,
+            wal_backup_launcher_tx,
            commit_lsn_watch_tx,
            commit_lsn_watch_rx,
            term_flush_lsn_watch_tx,
            term_flush_lsn_watch_rx,
-            shared_state_version_tx,
-            shared_state_version_rx,
-            mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?),
+            mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
            walsenders: WalSenders::new(walreceivers.clone()),
            walreceivers,
-            cancel: CancellationToken::default(),
+            cancellation_rx,
+            cancellation_tx,
            timeline_dir: conf.timeline_dir(&ttid),
            walsenders_keep_horizon: conf.walsenders_keep_horizon,
-            broker_active: AtomicBool::new(false),
-            wal_backup_active: AtomicBool::new(false),
        })
    }

@@ -444,9 +461,8 @@ impl Timeline {
    /// and state on disk should remain unchanged.
    pub async fn init_new(
        self: &Arc<Timeline>,
-        shared_state: &mut WriteGuardSharedState<'_>,
+        shared_state: &mut MutexGuard<'_, SharedState>,
        conf: &SafeKeeperConf,
-        broker_active_set: Arc<TimelinesSet>,
    ) -> Result<()> {
        match fs::metadata(&self.timeline_dir).await {
            Ok(_) => {
@@ -477,29 +493,16 @@ impl Timeline {

            return Err(e);
        }
-        self.bootstrap(conf, broker_active_set);
+        self.bootstrap(conf);
        Ok(())
    }

-    /// Bootstrap new or existing timeline starting background tasks.
-    pub fn bootstrap(
-        self: &Arc<Timeline>,
-        conf: &SafeKeeperConf,
-        broker_active_set: Arc<TimelinesSet>,
-    ) {
-        // Start manager task which will monitor timeline state and update
-        // background tasks.
-        tokio::spawn(timeline_manager::main_task(
-            self.clone(),
-            conf.clone(),
-            broker_active_set,
-        ));
-
+    /// Bootstrap new or existing timeline starting background tasks.
+    pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
        // Start recovery task which always runs on the timeline.
        if conf.peer_recovery_enabled {
            tokio::spawn(recovery_main(self.clone(), conf.clone()));
        }
-        // TODO: migrate to timeline_manager
        if conf.is_wal_backup_enabled() && conf.partial_backup_enabled {
            tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone()));
        }
@@ -512,9 +515,10 @@ impl Timeline {
    /// deletion API endpoint is retriable.
    pub async fn delete(
        &self,
-        shared_state: &mut WriteGuardSharedState<'_>,
+        shared_state: &mut MutexGuard<'_, SharedState>,
        only_local: bool,
-    ) -> Result<bool> {
+    ) -> Result<(bool, bool)> {
+        let was_active = shared_state.active;
        self.cancel(shared_state);

        // TODO: It's better to wait for s3 offloader termination before
@@ -528,14 +532,20 @@ impl Timeline {
            wal_backup::delete_timeline(&self.ttid).await?;
        }
        let dir_existed = delete_dir(&self.timeline_dir).await?;
-        Ok(dir_existed)
+        Ok((dir_existed, was_active))
    }

    /// Cancel timeline to prevent further usage. Background tasks will stop
    /// eventually after receiving cancellation signal.
-    fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) {
+    ///
+    /// Note that we can't notify backup launcher here while holding
+    /// shared_state lock, as this is a potential deadlock: caller is
+    /// responsible for that. Generally we should probably make WAL backup tasks
+    /// shut down on their own, checking once in a while whether it is
+    /// time.
+    fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) {
        info!("timeline {} is cancelled", self.ttid);
-        self.cancel.cancel();
+        let _ = self.cancellation_tx.send(true);
        // Close associated FDs. Nobody will be able to touch timeline data once
        // it is cancelled, so WAL storage won't be opened again.
        shared_state.sk.wal_store.close();
@@ -543,16 +553,44 @@ impl Timeline {

    /// Returns if timeline is cancelled.
    pub fn is_cancelled(&self) -> bool {
-        self.cancel.is_cancelled()
+        *self.cancellation_rx.borrow()
+    }
+
+    /// Returns a watch channel that receives `true` when the timeline is cancelled.
+    /// Errors if the timeline is already cancelled when this is called.
+    pub fn get_cancellation_rx(&self) -> Result<watch::Receiver<bool>> {
+        let rx = self.cancellation_rx.clone();
+        if *rx.borrow() {
+            bail!(TimelineError::Cancelled(self.ttid));
+        }
+        Ok(rx)
    }

    /// Take a writing mutual exclusive lock on timeline shared_state.
-    pub async fn write_shared_state<'a>(self: &'a Arc<Self>) -> WriteGuardSharedState<'a> {
-        WriteGuardSharedState::new(self.clone(), self.mutex.write().await)
+    pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
+        self.mutex.lock().await
    }

-    pub async fn read_shared_state(&self) -> ReadGuardSharedState {
-        self.mutex.read().await
+    async fn update_status(&self, shared_state: &mut SharedState) -> bool {
+        shared_state
+            .update_status(self.walreceivers.get_num(), self.ttid)
+            .await
+    }
+
+    /// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
+    pub async fn update_status_notify(&self) -> Result<()> {
+        if self.is_cancelled() {
+            bail!(TimelineError::Cancelled(self.ttid));
+        }
+        let is_wal_backup_action_pending: bool = {
+            let mut shared_state = self.write_shared_state().await;
+            self.update_status(&mut shared_state).await
+        };
+        if is_wal_backup_action_pending {
+            // Can fail only if channel to a static thread got closed, which is not normal at all.
+            self.wal_backup_launcher_tx.send(self.ttid).await?;
+        }
+        Ok(())
    }

    /// Returns true if walsender should stop sending WAL to pageserver. We
@@ -564,7 +602,7 @@ impl Timeline {
        if self.is_cancelled() {
            return true;
        }
-        let shared_state = self.read_shared_state().await;
+        let shared_state = self.write_shared_state().await;
        if self.walreceivers.get_num() == 0 {
            return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet
            reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn;
@@ -572,9 +610,9 @@ impl Timeline {
        false
    }

-    /// Ensure that current term is t, erroring otherwise, and lock the state.
-    pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
-        let ss = self.read_shared_state().await;
+    /// Ensure that current term is t, erroring otherwise, and lock the state.
+    pub async fn acquire_term(&self, t: Term) -> Result<MutexGuard<SharedState>> {
+        let ss = self.write_shared_state().await;
        if ss.sk.state.acceptor_state.term != t {
            bail!(
                "failed to acquire term {}, current term {}",
@@ -585,6 +623,18 @@ impl Timeline {
        Ok(ss)
    }

+    /// Returns whether s3 offloading is required and sets current status as
+    /// matching it.
+    pub async fn wal_backup_attend(&self) -> bool {
+        if self.is_cancelled() {
+            return false;
+        }
+
+        self.write_shared_state()
+            .await
+            .wal_backup_attend(self.walreceivers.get_num())
+    }
+
    /// Returns commit_lsn watch channel.
    pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
        self.commit_lsn_watch_rx.clone()
@@ -595,14 +645,9 @@ impl Timeline {
        self.term_flush_lsn_watch_rx.clone()
    }

-    /// Returns watch channel for SharedState update version.
-    pub fn get_state_version_rx(&self) -> watch::Receiver<usize> {
-        self.shared_state_version_rx.clone()
-    }
-
    /// Pass arrived message to the safekeeper.
    pub async fn process_msg(
-        self: &Arc<Self>,
+        &self,
        msg: &ProposerAcceptorMessage,
    ) -> Result<Option<AcceptorProposerMessage>> {
        if self.is_cancelled() {
@@ -610,36 +655,53 @@ impl Timeline {
        }

        let mut rmsg: Option<AcceptorProposerMessage>;
+        let commit_lsn: Lsn;
+        let term_flush_lsn: TermLsn;
        {
            let mut shared_state = self.write_shared_state().await;
            rmsg = shared_state.sk.process_msg(msg).await?;

            // if this is AppendResponse, fill in proper hot standby feedback.
            if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
-                resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
+                resp.hs_feedback = self.walsenders.get_hotstandby();
            }
+
+            commit_lsn = shared_state.sk.state.inmem.commit_lsn;
+            term_flush_lsn =
+                TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
        }
+        self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
+        self.commit_lsn_watch_tx.send(commit_lsn)?;
        Ok(rmsg)
    }

    /// Returns wal_seg_size.
    pub async fn get_wal_seg_size(&self) -> usize {
-        self.read_shared_state().await.get_wal_seg_size()
+        self.write_shared_state().await.get_wal_seg_size()
+    }
+
+    /// Returns true only if the timeline is loaded and active.
+    pub async fn is_active(&self) -> bool {
+        if self.is_cancelled() {
+            return false;
+        }
+
+        self.write_shared_state().await.active
    }

    /// Returns state of the timeline.
    pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) {
-        let state = self.read_shared_state().await;
+        let state = self.write_shared_state().await;
        (state.sk.state.inmem.clone(), state.sk.state.clone())
    }

    /// Returns latest backup_lsn.
    pub async fn get_wal_backup_lsn(&self) -> Lsn {
-        self.read_shared_state().await.sk.state.inmem.backup_lsn
+        self.write_shared_state().await.sk.state.inmem.backup_lsn
    }

    /// Sets backup_lsn to the given value.
-    pub async fn set_wal_backup_lsn(self: &Arc<Self>, backup_lsn: Lsn) -> Result<()> {
+    pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
        if self.is_cancelled() {
            bail!(TimelineError::Cancelled(self.ttid));
        }
@@ -653,34 +715,39 @@ impl Timeline {

    /// Get safekeeper info for broadcasting to broker and other peers.
    pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
-        let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn;
-        let shared_state = self.read_shared_state().await;
-        shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn)
+        let shared_state = self.write_shared_state().await;
+        shared_state.get_safekeeper_info(&self.ttid, conf)
    }

    /// Update timeline state with peer safekeeper data.
-    pub async fn record_safekeeper_info(
-        self: &Arc<Self>,
-        sk_info: SafekeeperTimelineInfo,
-    ) -> Result<()> {
+    pub async fn record_safekeeper_info(&self, sk_info: SafekeeperTimelineInfo) -> Result<()> {
+        let is_wal_backup_action_pending: bool;
+        let commit_lsn: Lsn;
        {
            let mut shared_state = self.write_shared_state().await;
            shared_state.sk.record_safekeeper_info(&sk_info).await?;
            let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
            shared_state.peers_info.upsert(&peer_info);
+            is_wal_backup_action_pending = self.update_status(&mut shared_state).await;
+            commit_lsn = shared_state.sk.state.inmem.commit_lsn;
+        }
+        self.commit_lsn_watch_tx.send(commit_lsn)?;
+        // Wake up wal backup launcher, if it is time to stop the offloading.
+        if is_wal_backup_action_pending {
+            self.wal_backup_launcher_tx.send(self.ttid).await?;
        }
        Ok(())
    }

    /// Update in memory remote consistent lsn.
-    pub async fn update_remote_consistent_lsn(self: &Arc<Self>, candidate: Lsn) {
+    pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) {
        let mut shared_state = self.write_shared_state().await;
        shared_state.sk.state.inmem.remote_consistent_lsn =
            max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate);
    }

    pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
-        let shared_state = self.read_shared_state().await;
+        let shared_state = self.write_shared_state().await;
        shared_state.get_peers(conf.heartbeat_timeout)
    }

@@ -702,7 +769,7 @@ impl Timeline {
    /// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
    /// Thus we don't try to predict it here.
    pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
-        let ss = self.read_shared_state().await;
+        let ss = self.write_shared_state().await;
        let term = ss.sk.state.acceptor_state.term;
        let last_log_term = ss.sk.get_epoch();
        let flush_lsn = ss.sk.flush_lsn();
@@ -773,12 +840,12 @@ impl Timeline {

    /// Returns flush_lsn.
    pub async fn get_flush_lsn(&self) -> Lsn {
-        self.read_shared_state().await.sk.wal_store.flush_lsn()
+        self.write_shared_state().await.sk.wal_store.flush_lsn()
    }

    /// Delete WAL segments from disk that are no longer needed. This is determined
    /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
-    pub async fn remove_old_wal(self: &Arc<Self>) -> Result<()> {
+    pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
        if self.is_cancelled() {
            bail!(TimelineError::Cancelled(self.ttid));
        }
@@ -794,8 +861,9 @@ impl Timeline {

        let horizon_segno: XLogSegNo;
        let remover = {
-            let shared_state = self.read_shared_state().await;
-            horizon_segno = shared_state.get_horizon_segno(replication_horizon_lsn);
+            let shared_state = self.write_shared_state().await;
+            horizon_segno =
+                shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn);
            if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
                return Ok(()); // nothing to do
            }
@@ -809,11 +877,7 @@ impl Timeline {

        // update last_removed_segno
        let mut shared_state = self.write_shared_state().await;
-        if shared_state.last_removed_segno != horizon_segno {
-            shared_state.last_removed_segno = horizon_segno;
-        } else {
-            shared_state.skip_update = true;
-        }
+        shared_state.last_removed_segno = horizon_segno;
        Ok(())
    }

@@ -821,40 +885,46 @@ impl Timeline {
    /// passed after the last save. This helps to keep remote_consistent_lsn up
    /// to date so that storage nodes restart doesn't cause many pageserver ->
    /// safekeeper reconnections.
-    pub async fn maybe_persist_control_file(self: &Arc<Self>) -> Result<()> {
-        let mut guard = self.write_shared_state().await;
-        let changed = guard.sk.maybe_persist_inmem_control_file().await?;
-        guard.skip_update = !changed;
-        Ok(())
+    pub async fn maybe_persist_control_file(&self) -> Result<()> {
+        self.write_shared_state()
+            .await
+            .sk
+            .maybe_persist_inmem_control_file()
+            .await
    }

-    /// Gather timeline data for metrics.
+    /// Gather timeline data for metrics. If the timeline is not active, returns
+    /// None, since metrics are not collected for inactive timelines.
    pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
        if self.is_cancelled() {
            return None;
        }

        let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats();
-        let state = self.read_shared_state().await;
-        Some(FullTimelineInfo {
-            ttid: self.ttid,
-            ps_feedback_count,
-            last_ps_feedback,
-            wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
-            timeline_is_active: self.broker_active.load(Ordering::Relaxed),
-            num_computes: self.walreceivers.get_num() as u32,
-            last_removed_segno: state.last_removed_segno,
-            epoch_start_lsn: state.sk.epoch_start_lsn,
-            mem_state: state.sk.state.inmem.clone(),
-            persisted_state: state.sk.state.clone(),
-            flush_lsn: state.sk.wal_store.flush_lsn(),
-            wal_storage: state.sk.wal_store.get_metrics(),
-        })
+        let state = self.write_shared_state().await;
+        if state.active {
+            Some(FullTimelineInfo {
+                ttid: self.ttid,
+                ps_feedback_count,
+                last_ps_feedback,
+                wal_backup_active: state.wal_backup_active,
+                timeline_is_active: state.active,
+                num_computes: self.walreceivers.get_num() as u32,
+                last_removed_segno: state.last_removed_segno,
+                epoch_start_lsn: state.sk.epoch_start_lsn,
+                mem_state: state.sk.state.inmem.clone(),
+                persisted_state: state.sk.state.clone(),
+                flush_lsn: state.sk.wal_store.flush_lsn(),
+                wal_storage: state.sk.wal_store.get_metrics(),
+            })
+        } else {
+            None
+        }
    }

    /// Returns in-memory timeline state to build a full debug dump.
    pub async fn memory_dump(&self) -> debug_dump::Memory {
-        let state = self.read_shared_state().await;
+        let state = self.write_shared_state().await;

        let (write_lsn, write_record_lsn, flush_lsn, file_open) =
            state.sk.wal_store.internal_state();
@@ -863,8 +933,8 @@ impl Timeline {
            is_cancelled: self.is_cancelled(),
            peers_info_len: state.peers_info.0.len(),
            walsenders: self.walsenders.get_all(),
-            wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed),
-            active: self.broker_active.load(Ordering::Relaxed),
+            wal_backup_active: state.wal_backup_active,
+            active: state.active,
            num_computes: self.walreceivers.get_num() as u32,
            last_removed_segno: state.last_removed_segno,
            epoch_start_lsn: state.sk.epoch_start_lsn,
@@ -878,7 +948,7 @@ impl Timeline {

    /// Apply a function to the control file state and persist it.
    pub async fn map_control_file<T>(
-        self: &Arc<Self>,
+        &self,
        f: impl FnOnce(&mut TimelinePersistentState) -> Result<T>,
    ) -> Result<T> {
        let mut state = self.write_shared_state().await;
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -1,145 +0,0 @@
-//! The timeline manager task is responsible for managing the timeline's background tasks.
-//! It is spawned alongside each timeline and exits when the timeline is deleted.
-//! It watches for changes in the timeline state and decides when to spawn or kill background tasks.
-//! It also can manage some reactive state, like should the timeline be active for broker pushes or not.
-
-use std::{sync::Arc, time::Duration};
-
-use tracing::{info, instrument, warn};
-use utils::lsn::Lsn;
-
-use crate::{
-    metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL},
-    timeline::{PeerInfo, ReadGuardSharedState, Timeline},
-    timelines_set::TimelinesSet,
-    wal_backup::{self, WalBackupTaskHandle},
-    SafeKeeperConf,
-};
-
-pub struct StateSnapshot {
-    pub commit_lsn: Lsn,
-    pub backup_lsn: Lsn,
-    pub remote_consistent_lsn: Lsn,
-    pub peers: Vec<PeerInfo>,
-}
-
-impl StateSnapshot {
-    /// Create a new snapshot of the timeline state.
-    fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self {
-        Self {
-            commit_lsn: read_guard.sk.state.inmem.commit_lsn,
-            backup_lsn: read_guard.sk.state.inmem.backup_lsn,
-            remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn,
-            peers: read_guard.get_peers(heartbeat_timeout),
-        }
-    }
-}
-
-/// Control how often the manager task should wake up to check updates.
-/// There is no need to check for updates more often than this.
-const REFRESH_INTERVAL: Duration = Duration::from_millis(300);
-
-/// This task gets spawned alongside each timeline and is responsible for managing the timeline's
-/// background tasks.
-#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))]
-pub async fn main_task(
-    tli: Arc<Timeline>,
-    conf: SafeKeeperConf,
-    broker_active_set: Arc<TimelinesSet>,
-) {
-    scopeguard::defer! {
-        if tli.is_cancelled() {
-            info!("manager task finished");
-        } else {
-            warn!("manager task finished prematurely");
-        }
-    };
-
-    // sets whether timeline is active for broker pushes or not
-    let mut tli_broker_active = broker_active_set.guard(tli.clone());
-
-    let ttid = tli.ttid;
-    let wal_seg_size = tli.get_wal_seg_size().await;
-    let heartbeat_timeout = conf.heartbeat_timeout;
-
-    let mut state_version_rx = tli.get_state_version_rx();
-
-    let walreceivers = tli.get_walreceivers();
-    let mut num_computes_rx = walreceivers.get_num_rx();
-
-    // list of background tasks
-    let mut backup_task: Option<WalBackupTaskHandle> = None;
-
-    let last_state = 'outer: loop {
-        MANAGER_ITERATIONS_TOTAL.inc();
-
-        let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout);
-        let num_computes = *num_computes_rx.borrow();
-
-        let is_wal_backup_required =
-            wal_backup::is_wal_backup_required(wal_seg_size, num_computes, &state_snapshot);
-
-        if conf.is_wal_backup_enabled() {
-            wal_backup::update_task(
-                &conf,
-                ttid,
-                is_wal_backup_required,
-                &state_snapshot,
-                &mut backup_task,
-            )
-            .await;
-        }
-
-        let is_active = is_wal_backup_required
-            || num_computes > 0
-            || state_snapshot.remote_consistent_lsn < state_snapshot.commit_lsn;
-
-        // update the broker timeline set
-        if tli_broker_active.set(is_active) {
-            // write log if state has changed
-            info!(
-                "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
-                is_active, state_snapshot.remote_consistent_lsn, state_snapshot.commit_lsn,
-            );
-
-            MANAGER_ACTIVE_CHANGES.inc();
-
-            if !is_active {
-                // TODO: maybe use tokio::spawn?
-                if let Err(e) = tli.maybe_persist_control_file().await {
-                    warn!("control file save in update_status failed: {:?}", e);
-                }
-            }
-        }
-
-        // update the state in Arc<Timeline>
-        tli.wal_backup_active
-            .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed);
-        tli.broker_active
-            .store(is_active, std::sync::atomic::Ordering::Relaxed);
-
-        // wait until something changes. tx channels are stored under Arc, so they will not be
-        // dropped until the manager task is finished.
-        tokio::select! {
-            _ = tli.cancel.cancelled() => {
-                // timeline was deleted
-                break 'outer state_snapshot;
-            }
-            _ = async {
-                // don't wake up on every state change, but at most every REFRESH_INTERVAL
-                tokio::time::sleep(REFRESH_INTERVAL).await;
-                let _ = state_version_rx.changed().await;
-            } => {
-                // state was updated
-            }
-            _ = num_computes_rx.changed() => {
-                // number of connected computes was updated
-            }
-        }
-    };
-
-    // shutdown background tasks
-    if conf.is_wal_backup_enabled() {
-        wal_backup::update_task(&conf, ttid, false, &last_state, &mut backup_task).await;
-    }
-}
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -4,7 +4,6 @@

 use crate::safekeeper::ServerInfo;
 use crate::timeline::{Timeline, TimelineError};
-use crate::timelines_set::TimelinesSet;
 use crate::SafeKeeperConf;
 use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
@@ -12,16 +11,16 @@ use once_cell::sync::Lazy;
 use serde::Serialize;
 use std::collections::HashMap;
 use std::str::FromStr;
-use std::sync::atomic::Ordering;
 use std::sync::{Arc, Mutex};
+use tokio::sync::mpsc::Sender;
 use tracing::*;
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
 use utils::lsn::Lsn;

 struct GlobalTimelinesState {
    timelines: HashMap<TenantTimelineId, Arc<Timeline>>,
+    wal_backup_launcher_tx: Option<Sender<TenantTimelineId>>,
    conf: Option<SafeKeeperConf>,
-    broker_active_set: Arc<TimelinesSet>,
    load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
 }

@@ -37,8 +36,11 @@ impl GlobalTimelinesState {
    }

    /// Get dependencies for a timeline constructor.
-    fn get_dependencies(&self) -> (SafeKeeperConf, Arc<TimelinesSet>) {
-        (self.get_conf().clone(), self.broker_active_set.clone())
+    fn get_dependencies(&self) -> (SafeKeeperConf, Sender<TenantTimelineId>) {
+        (
+            self.get_conf().clone(),
+            self.wal_backup_launcher_tx.as_ref().unwrap().clone(),
+        )
    }

    /// Insert timeline into the map. Returns error if timeline with the same id already exists.
@@ -63,8 +65,8 @@ impl GlobalTimelinesState {
 static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
    Mutex::new(GlobalTimelinesState {
        timelines: HashMap::new(),
+        wal_backup_launcher_tx: None,
        conf: None,
-        broker_active_set: Arc::new(TimelinesSet::default()),
        load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
    })
 });
@@ -74,11 +76,16 @@ pub struct GlobalTimelines;

 impl GlobalTimelines {
    /// Inject dependencies needed for the timeline constructors and load all timelines to memory.
-    pub async fn init(conf: SafeKeeperConf) -> Result<()> {
+    pub async fn init(
+        conf: SafeKeeperConf,
+        wal_backup_launcher_tx: Sender<TenantTimelineId>,
+    ) -> Result<()> {
        // clippy isn't smart enough to understand that drop(state) releases the
        // lock, so use explicit block
        let tenants_dir = {
            let mut state = TIMELINES_STATE.lock().unwrap();
+            assert!(state.wal_backup_launcher_tx.is_none());
+            state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
            state.conf = Some(conf);

            // Iterate through all directories and load tenants for all directories
@@ -122,9 +129,12 @@ impl GlobalTimelines {
    /// this function is called during init when nothing else is running, so
    /// this is fine.
    async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
-        let (conf, broker_active_set) = {
+        let (conf, wal_backup_launcher_tx) = {
            let state = TIMELINES_STATE.lock().unwrap();
-            state.get_dependencies()
+            (
+                state.get_conf().clone(),
+                state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
+            )
        };

        let timelines_dir = conf.tenant_dir(&tenant_id);
@@ -137,7 +147,7 @@ impl GlobalTimelines {
                        TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
                    {
                        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-                        match Timeline::load_timeline(&conf, ttid) {
+                        match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) {
                            Ok(timeline) => {
                                let tli = Arc::new(timeline);
                                TIMELINES_STATE
@@ -145,7 +155,8 @@ impl GlobalTimelines {
                                    .unwrap()
                                    .timelines
                                    .insert(ttid, tli.clone());
-                                tli.bootstrap(&conf, broker_active_set.clone());
+                                tli.bootstrap(&conf);
+                                tli.update_status_notify().await.unwrap();
                            }
                            // If we can't load a timeline, it's most likely because of a corrupted
                            // directory. We will log an error and won't allow to delete/recreate
@@ -178,9 +189,9 @@ impl GlobalTimelines {
        _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>,
        ttid: TenantTimelineId,
    ) -> Result<Arc<Timeline>> {
-        let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies();
+        let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();

-        match Timeline::load_timeline(&conf, ttid) {
+        match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
            Ok(timeline) => {
                let tli = Arc::new(timeline);

@@ -191,7 +202,7 @@ impl GlobalTimelines {
                    .timelines
                    .insert(ttid, tli.clone());

-                tli.bootstrap(&conf, broker_active_set);
+                tli.bootstrap(&conf);

                Ok(tli)
            }
@@ -210,10 +221,6 @@ impl GlobalTimelines {
        TIMELINES_STATE.lock().unwrap().get_conf().clone()
    }

-    pub fn get_global_broker_active_set() -> Arc<TimelinesSet> {
-        TIMELINES_STATE.lock().unwrap().broker_active_set.clone()
-    }
-
    /// Create a new timeline with the given id. If the timeline already exists, returns
    /// an existing timeline.
    pub async fn create(
@@ -222,7 +229,7 @@ impl GlobalTimelines {
        commit_lsn: Lsn,
        local_start_lsn: Lsn,
    ) -> Result<Arc<Timeline>> {
-        let (conf, broker_active_set) = {
+        let (conf, wal_backup_launcher_tx) = {
            let state = TIMELINES_STATE.lock().unwrap();
            if let Ok(timeline) = state.get(&ttid) {
                // Timeline already exists, return it.
@@ -236,6 +243,7 @@ impl GlobalTimelines {
        let timeline = Arc::new(Timeline::create_empty(
            &conf,
            ttid,
+            wal_backup_launcher_tx,
            server_info,
            commit_lsn,
            local_start_lsn,
@@ -256,10 +264,7 @@ impl GlobalTimelines {
            // Write the new timeline to the disk and start background workers.
            // Bootstrap is transactional, so if it fails, the timeline will be deleted,
            // and the state on disk should remain unchanged.
-            if let Err(e) = timeline
-                .init_new(&mut shared_state, &conf, broker_active_set)
-                .await
-            {
+            if let Err(e) = timeline.init_new(&mut shared_state, &conf).await {
                // Note: the most likely reason for init failure is that the timeline
                // directory already exists on disk. This happens when timeline is corrupted
                // and wasn't loaded from disk on startup because of that. We want to preserve
@@ -276,6 +281,8 @@ impl GlobalTimelines {
            // We are done with bootstrap, release the lock, return the timeline.
            // {} block forces release before .await
        }
+        timeline.update_status_notify().await?;
+        timeline.wal_backup_launcher_tx.send(timeline.ttid).await?;
        Ok(timeline)
    }

@@ -328,13 +335,12 @@ impl GlobalTimelines {
        let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
        match tli_res {
            Ok(timeline) => {
-                let was_active = timeline.broker_active.load(Ordering::Relaxed);
-
                // Take a lock and finish the deletion holding this mutex.
                let mut shared_state = timeline.write_shared_state().await;

                info!("deleting timeline {}, only_local={}", ttid, only_local);
-                let dir_existed = timeline.delete(&mut shared_state, only_local).await?;
+                let (dir_existed, was_active) =
+                    timeline.delete(&mut shared_state, only_local).await?;

                // Remove timeline from the map.
                // FIXME: re-enable it once we fix the issue with recreation of deleted timelines
@@ -343,7 +349,7 @@ impl GlobalTimelines {

                Ok(TimelineDeleteForceResult {
                    dir_existed,
-                    was_active, // TODO: we probably should remove this field
+                    was_active,
                })
            }
            Err(_) => {
--- a/safekeeper/src/timelines_set.rs
+++ b/safekeeper/src/timelines_set.rs
@@ -1,90 +0,0 @@
-use std::{collections::HashMap, sync::Arc};
-
-use utils::id::TenantTimelineId;
-
-use crate::timeline::Timeline;
-
-/// Set of timelines, supports operations:
-/// - add timeline
-/// - remove timeline
-/// - clone the set
-///
-/// Usually used for keeping subset of timelines. For example active timelines that require broker push.
-pub struct TimelinesSet {
-    timelines: std::sync::Mutex<HashMap<TenantTimelineId, Arc<Timeline>>>,
-}
-
-impl Default for TimelinesSet {
-    fn default() -> Self {
-        Self {
-            timelines: std::sync::Mutex::new(HashMap::new()),
-        }
-    }
-}
-
-impl TimelinesSet {
-    pub fn insert(&self, tli: Arc<Timeline>) {
-        self.timelines.lock().unwrap().insert(tli.ttid, tli);
-    }
-
-    pub fn delete(&self, ttid: &TenantTimelineId) {
-        self.timelines.lock().unwrap().remove(ttid);
-    }
-
-    /// If present is true, adds timeline to the set, otherwise removes it.
-    pub fn set_present(&self, tli: Arc<Timeline>, present: bool) {
-        if present {
-            self.insert(tli);
-        } else {
-            self.delete(&tli.ttid);
-        }
-    }
-
-    pub fn is_present(&self, ttid: &TenantTimelineId) -> bool {
-        self.timelines.lock().unwrap().contains_key(ttid)
-    }
-
-    /// Returns all timelines in the set.
-    pub fn get_all(&self) -> Vec<Arc<Timeline>> {
-        self.timelines.lock().unwrap().values().cloned().collect()
-    }
-
-    /// Returns a timeline guard for easy presence control.
-    pub fn guard(self: &Arc<Self>, tli: Arc<Timeline>) -> TimelineSetGuard {
-        let is_present = self.is_present(&tli.ttid);
-        TimelineSetGuard {
-            timelines_set: self.clone(),
-            tli,
-            is_present,
-        }
-    }
-}
-
-/// Guard is used to add or remove timeline from the set.
-/// If the timeline present in set, it will be removed from it on drop.
-/// Note: do not use more than one guard for the same timeline, it caches the presence state.
-/// It is designed to be used in the manager task only.
-pub struct TimelineSetGuard {
-    timelines_set: Arc<TimelinesSet>,
-    tli: Arc<Timeline>,
-    is_present: bool,
-}
-
-impl TimelineSetGuard {
-    /// Returns true if the state was changed.
-    pub fn set(&mut self, present: bool) -> bool {
-        if present == self.is_present {
-            return false;
-        }
-        self.is_present = present;
-        self.timelines_set.set_present(self.tli.clone(), present);
-        true
-    }
-}
-
-impl Drop for TimelineSetGuard {
-    fn drop(&mut self) {
-        // remove timeline from the map on drop
-        self.timelines_set.delete(&self.tli.ttid);
-    }
-}
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -9,7 +9,7 @@ use utils::backoff;
 use utils::id::NodeId;

 use std::cmp::min;
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::sync::Arc;
@@ -29,10 +29,9 @@ use tracing::*;

 use utils::{id::TenantTimelineId, lsn::Lsn};

-use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
+use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS};
 use crate::timeline::{PeerInfo, Timeline};
-use crate::timeline_manager::StateSnapshot;
-use crate::{GlobalTimelines, SafeKeeperConf, WAL_BACKUP_RUNTIME};
+use crate::{GlobalTimelines, SafeKeeperConf};

 use once_cell::sync::OnceCell;

@@ -42,84 +41,35 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 const BUFFER_SIZE: usize = 32 * 1024;

-pub struct WalBackupTaskHandle {
+/// Check whether wal backup is required for timeline. If yes, mark that launcher is
+/// aware of current status and return the timeline.
+async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
+    match GlobalTimelines::get(ttid).ok() {
+        Some(tli) => {
+            tli.wal_backup_attend().await;
+            Some(tli)
+        }
+        None => None,
+    }
+}
+
+struct WalBackupTaskHandle {
    shutdown_tx: Sender<()>,
    handle: JoinHandle<()>,
 }

-/// Do we have anything to upload to S3, i.e. should safekeepers run backup activity?
-pub fn is_wal_backup_required(
-    wal_seg_size: usize,
-    num_computes: usize,
-    state: &StateSnapshot,
-) -> bool {
-    num_computes > 0 ||
-    // Currently only the whole segment is offloaded, so compare segment numbers.
-    (state.commit_lsn.segment_number(wal_seg_size) > state.backup_lsn.segment_number(wal_seg_size))
+struct WalBackupTimelineEntry {
+    timeline: Arc<Timeline>,
+    handle: Option<WalBackupTaskHandle>,
 }

-/// Based on peer information determine which safekeeper should offload; if it
-/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
-/// is running, kill it.
-pub async fn update_task(
-    conf: &SafeKeeperConf,
-    ttid: TenantTimelineId,
-    need_backup: bool,
-    state: &StateSnapshot,
-    entry: &mut Option<WalBackupTaskHandle>,
-) {
-    let (offloader, election_dbg_str) =
-        determine_offloader(&state.peers, state.backup_lsn, ttid, conf);
-    let elected_me = Some(conf.my_id) == offloader;
-
-    let should_task_run = need_backup && elected_me;
-
-    // start or stop the task
-    if should_task_run != (entry.is_some()) {
-        if should_task_run {
-            info!("elected for backup: {}", election_dbg_str);
-
-            let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
-            let timeline_dir = conf.timeline_dir(&ttid);
-
-            let async_task = backup_task_main(
-                ttid,
-                timeline_dir,
-                conf.workdir.clone(),
-                conf.backup_parallel_jobs,
-                shutdown_rx,
-            );
-
-            let handle = if conf.current_thread_runtime {
-                tokio::spawn(async_task)
-            } else {
-                WAL_BACKUP_RUNTIME.spawn(async_task)
-            };
-
-            *entry = Some(WalBackupTaskHandle {
-                shutdown_tx,
-                handle,
-            });
-        } else {
-            if !need_backup {
-                // don't need backup at all
-                info!("stepping down from backup, need_backup={}", need_backup);
-            } else {
-                // someone else has been elected
-                info!("stepping down from backup: {}", election_dbg_str);
-            }
-            shut_down_task(entry).await;
-        }
-    }
-}
-
-async fn shut_down_task(entry: &mut Option<WalBackupTaskHandle>) {
-    if let Some(wb_handle) = entry.take() {
+async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) {
+    if let Some(wb_handle) = entry.handle.take() {
        // Tell the task to shutdown. Error means task exited earlier, that's ok.
        let _ = wb_handle.shutdown_tx.send(()).await;
        // Await the task itself. TODO: restart panicked tasks earlier.
        if let Err(e) = wb_handle.handle.await {
-            warn!("WAL backup task panicked: {}", e);
+            warn!("WAL backup task for {} panicked: {}", ttid, e);
        }
    }
 }
@@ -176,6 +126,49 @@ fn determine_offloader(
    }
 }

+/// Using peer information, determine which safekeeper should offload. If it
+/// is me and the (per timeline) task is not running yet, start it. OTOH, if it
+/// is not me and the task is running, shut it down.
+async fn update_task(
+    conf: &SafeKeeperConf,
+    ttid: TenantTimelineId,
+    entry: &mut WalBackupTimelineEntry,
+) {
+    let alive_peers = entry.timeline.get_peers(conf).await;
+    let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await;
+    let (offloader, election_dbg_str) =
+        determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf);
+    let elected_me = Some(conf.my_id) == offloader;
+
+    if elected_me != (entry.handle.is_some()) {
+        if elected_me {
+            info!("elected for backup: {}", election_dbg_str);
+
+            let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
+            let timeline_dir = conf.timeline_dir(&ttid);
+
+            let handle = tokio::spawn(
+                backup_task_main(
+                    ttid,
+                    timeline_dir,
+                    conf.workdir.clone(),
+                    conf.backup_parallel_jobs,
+                    shutdown_rx,
+                )
+                .in_current_span(),
+            );
+
+            entry.handle = Some(WalBackupTaskHandle {
+                shutdown_tx,
+                handle,
+            });
+        } else {
+            info!("stepping down from backup: {}", election_dbg_str);
+            shut_down_task(ttid, entry).await;
+        }
+    }
+}
+
 static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();

 // Storage must be configured and initialized when this is called.
@@ -197,6 +190,67 @@ pub fn init_remote_storage(conf: &SafeKeeperConf) {
    });
 }

+const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
+
+/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
+/// tasks. Having this in a separate task simplifies locking, allows reaping
+/// panics and separates elections from offloading itself.
+pub async fn wal_backup_launcher_task_main(
+    conf: SafeKeeperConf,
+    mut wal_backup_launcher_rx: Receiver<TenantTimelineId>,
+) -> anyhow::Result<()> {
+    info!(
+        "WAL backup launcher started, remote config {:?}",
+        conf.remote_storage
+    );
+
+    // Presence in this map means launcher is aware s3 offloading is needed for
+    // the timeline, but task is started only if it makes sense to offload
+    // from this safekeeper.
+    let mut tasks: HashMap<TenantTimelineId, WalBackupTimelineEntry> = HashMap::new();
+
+    let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC));
+    loop {
+        tokio::select! {
+            ttid = wal_backup_launcher_rx.recv() => {
+                // channel is never expected to get closed
+                let ttid = ttid.unwrap();
+                if !conf.is_wal_backup_enabled() {
+                    continue; /* just drain the channel and do nothing */
+                }
+                async {
+                    let timeline = is_wal_backup_required(ttid).await;
+                    // do we need to do anything at all?
+                    if timeline.is_some() != tasks.contains_key(&ttid) {
+                        if let Some(timeline) = timeline {
+                            // need to start the task
+                            let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry {
+                                timeline,
+                                handle: None,
+                            });
+                            update_task(&conf, ttid, entry).await;
+                        } else {
+                            // need to stop the task
+                            info!("stopping WAL backup task");
+                            let mut entry = tasks.remove(&ttid).unwrap();
+                            shut_down_task(ttid, &mut entry).await;
+                        }
+                    }
+                }.instrument(info_span!("WAL backup", ttid = %ttid)).await;
+            }
+            // For each timeline needing offloading, check if this safekeeper
+            // should do the job and start/stop the task accordingly.
+            _ = ticker.tick() => {
+                for (ttid, entry) in tasks.iter_mut() {
+                    update_task(&conf, *ttid, entry)
+                        .instrument(info_span!("WAL backup", ttid = %ttid))
+                        .await;
+                }
+            }
+        }
+    }
+}
+
 struct WalBackupTask {
    timeline: Arc<Timeline>,
    timeline_dir: Utf8PathBuf,
@@ -207,7 +261,6 @@ struct WalBackupTask {
 }

 /// Offload single timeline.
-#[instrument(name = "WAL backup", skip_all, fields(ttid = %ttid))]
 async fn backup_task_main(
    ttid: TenantTimelineId,
    timeline_dir: Utf8PathBuf,
@@ -215,8 +268,6 @@ async fn backup_task_main(
    parallel_jobs: usize,
    mut shutdown_rx: Receiver<()>,
 ) {
-    let _guard = WAL_BACKUP_TASKS.guard();
-
    info!("started");
    let res = GlobalTimelines::get(ttid);
    if let Err(e) = res {
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -277,6 +277,14 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
    debug!("started");
    let await_duration = conf.partial_backup_timeout;

+    let mut cancellation_rx = match tli.get_cancellation_rx() {
+        Ok(rx) => rx,
+        Err(_) => {
+            info!("timeline canceled during task start");
+            return;
+        }
+    };
+
    // sleep for random time to avoid thundering herd
    {
        let randf64 = rand::thread_rng().gen_range(0.0..1.0);
@@ -319,7 +327,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
                && flush_lsn_rx.borrow().term == seg.term
            {
                tokio::select! {
-                    _ = backup.tli.cancel.cancelled() => {
+                    _ = cancellation_rx.changed() => {
                        info!("timeline canceled");
                        return;
                    }
@@ -332,7 +340,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
        // if we don't have any data and zero LSNs, wait for something
        while flush_lsn_rx.borrow().lsn == Lsn(0) {
            tokio::select! {
-                _ = backup.tli.cancel.cancelled() => {
+                _ = cancellation_rx.changed() => {
                    info!("timeline canceled");
                    return;
                }
@@ -349,7 +357,7 @@ pub async fn main_task(tli: Arc<Timeline>, conf: SafeKeeperConf) {
        // waiting until timeout expires OR segno changes
        'inner: loop {
            tokio::select! {
-                _ = backup.tli.cancel.cancelled() => {
+                _ = cancellation_rx.changed() => {
                    info!("timeline canceled");
                    return;
                }
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -147,7 +147,6 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
                http_connstr: "zenith-1-sk-1.local:7677".to_owned(),
                local_start_lsn: 0,
                availability_zone: None,
-                standby_horizon: 0,
            };
            counter += 1;
            yield info;
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -42,7 +42,6 @@ message SafekeeperTimelineInfo {
    uint64 remote_consistent_lsn = 7;
    uint64 peer_horizon_lsn = 8;
    uint64 local_start_lsn = 9;
-    uint64 standby_horizon = 14;
    // A connection string to use for WAL receiving.
    string safekeeper_connstr = 10;
    // HTTP endpoint connection string
@@ -106,6 +105,4 @@ message SafekeeperDiscoveryResponse {
    string safekeeper_connstr = 4;
    // Availability zone of a safekeeper.
    optional string availability_zone = 5;
-    // Replica apply LSN
-    uint64 standby_horizon = 6;
 }
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -736,7 +736,6 @@ mod tests {
            http_connstr: "neon-1-sk-1.local:7677".to_owned(),
            local_start_lsn: 0,
            availability_zone: None,
-            standby_horizon: 0,
        })
    }

--- a/Show More
+++ b/Show More