Increase partial backup timeout to 3 hours

2026-02-02 10:10:37 +00:00 · 2024-05-13 16:57:31 +02:00
198 changed files with 4291 additions and 10246 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -17,7 +17,6 @@
 !libs/
 !neon_local/
 !pageserver/
-!patches/
 !pgxn/
 !proxy/
 !s3_scrubber/
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -5,7 +5,6 @@ self-hosted-runner:
    - large
    - large-arm64
    - small
-    - small-arm64
    - us-east-2
 config-variables:
  - REMOTE_STORAGE_AZURE_CONTAINER
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -3,13 +3,13 @@ description: 'Create Branch using API'

 inputs:
  api_key:
-    description: 'Neon API key'
+    desctiption: 'Neon API key'
    required: true
  project_id:
-    description: 'ID of the Project to create Branch in'
+    desctiption: 'ID of the Project to create Branch in'
    required: true
  api_host:
-    description: 'Neon API host'
+    desctiption: 'Neon API host'
    default: console-stage.neon.build
 outputs:
  dsn:
--- a/.github/actions/neon-branch-delete/action.yml
+++ b/.github/actions/neon-branch-delete/action.yml
@@ -3,16 +3,16 @@ description: 'Delete Branch using API'

 inputs:
  api_key:
-    description: 'Neon API key'
+    desctiption: 'Neon API key'
    required: true
  project_id:
-    description: 'ID of the Project which should be deleted'
+    desctiption: 'ID of the Project which should be deleted'
    required: true
  branch_id:
-    description: 'ID of the branch to delete'
+    desctiption: 'ID of the branch to delete'
    required: true
  api_host:
-    description: 'Neon API host'
+    desctiption: 'Neon API host'
    default: console-stage.neon.build

 runs:
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -3,22 +3,22 @@ description: 'Create Neon Project using API'

 inputs:
  api_key:
-    description: 'Neon API key'
+    desctiption: 'Neon API key'
    required: true
  region_id:
-    description: 'Region ID, if not set the project will be created in the default region'
+    desctiption: 'Region ID, if not set the project will be created in the default region'
    default: aws-us-east-2
  postgres_version:
-    description: 'Postgres version; default is 15'
-    default: '15'
+    desctiption: 'Postgres version; default is 15'
+    default: 15
  api_host:
-    description: 'Neon API host'
+    desctiption: 'Neon API host'
    default: console-stage.neon.build
  provisioner:
-    description: 'k8s-pod or k8s-neonvm'
+    desctiption: 'k8s-pod or k8s-neonvm'
    default: 'k8s-pod'
  compute_units:
-    description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
+    desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
    default: '[1, 1]'

 outputs:
--- a/.github/actions/neon-project-delete/action.yml
+++ b/.github/actions/neon-project-delete/action.yml
@@ -3,13 +3,13 @@ description: 'Delete Neon Project using API'

 inputs:
  api_key:
-    description: 'Neon API key'
+    desctiption: 'Neon API key'
    required: true
  project_id:
-    description: 'ID of the Project to delete'
+    desctiption: 'ID of the Project to delete'
    required: true
  api_host:
-    description: 'Neon API host'
+    desctiption: 'Neon API host'
    default: console-stage.neon.build

 runs:
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -432,9 +432,8 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        build_type: [ debug ]
-        pg_version: [ v15 ]
-        pageserver_compaction_algorithm_kind: [ "legacy", "tiered" ]
+        build_type: [ debug, release ]
+        pg_version: [ v14, v15, v16 ]
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -462,9 +461,6 @@ jobs:
          PAGESERVER_GET_VECTORED_IMPL: vectored
          PAGESERVER_GET_IMPL: vectored
          PAGESERVER_VALIDATE_VEC_GET: true
-          PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM: 'kind="${{ matrix.pageserver_compaction_algorithm_kind }}"'
-          # catch the tests that override `tenant_config` as a whole without specifying the compaction algorithm `kind`
-          NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM: true

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
@@ -552,7 +548,7 @@ jobs:

  report-benchmarks-failures:
    needs: [ benchmarks, create-test-report ]
-    if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
+    if: github.ref_name == 'main' && failure()
    runs-on: ubuntu-latest

    steps:
@@ -727,13 +723,9 @@ jobs:
    uses: ./.github/workflows/trigger-e2e-tests.yml
    secrets: inherit

-  neon-image-arch:
+  neon-image:
    needs: [ check-permissions, build-build-tools-image, tag ]
-    strategy:
-      matrix:
-        arch: [ x64, arm64 ]
-
-    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    runs-on: [ self-hosted, gen3, large ]

    steps:
      - name: Checkout
@@ -755,6 +747,12 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
      - uses: docker/build-push-action@v5
        with:
          context: .
@@ -766,52 +764,25 @@ jobs:
          push: true
          pull: true
          file: Dockerfile
-          cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max
+          cache-from: type=registry,ref=neondatabase/neon:cache
+          cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
          tags: |
-            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+            neondatabase/neon:${{needs.tag.outputs.build-tag}}

      - name: Remove custom docker config directory
        if: always()
        run: |
          rm -rf .docker-custom

-  neon-image:
-    needs: [ neon-image-arch, tag ]
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - name: Create multi-arch image
-        run: |
-          docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
-
-      - uses: docker/login-action@v3
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - name: Push multi-arch image to ECR
-        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \
-                                                                                neondatabase/neon:${{ needs.tag.outputs.build-tag }}
-
-  compute-node-image-arch:
+  compute-node-image:
    needs: [ check-permissions, build-build-tools-image, tag ]
+    runs-on: [ self-hosted, gen3, large ]
+
    strategy:
      fail-fast: false
      matrix:
        version: [ v14, v15, v16 ]
-        arch: [ x64, arm64 ]
-
-    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}

    steps:
      - name: Checkout
@@ -858,14 +829,15 @@ jobs:
          push: true
          pull: true
          file: Dockerfile.compute-node
-          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
-          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
+          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
+          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
          tags: |
-            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
-        if: matrix.version == 'v16'
+        if: ${{ matrix.version == 'v16' }}
        uses: docker/build-push-action@v5
        with:
          target: compute-tools-image
@@ -879,57 +851,14 @@ jobs:
          pull: true
          file: Dockerfile.compute-node
          tags: |
-            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }}
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}

      - name: Remove custom docker config directory
        if: always()
        run: |
          rm -rf .docker-custom

-  compute-node-image:
-    needs: [ compute-node-image-arch, tag ]
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        version: [ v14, v15, v16 ]
-
-    steps:
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - name: Create multi-arch compute-node image
-        run: |
-          docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
-
-      - name: Create multi-arch compute-tools image
-        if: matrix.version == 'v16'
-        run: |
-          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
-
-      - uses: docker/login-action@v3
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
-        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                                                                neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
-
-      - name: Push multi-arch compute-tools image to ECR
-        if: matrix.version == 'v16'
-        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
-                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
-
  vm-compute-node-image:
    needs: [ check-permissions, tag, compute-node-image ]
    runs-on: [ self-hosted, gen3, large ]
@@ -937,8 +866,11 @@ jobs:
      fail-fast: false
      matrix:
        version: [ v14, v15, v16 ]
+    defaults:
+      run:
+        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.29.3
+      VM_BUILDER_VERSION: v0.28.1

    steps:
      - name: Checkout
@@ -951,48 +883,26 @@ jobs:
          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
          chmod +x vm-builder

-      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
-      # The default value is ~/.docker
-      - name: Set custom docker config directory
-        run: |
-          mkdir -p .docker-custom
-          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
      # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and
      # it won't have the proper authentication (written at v0.6.0)
      - name: Pulling compute-node image
        run: |
-          docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+          docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Build vm image
        run: |
          ./vm-builder \
            -spec=vm-image-spec.yaml \
-            -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-            -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
+            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Pushing vm-compute-node image
        run: |
-          docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
-
-      - name: Remove custom docker config directory
-        if: always()
-        run: |
-          rm -rf .docker-custom
+          docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

  test-images:
    needs: [ check-permissions, tag, neon-image, compute-node-image ]
-    strategy:
-      fail-fast: false
-      matrix:
-        arch: [ x64, arm64 ]
-
-    runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
+    runs-on: [ self-hosted, gen3, small ]

    steps:
      - name: Checkout
@@ -1010,7 +920,7 @@ jobs:
      - name: Verify image versions
        shell: bash # ensure no set -e for better error messages
        run: |
-          pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
+          pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")

          echo "Pageserver version string: $pageserver_version"

@@ -1036,48 +946,78 @@ jobs:

  promote-images:
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
-    runs-on: ubuntu-latest
-
-    env:
-      VERSIONS: v14 v15 v16
+    runs-on: [ self-hosted, gen3, small ]
+    container: golang:1.19-bullseye
+    # Don't add if-condition here.
+    # The job should always be run because we have dependant other jobs that shouldn't be skipped

    steps:
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - uses: docker/login-action@v3
-        with:
-          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
-          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-      - name: Copy vm-compute-node images to ECR
+      - name: Install Crane & ECR helper
        run: |
-          for version in ${VERSIONS}; do
-            docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \
-                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
-          done
+          go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
+          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
+
+      - name: Configure ECR login
+        run: |
+          mkdir /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+      - name: Copy vm-compute-node images to Docker Hub
+        run: |
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15
+          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16

      - name: Add latest tag to images
-        if: github.ref_name == 'main'
+        if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
        run: |
-          for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do
-            docker buildx imagetools create -t $repo/neon:latest \
-                                               $repo/neon:${{ needs.tag.outputs.build-tag }}
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

-            docker buildx imagetools create -t $repo/compute-tools:latest \
-                                               $repo/compute-tools:${{ needs.tag.outputs.build-tag }}
+      - name: Push images to production ECR
+        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        run: |
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest
+          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest

-            for version in ${VERSIONS}; do
-              docker buildx imagetools create -t $repo/compute-node-${version}:latest \
-                                                 $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
+      - name: Configure Docker Hub login
+        run: |
+          # ECR Credential Helper & Docker Hub don't work together in config, hence reset
+          echo "" > /github/home/.docker/config.json
+          crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io

-              docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \
-                                                 $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
-            done
-          done
+      - name: Push vm-compute-node to Docker Hub
+        run: |
+          crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}}
+          crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}}
+          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}
+
+      - name: Push latest tags to Docker Hub
+        if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        run: |
+          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+          crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest
+
+      - name: Cleanup ECR folder
+        run: rm -rf ~/.ecr

  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -136,7 +136,7 @@ jobs:
  check-linux-arm-build:
    needs: [ check-permissions, build-build-tools-image ]
    timeout-minutes: 90
-    runs-on: [ self-hosted, small-arm64 ]
+    runs-on: [ self-hosted, large-arm64 ]

    env:
      # Use release build only, to have less debug info around
@@ -260,7 +260,7 @@ jobs:
  check-codestyle-rust-arm:
    needs: [ check-permissions, build-build-tools-image ]
    timeout-minutes: 90
-    runs-on: [ self-hosted, small-arm64 ]
+    runs-on: [ self-hosted, large-arm64 ]

    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -53,7 +53,7 @@ jobs:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
        cat << EOF > body.md
-          ## Storage & Compute release ${RELEASE_DATE}
+          ## Release ${RELEASE_DATE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -708,7 +708,7 @@ dependencies = [
 "sha1",
 "sync_wrapper",
 "tokio",
- "tokio-tungstenite",
+ "tokio-tungstenite 0.20.0",
 "tower",
 "tower-layer",
 "tower-service",
@@ -979,12 +979,6 @@ version = "3.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"

-[[package]]
-name = "bytemuck"
-version = "1.16.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
-
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@@ -1072,9 +1066,9 @@ dependencies = [

 [[package]]
 name = "chrono"
-version = "0.4.38"
+version = "0.4.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
+checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
 dependencies = [
 "android-tzdata",
 "iana-time-zone",
@@ -1082,7 +1076,7 @@ dependencies = [
 "num-traits",
 "serde",
 "wasm-bindgen",
- "windows-targets 0.52.4",
+ "windows-targets 0.48.0",
 ]

 [[package]]
@@ -1109,7 +1103,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
 dependencies = [
 "ciborium-io",
- "half 1.8.2",
+ "half",
 ]

 [[package]]
@@ -1239,10 +1233,8 @@ dependencies = [
 "serde_json",
 "signal-hook",
 "tar",
- "thiserror",
 "tokio",
 "tokio-postgres",
- "tokio-stream",
 "tokio-util",
 "toml_edit",
 "tracing",
@@ -1471,21 +1463,26 @@ dependencies = [

 [[package]]
 name = "crossbeam-deque"
-version = "0.8.5"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
+checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
 dependencies = [
+ "cfg-if",
 "crossbeam-epoch",
 "crossbeam-utils",
 ]

 [[package]]
 name = "crossbeam-epoch"
-version = "0.9.18"
+version = "0.9.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
 dependencies = [
+ "autocfg",
+ "cfg-if",
 "crossbeam-utils",
+ "memoffset 0.8.0",
+ "scopeguard",
 ]

 [[package]]
@@ -1599,7 +1596,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
 dependencies = [
 "cfg-if",
- "hashbrown 0.14.5",
+ "hashbrown 0.14.0",
 "lock_api",
 "once_cell",
 "parking_lot_core 0.9.8",
@@ -2000,27 +1997,6 @@ dependencies = [
 "percent-encoding",
 ]

-[[package]]
-name = "framed-websockets"
-version = "0.1.0"
-source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
-dependencies = [
- "base64 0.21.1",
- "bytemuck",
- "bytes",
- "futures-core",
- "futures-sink",
- "http-body-util",
- "hyper 1.2.0",
- "hyper-util",
- "pin-project",
- "rand 0.8.5",
- "sha1",
- "thiserror",
- "tokio",
- "tokio-util",
-]
-
 [[package]]
 name = "fs2"
 version = "0.4.3"
@@ -2273,17 +2249,6 @@ version = "1.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"

-[[package]]
-name = "half"
-version = "2.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
-dependencies = [
- "cfg-if",
- "crunchy",
- "num-traits",
-]
-
 [[package]]
 name = "hash32"
 version = "0.3.1"
@@ -2310,9 +2275,9 @@ dependencies = [

 [[package]]
 name = "hashbrown"
-version = "0.14.5"
+version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
 dependencies = [
 "ahash",
 "allocator-api2",
@@ -2320,11 +2285,11 @@ dependencies = [

 [[package]]
 name = "hashlink"
-version = "0.9.1"
+version = "0.8.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
+checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
 dependencies = [
- "hashbrown 0.14.5",
+ "hashbrown 0.14.0",
 ]

 [[package]]
@@ -2633,6 +2598,21 @@ dependencies = [
 "tokio-native-tls",
 ]

+[[package]]
+name = "hyper-tungstenite"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a343d17fe7885302ed7252767dc7bb83609a874b6ff581142241ec4b73957ad"
+dependencies = [
+ "http-body-util",
+ "hyper 1.2.0",
+ "hyper-util",
+ "pin-project-lite",
+ "tokio",
+ "tokio-tungstenite 0.21.0",
+ "tungstenite 0.21.0",
+]
+
 [[package]]
 name = "hyper-util"
 version = "0.1.3"
@@ -2710,7 +2690,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
 dependencies = [
 "equivalent",
- "hashbrown 0.14.5",
+ "hashbrown 0.14.0",
 ]

 [[package]]
@@ -2972,7 +2952,7 @@ version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc"
 dependencies = [
- "hashbrown 0.14.5",
+ "hashbrown 0.14.0",
 ]

 [[package]]
@@ -3025,7 +3005,7 @@ checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
 dependencies = [
 "bytes",
 "crossbeam-utils",
- "hashbrown 0.14.5",
+ "hashbrown 0.14.0",
 "itoa",
 "lasso",
 "measured-derive",
@@ -3587,7 +3567,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
 dependencies = [
 "dlv-list",
- "hashbrown 0.14.5",
+ "hashbrown 0.14.0",
 ]

 [[package]]
@@ -3908,14 +3888,13 @@ dependencies = [

 [[package]]
 name = "parquet"
-version = "51.0.0"
-source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
+version = "49.0.0"
+source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
 dependencies = [
 "ahash",
 "bytes",
 "chrono",
- "half 2.4.1",
- "hashbrown 0.14.5",
+ "hashbrown 0.14.0",
 "num",
 "num-bigint",
 "paste",
@@ -3923,13 +3902,12 @@ dependencies = [
 "thrift",
 "twox-hash",
 "zstd",
- "zstd-sys",
 ]

 [[package]]
 name = "parquet_derive"
-version = "51.0.0"
-source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
+version = "49.0.0"
+source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
 dependencies = [
 "parquet",
 "proc-macro2",
@@ -3956,9 +3934,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"

 [[package]]
 name = "pbkdf2"
-version = "0.12.2"
+version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
+checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
 dependencies = [
 "digest",
 "hmac",
@@ -4381,7 +4359,6 @@ dependencies = [
 name = "proxy"
 version = "0.1.0"
 dependencies = [
- "ahash",
 "anyhow",
 "async-compression",
 "async-trait",
@@ -4398,14 +4375,12 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
- "crossbeam-deque",
 "dashmap",
 "env_logger",
 "fallible-iterator",
- "framed-websockets",
 "futures",
 "git-version",
- "hashbrown 0.14.5",
+ "hashbrown 0.13.2",
 "hashlink",
 "hex",
 "hmac",
@@ -4415,6 +4390,7 @@ dependencies = [
 "humantime",
 "hyper 0.14.26",
 "hyper 1.2.0",
+ "hyper-tungstenite",
 "hyper-util",
 "indexmap 2.0.1",
 "ipnet",
@@ -4459,6 +4435,7 @@ dependencies = [
 "smol_str",
 "socket2 0.5.5",
 "subtle",
+ "sync_wrapper",
 "task-local-extensions",
 "thiserror",
 "tikv-jemalloc-ctl",
@@ -4467,7 +4444,6 @@ dependencies = [
 "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls 0.25.0",
- "tokio-tungstenite",
 "tokio-util",
 "tower-service",
 "tracing",
@@ -5976,7 +5952,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
 [[package]]
 name = "svg_fmt"
 version = "0.4.2"
-source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4"
+source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#c1820b28664b5df68de7f043fccf2ed5d67b6ae8"

 [[package]]
 name = "syn"
@@ -6404,7 +6380,19 @@ dependencies = [
 "futures-util",
 "log",
 "tokio",
- "tungstenite",
+ "tungstenite 0.20.1",
+]
+
+[[package]]
+name = "tokio-tungstenite"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38"
+dependencies = [
+ "futures-util",
+ "log",
+ "tokio",
+ "tungstenite 0.21.0",
 ]

 [[package]]
@@ -6418,7 +6406,7 @@ dependencies = [
 "futures-io",
 "futures-sink",
 "futures-util",
- "hashbrown 0.14.5",
+ "hashbrown 0.14.0",
 "pin-project-lite",
 "tokio",
 "tracing",
@@ -6700,6 +6688,25 @@ dependencies = [
 "utf-8",
 ]

+[[package]]
+name = "tungstenite"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1"
+dependencies = [
+ "byteorder",
+ "bytes",
+ "data-encoding",
+ "http 1.1.0",
+ "httparse",
+ "log",
+ "rand 0.8.5",
+ "sha1",
+ "thiserror",
+ "url",
+ "utf-8",
+]
+
 [[package]]
 name = "twox-hash"
 version = "1.6.3"
@@ -7470,7 +7477,6 @@ dependencies = [
 name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
- "ahash",
 "anyhow",
 "aws-config",
 "aws-runtime",
@@ -7496,7 +7502,7 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "getrandom 0.2.11",
- "hashbrown 0.14.5",
+ "hashbrown 0.14.0",
 "hex",
 "hmac",
 "hyper 0.14.26",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,7 +41,6 @@ license = "Apache-2.0"

 ## All dependency versions, used in the project
 [workspace.dependencies]
-ahash = "0.8"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
@@ -75,7 +74,6 @@ clap = { version = "4.0", features = ["derive"] }
 comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
-crossbeam-deque = "0.8.5"
 crossbeam-utils = "0.8.5"
 dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
@@ -83,14 +81,13 @@ enum-map = "2.4.2"
 enumset = "1.0.12"
 fail = "0.5.0"
 fallible-iterator = "0.2"
-framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
 fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
 futures-util = "0.3"
 git-version = "0.3"
-hashbrown = "0.14"
-hashlink = "0.9.1"
+hashbrown = "0.13"
+hashlink = "0.8.4"
 hdrhistogram = "7.5.2"
 hex = "0.4"
 hex-literal = "0.4"
@@ -101,7 +98,7 @@ http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-tokio-tungstenite = "0.20.0"
+hyper-tungstenite = "0.13.0"
 indexmap = "2"
 inotify = "0.10.2"
 ipnet = "2.9.0"
@@ -124,8 +121,8 @@ opentelemetry = "0.20.0"
 opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
-parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
-parquet_derive = "51.0.0"
+parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
+parquet_derive = "49.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.14"
@@ -161,8 +158,8 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
-# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
-svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
+# https://github.com/nical/rust_debug/pull/4
+svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
@@ -246,8 +243,8 @@ tonic-build = "0.9"
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 # bug fixes for UUID
-parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
-parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
+parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
+parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }

 ################# Binary contents sections

--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -87,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
    && rm awscliv2.zip

 # Mold: A Modern Linker
-ENV MOLD_VERSION v2.31.0
+ENV MOLD_VERSION v2.4.0
 RUN set -e \
    && git clone https://github.com/rui314/mold.git \
    && mkdir mold/build \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,17 +241,11 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-COPY patches/pgvector.patch /pgvector.patch
-
-# By default, pgvector Makefile uses `-march=native`. We don't want that, 
-# because we build the images on different machines than where we run them.
-# Pass OPTFLAGS="" to remove it.
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
-    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
+    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
-    patch -p1 < /pgvector.patch && \
-    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control

 #########################################################################################
--- a/README.md
+++ b/README.md
@@ -1,6 +1,4 @@
-[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech)
-
-
+[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech)

 # Neon

--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -27,12 +27,10 @@ reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-util.workspace = true
-tokio-stream.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
-thiserror.workspace = true
 url.workspace = true

 compute_api.workspace = true
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -1,116 +0,0 @@
-use compute_api::{
-    responses::CatalogObjects,
-    spec::{Database, Role},
-};
-use futures::Stream;
-use postgres::{Client, NoTls};
-use std::{path::Path, process::Stdio, result::Result, sync::Arc};
-use tokio::{
-    io::{AsyncBufReadExt, BufReader},
-    process::Command,
-    task,
-};
-use tokio_stream::{self as stream, StreamExt};
-use tokio_util::codec::{BytesCodec, FramedRead};
-use tracing::warn;
-
-use crate::{
-    compute::ComputeNode,
-    pg_helpers::{get_existing_dbs, get_existing_roles},
-};
-
-pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
-    let connstr = compute.connstr.clone();
-    task::spawn_blocking(move || {
-        let mut client = Client::connect(connstr.as_str(), NoTls)?;
-        let roles: Vec<Role>;
-        {
-            let mut xact = client.transaction()?;
-            roles = get_existing_roles(&mut xact)?;
-        }
-        let databases: Vec<Database> = get_existing_dbs(&mut client)?.values().cloned().collect();
-
-        Ok(CatalogObjects { roles, databases })
-    })
-    .await?
-}
-
-#[derive(Debug, thiserror::Error)]
-pub enum SchemaDumpError {
-    #[error("Database does not exist.")]
-    DatabaseDoesNotExist,
-    #[error("Failed to execute pg_dump.")]
-    IO(#[from] std::io::Error),
-}
-
-// It uses the pg_dump utility to dump the schema of the specified database.
-// The output is streamed back to the caller and supposed to be streamed via HTTP.
-//
-// Before return the result with the output, it checks that pg_dump produced any output.
-// If not, it tries to parse the stderr output to determine if the database does not exist
-// and special error is returned.
-//
-// To make sure that the process is killed when the caller drops the stream, we use tokio kill_on_drop feature.
-pub async fn get_database_schema(
-    compute: &Arc<ComputeNode>,
-    dbname: &str,
-) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>>, SchemaDumpError> {
-    let pgbin = &compute.pgbin;
-    let basepath = Path::new(pgbin).parent().unwrap();
-    let pgdump = basepath.join("pg_dump");
-    let mut connstr = compute.connstr.clone();
-    connstr.set_path(dbname);
-    let mut cmd = Command::new(pgdump)
-        .arg("--schema-only")
-        .arg(connstr.as_str())
-        .stdout(Stdio::piped())
-        .stderr(Stdio::piped())
-        .kill_on_drop(true)
-        .spawn()?;
-
-    let stdout = cmd.stdout.take().ok_or_else(|| {
-        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
-    })?;
-
-    let stderr = cmd.stderr.take().ok_or_else(|| {
-        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
-    })?;
-
-    let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
-    let stderr_reader = BufReader::new(stderr);
-
-    let first_chunk = match stdout_reader.next().await {
-        Some(Ok(bytes)) if !bytes.is_empty() => bytes,
-        Some(Err(e)) => {
-            return Err(SchemaDumpError::IO(e));
-        }
-        _ => {
-            let mut lines = stderr_reader.lines();
-            if let Some(line) = lines.next_line().await? {
-                if line.contains(&format!("FATAL:  database \"{}\" does not exist", dbname)) {
-                    return Err(SchemaDumpError::DatabaseDoesNotExist);
-                }
-                warn!("pg_dump stderr: {}", line)
-            }
-            tokio::spawn(async move {
-                while let Ok(Some(line)) = lines.next_line().await {
-                    warn!("pg_dump stderr: {}", line)
-                }
-            });
-
-            return Err(SchemaDumpError::IO(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "failed to start pg_dump",
-            )));
-        }
-    };
-    let initial_stream = stream::once(Ok(first_chunk.freeze()));
-    // Consume stderr and log warnings
-    tokio::spawn(async move {
-        let mut lines = stderr_reader.lines();
-        while let Ok(Some(line)) = lines.next_line().await {
-            warn!("pg_dump stderr: {}", line)
-        }
-    });
-    Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze()))))
-}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -5,21 +5,17 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;

-use crate::catalog::SchemaDumpError;
-use crate::catalog::{get_database_schema, get_dbs_and_roles};
 use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};

 use anyhow::Result;
-use hyper::header::CONTENT_TYPE;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use tokio::task;
 use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;
-use utils::http::request::must_get_query_param;

 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
    ComputeStatusResponse {
@@ -137,34 +133,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

-        (&Method::GET, "/dbs_and_roles") => {
-            info!("serving /dbs_and_roles GET request",);
-            match get_dbs_and_roles(compute).await {
-                Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
-                Err(_) => {
-                    render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
-                }
-            }
-        }
-
-        (&Method::GET, "/database_schema") => {
-            let database = match must_get_query_param(&req, "database") {
-                Err(e) => return e.into_response(),
-                Ok(database) => database,
-            };
-            info!("serving /database_schema GET request with database: {database}",);
-            match get_database_schema(compute, &database).await {
-                Ok(res) => render_plain(Body::wrap_stream(res)),
-                Err(SchemaDumpError::DatabaseDoesNotExist) => {
-                    render_json_error("database does not exist", StatusCode::NOT_FOUND)
-                }
-                Err(e) => {
-                    error!("can't get schema dump: {}", e);
-                    render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
-                }
-            }
-        }
-
        // download extension files from remote extension storage on demand
        (&Method::POST, route) if route.starts_with("/extension_server/") => {
            info!("serving {:?} POST request", route);
@@ -335,25 +303,10 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
    };
    Response::builder()
        .status(status)
-        .header(CONTENT_TYPE, "application/json")
        .body(Body::from(serde_json::to_string(&error).unwrap()))
        .unwrap()
 }

-fn render_json(body: Body) -> Response<Body> {
-    Response::builder()
-        .header(CONTENT_TYPE, "application/json")
-        .body(body)
-        .unwrap()
-}
-
-fn render_plain(body: Body) -> Response<Body> {
-    Response::builder()
-        .header(CONTENT_TYPE, "text/plain")
-        .body(body)
-        .unwrap()
-}
-
 async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
    {
        let mut state = compute.state.lock().unwrap();
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -68,51 +68,6 @@ paths:
              schema:
                $ref: "#/components/schemas/Info"

-  /dbs_and_roles:
-    get:
-      tags:
-        - Info
-      summary: Get databases and roles in the catalog.
-      description: ""
-      operationId: getDbsAndRoles
-      responses:
-        200:
-          description: Compute schema objects
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/DbsAndRoles"
-
-  /database_schema:
-    get:
-      tags:
-        - Info
-      summary: Get schema dump
-      parameters:
-        - name: database
-          in: query
-          description: Database name to dump.
-          required: true
-          schema:
-            type: string
-          example: "postgres"
-      description: Get schema dump in SQL format.
-      operationId: getDatabaseSchema
-      responses:
-        200:
-          description: Schema dump
-          content:
-            text/plain:
-              schema:
-                type: string
-                description: Schema dump in SQL format.
-        404:
-          description: Non existing database.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/GenericError"
-
  /check_writability:
    post:
      tags:
@@ -274,73 +229,6 @@ components:
        num_cpus:
          type: integer

-    DbsAndRoles:
-      type: object
-      description: Databases and Roles
-      required:
-        - roles
-        - databases
-      properties:
-        roles:
-          type: array
-          items:
-            $ref: "#/components/schemas/Role"
-        databases:
-          type: array
-          items:
-            $ref: "#/components/schemas/Database"
-
-    Database:
-      type: object
-      description: Database
-      required:
-        - name
-        - owner
-        - restrict_conn
-        - invalid
-      properties:
-        name:
-          type: string
-        owner:
-          type: string
-        options:
-          type: array
-          items:
-            $ref: "#/components/schemas/GenericOption"
-        restrict_conn:
-          type: boolean
-        invalid:
-          type: boolean
-
-    Role:
-      type: object
-      description: Role
-      required:
-        - name
-      properties:
-        name:
-          type: string
-        encrypted_password:
-          type: string
-        options:
-          type: array
-          items:
-            $ref: "#/components/schemas/GenericOption"
-
-    GenericOption:
-      type: object
-      description: Schema Generic option
-      required:
-        - name
-        - vartype
-      properties:
-        name:
-          type: string
-        value:
-          type: string
-        vartype:
-          type: string
-
    ComputeState:
      type: object
      required:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -8,7 +8,6 @@ pub mod configurator;
 pub mod http;
 #[macro_use]
 pub mod logger;
-pub mod catalog;
 pub mod compute;
 pub mod extension_server;
 pub mod monitor;
--- a/compute_tools/src/swap.rs
+++ b/compute_tools/src/swap.rs
@@ -1,5 +1,3 @@
-use std::path::Path;
-
 use anyhow::{anyhow, Context};
 use tracing::warn;

@@ -19,24 +17,17 @@ pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
        .arg(size_bytes.to_string())
        .spawn();

+    if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
+        warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
+        return Ok(());
+    }
+
    child_result
        .context("spawn() failed")
        .and_then(|mut child| child.wait().context("wait() failed"))
        .and_then(|status| match status.success() {
            true => Ok(()),
-            false => {
-                // The command failed. Maybe it was because the resize-swap file doesn't exist?
-                // The --once flag causes it to delete itself on success so we don't disable swap
-                // while postgres is running; maybe this is fine.
-                match Path::new(RESIZE_SWAP_BIN).try_exists() {
-                    Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")),
-                    // The path doesn't exist; we're actually ok 
-                    Ok(false) => {
-                        warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
-                        Ok(())
-                    },
-                }
-            }
+            false => Err(anyhow!("process exited with {status}")),
        })
        // wrap any prior error with the overall context that we couldn't run the command
        .with_context(|| {
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -152,9 +152,6 @@ pub struct NeonStorageControllerConf {
    /// Heartbeat timeout before marking a node offline
    #[serde(with = "humantime_serde")]
    pub max_unavailable: Duration,
-
-    /// Threshold for auto-splitting a tenant into shards
-    pub split_threshold: Option<u64>,
 }

 impl NeonStorageControllerConf {
@@ -167,7 +164,6 @@ impl Default for NeonStorageControllerConf {
    fn default() -> Self {
        Self {
            max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
-            split_threshold: None,
        }
    }
 }
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -243,13 +243,9 @@ impl StorageController {
                anyhow::bail!("initdb failed with status {status}");
            }

-            // Write a minimal config file:
-            // - Specify the port, since this is chosen dynamically
-            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
-            //   the storage controller we don't want a slow local disk to interfere with that.
            tokio::fs::write(
                &pg_data_path.join("postgresql.conf"),
-                format!("port = {}\nfsync=off\n", self.postgres_port),
+                format!("port = {}", self.postgres_port),
            )
            .await?;
        };
@@ -309,10 +305,6 @@ impl StorageController {
            ));
        }

-        if let Some(split_threshold) = self.config.split_threshold.as_ref() {
-            args.push(format!("--split-threshold={split_threshold}"))
-        }
-
        background_process::start_process(
            COMMAND,
            &self.env.base_data_dir,
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -1,4 +1,4 @@
-ARG REPOSITORY=neondatabase
+ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
 ARG COMPUTE_IMAGE=compute-node-v14
 ARG TAG=latest

--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -8,6 +8,8 @@
 # Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
 # to verify custom image builds (e.g pre-published ones).

+# XXX: Currently does not work on M1 Macs: only x86_64 Docker images are built, and the M1 Docker emulation layer has no seccomp support.
+
 set -eux -o pipefail

 SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -3,7 +3,7 @@
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize, Serializer};

-use crate::spec::{ComputeSpec, Database, Role};
+use crate::spec::ComputeSpec;

 #[derive(Serialize, Debug, Deserialize)]
 pub struct GenericAPIError {
@@ -113,12 +113,6 @@ pub struct ComputeMetrics {
    pub total_ext_download_size: u64,
 }

-#[derive(Clone, Debug, Default, Serialize)]
-pub struct CatalogObjects {
-    pub roles: Vec<Role>,
-    pub databases: Vec<Database>,
-}
-
 /// Response of the `/computes/{compute_id}/spec` control-plane API.
 /// This is not actually a compute API response, so consider moving
 /// to a different place.
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -307,7 +307,7 @@ impl KeySpace {
    }

    /// Merge another keyspace into the current one.
-    /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
+    /// Note: the keyspaces must not overlap (enforced via assertions)
    pub fn merge(&mut self, other: &KeySpace) {
        let all_ranges = self
            .ranges
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -9,7 +9,7 @@ use std::{
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
-    sync::atomic::AtomicUsize,
+    str::FromStr,
    time::{Duration, SystemTime},
 };

@@ -161,22 +161,6 @@ impl std::fmt::Debug for TenantState {
    }
 }

-/// A temporary lease to a specific lsn inside a timeline.
-/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`.
-#[serde_as]
-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
-pub struct LsnLease {
-    #[serde_as(as = "SystemTimeAsRfc3339Millis")]
-    pub valid_until: SystemTime,
-}
-
-serde_with::serde_conv!(
-    SystemTimeAsRfc3339Millis,
-    SystemTime,
-    |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(),
-    |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
-);
-
 /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum ActivatingFrom {
@@ -305,7 +289,7 @@ pub struct TenantConfig {
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
    // defer parsing compaction_algorithm, like eviction_policy
-    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
+    pub compaction_algorithm: Option<CompactionAlgorithm>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,
@@ -324,100 +308,28 @@ pub struct TenantConfig {
    pub switch_aux_file_policy: Option<AuxFilePolicy>,
 }

-/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
-/// tenant config. When the first aux file written, the policy will be persisted in the
-/// `index_part.json` file and has a limited migration path.
-///
-/// Currently, we only allow the following migration path:
-///
-/// Unset -> V1
-///       -> V2
-///       -> CrossValidation -> V2
-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub enum AuxFilePolicy {
-    /// V1 aux file policy: store everything in AUX_FILE_KEY
-    #[strum(ascii_case_insensitive)]
    V1,
-    /// V2 aux file policy: store in the AUX_FILE keyspace
-    #[strum(ascii_case_insensitive)]
    V2,
-    /// Cross validation runs both formats on the write path and does validation
-    /// on the read path.
-    #[strum(ascii_case_insensitive)]
    CrossValidation,
 }

-impl AuxFilePolicy {
-    pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
-        matches!(
-            (from, to),
-            (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
-        )
-    }
+impl FromStr for AuxFilePolicy {
+    type Err = anyhow::Error;

-    /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
-    pub fn default_tenant_config() -> Self {
-        Self::V1
-    }
-}
-
-/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
-pub struct AtomicAuxFilePolicy(AtomicUsize);
-
-impl AtomicAuxFilePolicy {
-    pub fn new(policy: Option<AuxFilePolicy>) -> Self {
-        Self(AtomicUsize::new(
-            policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
-        ))
-    }
-
-    pub fn load(&self) -> Option<AuxFilePolicy> {
-        match self.0.load(std::sync::atomic::Ordering::Acquire) {
-            0 => None,
-            other => Some(AuxFilePolicy::from_usize(other)),
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let s = s.to_lowercase();
+        if s == "v1" {
+            Ok(Self::V1)
+        } else if s == "v2" {
+            Ok(Self::V2)
+        } else if s == "crossvalidation" || s == "cross_validation" {
+            Ok(Self::CrossValidation)
+        } else {
+            anyhow::bail!("cannot parse {} to aux file policy", s)
        }
    }
-
-    pub fn store(&self, policy: Option<AuxFilePolicy>) {
-        self.0.store(
-            policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
-            std::sync::atomic::Ordering::Release,
-        );
-    }
-}
-
-impl AuxFilePolicy {
-    pub fn to_usize(self) -> usize {
-        match self {
-            Self::V1 => 1,
-            Self::CrossValidation => 2,
-            Self::V2 => 3,
-        }
-    }
-
-    pub fn try_from_usize(this: usize) -> Option<Self> {
-        match this {
-            1 => Some(Self::V1),
-            2 => Some(Self::CrossValidation),
-            3 => Some(Self::V2),
-            _ => None,
-        }
-    }
-
-    pub fn from_usize(this: usize) -> Self {
-        Self::try_from_usize(this).unwrap()
-    }
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -438,30 +350,13 @@ impl EvictionPolicy {
    }
 }

-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "kind")]
 pub enum CompactionAlgorithm {
-    #[strum(disabled)]
-    NotSpecified,
    Legacy,
    Tiered,
 }

-#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)]
-pub struct CompactionAlgorithmSettings {
-    pub kind: CompactionAlgorithm,
-}
-
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
    #[serde(with = "humantime_serde")]
@@ -709,9 +604,6 @@ pub struct TimelineInfo {
    pub state: TimelineState,

    pub walreceiver_status: String,
-
-    /// The last aux file policy being used on this timeline
-    pub last_aux_file_policy: Option<AuxFilePolicy>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -818,8 +710,6 @@ pub enum HistoricLayerInfo {
        lsn_end: Lsn,
        remote: bool,
        access_stats: LayerAccessStats,
-
-        l0: bool,
    },
    Image {
        layer_file_name: String,
@@ -855,16 +745,6 @@ impl HistoricLayerInfo {
        };
        *field = value;
    }
-    pub fn layer_file_size(&self) -> u64 {
-        match self {
-            HistoricLayerInfo::Delta {
-                layer_file_size, ..
-            } => *layer_file_size,
-            HistoricLayerInfo::Image {
-                layer_file_size, ..
-            } => *layer_file_size,
-        }
-    }
 }

 #[derive(Debug, Serialize, Deserialize)]
@@ -872,16 +752,6 @@ pub struct DownloadRemoteLayersTaskSpawnRequest {
    pub max_concurrent_downloads: NonZeroUsize,
 }

-#[derive(Debug, Serialize, Deserialize)]
-pub struct IngestAuxFilesRequest {
-    pub aux_files: HashMap<String, String>,
-}
-
-#[derive(Debug, Serialize, Deserialize)]
-pub struct ListAuxFilesRequest {
-    pub lsn: Lsn,
-}
-
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct DownloadRemoteLayersTaskInfo {
    pub task_id: String,
@@ -906,6 +776,9 @@ pub struct TimelineGcRequest {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalRedoManagerProcessStatus {
    pub pid: u32,
+    /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
+    /// `ProcessKind` is a transitory thing, so it has no enum representation in `pageserver_api`.
+    pub kind: Cow<'static, str>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -944,55 +817,6 @@ pub struct TenantScanRemoteStorageResponse {
    pub shards: Vec<TenantScanRemoteStorageShard>,
 }

-#[derive(Serialize, Deserialize, Debug, Clone)]
-#[serde(rename_all = "snake_case")]
-pub enum TenantSorting {
-    ResidentSize,
-    MaxLogicalSize,
-}
-
-impl Default for TenantSorting {
-    fn default() -> Self {
-        Self::ResidentSize
-    }
-}
-
-#[derive(Serialize, Deserialize, Debug, Clone)]
-pub struct TopTenantShardsRequest {
-    // How would you like to sort the tenants?
-    pub order_by: TenantSorting,
-
-    // How many results?
-    pub limit: usize,
-
-    // Omit tenants with more than this many shards (e.g. if this is the max number of shards
-    // that the caller would ever split to)
-    pub where_shards_lt: Option<ShardCount>,
-
-    // Omit tenants where the ordering metric is less than this (this is an optimization to
-    // let us quickly exclude numerous tiny shards)
-    pub where_gt: Option<u64>,
-}
-
-#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
-pub struct TopTenantShardItem {
-    pub id: TenantShardId,
-
-    /// Total size of layers on local disk for all timelines in this tenant
-    pub resident_size: u64,
-
-    /// Total size of layers in remote storage for all timelines in this tenant
-    pub physical_size: u64,
-
-    /// The largest logical size of a timeline within this tenant
-    pub max_logical_size: u64,
-}
-
-#[derive(Serialize, Deserialize, Debug, Default)]
-pub struct TopTenantShardsResponse {
-    pub shards: Vec<TopTenantShardItem>,
-}
-
 pub mod virtual_file {
    #[derive(
        Copy,
@@ -1418,7 +1242,6 @@ impl PagestreamBeMessage {
 #[cfg(test)]
 mod tests {
    use serde_json::json;
-    use std::str::FromStr;

    use super::*;

@@ -1626,69 +1449,4 @@ mod tests {
            assert_eq!(actual, expected, "example on {line}");
        }
    }
-
-    #[test]
-    fn test_aux_file_migration_path() {
-        assert!(AuxFilePolicy::is_valid_migration_path(
-            None,
-            AuxFilePolicy::V1
-        ));
-        assert!(AuxFilePolicy::is_valid_migration_path(
-            None,
-            AuxFilePolicy::V2
-        ));
-        assert!(AuxFilePolicy::is_valid_migration_path(
-            None,
-            AuxFilePolicy::CrossValidation
-        ));
-        // Self-migration is not a valid migration path, and the caller should handle it by itself.
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V1),
-            AuxFilePolicy::V1
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V2),
-            AuxFilePolicy::V2
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::CrossValidation),
-            AuxFilePolicy::CrossValidation
-        ));
-        // Migrations not allowed
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::CrossValidation),
-            AuxFilePolicy::V1
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V1),
-            AuxFilePolicy::V2
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V2),
-            AuxFilePolicy::V1
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V2),
-            AuxFilePolicy::CrossValidation
-        ));
-        assert!(!AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::V1),
-            AuxFilePolicy::CrossValidation
-        ));
-        // Migrations allowed
-        assert!(AuxFilePolicy::is_valid_migration_path(
-            Some(AuxFilePolicy::CrossValidation),
-            AuxFilePolicy::V2
-        ));
-    }
-
-    #[test]
-    fn test_aux_parse() {
-        assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
-        assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
-        assert_eq!(
-            AuxFilePolicy::from_str("cross-validation").unwrap(),
-            AuxFilePolicy::CrossValidation
-        );
-    }
 }
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -125,7 +125,7 @@ impl ShardCount {

    /// `v` may be zero, or the number of shards in the tenant.  `v` is what
    /// [`Self::literal`] would return.
-    pub const fn new(val: u8) -> Self {
+    pub fn new(val: u8) -> Self {
        Self(val)
    }
 }
@@ -559,14 +559,6 @@ impl ShardIdentity {
        }
    }

-    /// Obtains the shard number and count combined into a `ShardIndex`.
-    pub fn shard_index(&self) -> ShardIndex {
-        ShardIndex {
-            shard_count: self.count,
-            shard_number: self.number,
-        }
-    }
-
    pub fn shard_slug(&self) -> String {
        if self.count > ShardCount(0) {
            format!("-{:02x}{:02x}", self.number.0, self.count.0)
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -820,11 +820,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        Ok(ProcessMsgResult::Continue)
    }

-    /// - Log as info/error result of handling COPY stream and send back
-    ///   ErrorResponse if that makes sense.
-    /// - Shutdown the stream if we got Terminate.
-    /// - Then close the connection because we don't handle exiting from COPY
-    ///   stream normally.
+    /// Log as info/error result of handling COPY stream and send back
+    /// ErrorResponse if that makes sense. Shutdown the stream if we got
+    /// Terminate. TODO: transition into waiting for Sync msg if we initiate the
+    /// close.
    pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
        use CopyStreamHandlerEnd::*;

@@ -850,6 +849,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            }
        }

+        if let Terminate = &end {
+            self.state = ProtoState::Closed;
+        }
+
        let err_to_send_and_errcode = match &end {
            ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
            Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
@@ -879,12 +882,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                error!("failed to send ErrorResponse: {}", ee);
            }
        }
-
-        // Proper COPY stream finishing to continue using the connection is not
-        // implemented at the server side (we don't need it so far). To prevent
-        // further usages of the connection, close it.
-        self.framed.shutdown().await.ok();
-        self.state = ProtoState::Closed;
    }
 }

--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -178,13 +178,6 @@ impl PgConnectionConfig {
    }
 }

-impl fmt::Display for PgConnectionConfig {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        // The password is intentionally hidden and not part of this display string.
-        write!(f, "postgresql://{}:{}", self.host, self.port)
-    }
-}
-
 impl fmt::Debug for PgConnectionConfig {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")`
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
            // Is there enough space on the page for another logical message and an
            // XLOG_SWITCH? If not, start over.
            let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
-            if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
+            if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
                continue;
            }

@@ -373,29 +373,31 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
                &[&(repeats as i32)],
            )?;
-            info!(
-                "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
-                client.pg_current_wal_insert_lsn()?,
-                XLOG_SIZE_OF_XLOG_RECORD
-            );
-
-            // Emit the XLOG_SWITCH
-            let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-            let xlog_switch_record_end: PgLsn =
-                client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
-
-            if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
-                != XLOG_SIZE_OF_XLOG_SHORT_PHD
-            {
-                warn!(
-                    "XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating",
-                    xlog_switch_record_end,
-                    u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
-                );
-                continue;
-            }
-            return Ok(vec![before_xlog_switch, xlog_switch_record_end]);
+            break;
        }
+        info!(
+            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
+            client.pg_current_wal_insert_lsn()?,
+            XLOG_SIZE_OF_XLOG_RECORD
+        );
+
+        // Emit the XLOG_SWITCH
+        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
+        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let next_segment = PgLsn::from(0x0200_0000);
+        ensure!(
+            xlog_switch_record_end < next_segment,
+            "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
+            xlog_switch_record_end,
+            next_segment
+        );
+        ensure!(
+            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
+            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
+            xlog_switch_record_end,
+            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
+        );
+        Ok(vec![before_xlog_switch, xlog_switch_record_end])
    }
 }

--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -50,9 +50,6 @@ pub struct SkTimelineInfo {
    pub safekeeper_connstr: Option<String>,
    #[serde(default)]
    pub http_connstr: Option<String>,
-    // Minimum of all active RO replicas flush LSN
-    #[serde(default = "lsn_invalid")]
-    pub standby_horizon: Lsn,
 }

 #[derive(Debug, Clone, Deserialize, Serialize)]
--- a/libs/utils/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -9,33 +9,6 @@ use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
 use tracing::*;

-/// Declare a failpoint that can use the `pause` failpoint action.
-/// We don't want to block the executor thread, hence, spawn_blocking + await.
-#[macro_export]
-macro_rules! pausable_failpoint {
-    ($name:literal) => {
-        if cfg!(feature = "testing") {
-            tokio::task::spawn_blocking({
-                let current = tracing::Span::current();
-                move || {
-                    let _entered = current.entered();
-                    tracing::info!("at failpoint {}", $name);
-                    fail::fail_point!($name);
-                }
-            })
-            .await
-            .expect("spawn_blocking");
-        }
-    };
-    ($name:literal, $cond:expr) => {
-        if cfg!(feature = "testing") {
-            if $cond {
-                pausable_failpoint!($name)
-            }
-        }
-    };
-}
-
 /// use with fail::cfg("$name", "return(2000)")
 ///
 /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -135,8 +135,7 @@ impl Gate {
        let started_at = std::time::Instant::now();
        let mut do_close = std::pin::pin!(self.do_close());

-        // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms
-        let nag_after = Duration::from_millis(100);
+        let nag_after = Duration::from_secs(1);

        let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
            return;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -496,9 +496,9 @@ mod tests {
                // TODO: When updating Postgres versions, this test will cause
                // problems. Postgres version in message needs updating.
                //
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -30,27 +30,47 @@
 //! 2024-04-15 on i3en.3xlarge
 //!
 //! ```text
-//! short/1           time:   [24.584 µs 24.737 µs 24.922 µs]
-//! short/2           time:   [33.479 µs 33.660 µs 33.888 µs]
-//! short/4           time:   [42.713 µs 43.046 µs 43.440 µs]
-//! short/8           time:   [71.814 µs 72.478 µs 73.240 µs]
-//! short/16          time:   [132.73 µs 134.45 µs 136.22 µs]
-//! short/32          time:   [258.31 µs 260.73 µs 263.27 µs]
-//! short/64          time:   [511.61 µs 514.44 µs 517.51 µs]
-//! short/128         time:   [992.64 µs 998.23 µs 1.0042 ms]
-//! medium/1          time:   [110.11 µs 110.50 µs 110.96 µs]
-//! medium/2          time:   [153.06 µs 153.85 µs 154.99 µs]
-//! medium/4          time:   [317.51 µs 319.92 µs 322.85 µs]
-//! medium/8          time:   [638.30 µs 644.68 µs 652.12 µs]
-//! medium/16         time:   [1.2651 ms 1.2773 ms 1.2914 ms]
-//! medium/32         time:   [2.5117 ms 2.5410 ms 2.5720 ms]
-//! medium/64         time:   [4.8088 ms 4.8555 ms 4.9047 ms]
-//! medium/128        time:   [8.8311 ms 8.9849 ms 9.1263 ms]
+//! async-short/1           time:   [24.584 µs 24.737 µs 24.922 µs]
+//! async-short/2           time:   [33.479 µs 33.660 µs 33.888 µs]
+//! async-short/4           time:   [42.713 µs 43.046 µs 43.440 µs]
+//! async-short/8           time:   [71.814 µs 72.478 µs 73.240 µs]
+//! async-short/16          time:   [132.73 µs 134.45 µs 136.22 µs]
+//! async-short/32          time:   [258.31 µs 260.73 µs 263.27 µs]
+//! async-short/64          time:   [511.61 µs 514.44 µs 517.51 µs]
+//! async-short/128         time:   [992.64 µs 998.23 µs 1.0042 ms]
+//! async-medium/1          time:   [110.11 µs 110.50 µs 110.96 µs]
+//! async-medium/2          time:   [153.06 µs 153.85 µs 154.99 µs]
+//! async-medium/4          time:   [317.51 µs 319.92 µs 322.85 µs]
+//! async-medium/8          time:   [638.30 µs 644.68 µs 652.12 µs]
+//! async-medium/16         time:   [1.2651 ms 1.2773 ms 1.2914 ms]
+//! async-medium/32         time:   [2.5117 ms 2.5410 ms 2.5720 ms]
+//! async-medium/64         time:   [4.8088 ms 4.8555 ms 4.9047 ms]
+//! async-medium/128        time:   [8.8311 ms 8.9849 ms 9.1263 ms]
+//! sync-short/1            time:   [25.503 µs 25.626 µs 25.771 µs]
+//! sync-short/2            time:   [30.850 µs 31.013 µs 31.208 µs]
+//! sync-short/4            time:   [45.543 µs 45.856 µs 46.193 µs]
+//! sync-short/8            time:   [84.114 µs 84.639 µs 85.220 µs]
+//! sync-short/16           time:   [185.22 µs 186.15 µs 187.13 µs]
+//! sync-short/32           time:   [377.43 µs 378.87 µs 380.46 µs]
+//! sync-short/64           time:   [756.49 µs 759.04 µs 761.70 µs]
+//! sync-short/128          time:   [1.4825 ms 1.4874 ms 1.4923 ms]
+//! sync-medium/1           time:   [105.66 µs 106.01 µs 106.43 µs]
+//! sync-medium/2           time:   [153.10 µs 153.84 µs 154.72 µs]
+//! sync-medium/4           time:   [327.13 µs 329.44 µs 332.27 µs]
+//! sync-medium/8           time:   [654.26 µs 658.73 µs 663.63 µs]
+//! sync-medium/16          time:   [1.2682 ms 1.2748 ms 1.2816 ms]
+//! sync-medium/32          time:   [2.4456 ms 2.4595 ms 2.4731 ms]
+//! sync-medium/64          time:   [4.6523 ms 4.6890 ms 4.7256 ms]
+//! sync-medium/128         time:   [8.7215 ms 8.8323 ms 8.9344 ms]
 //! ```

 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
-use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
+use pageserver::{
+    config::PageServerConf,
+    walrecord::NeonWalRecord,
+    walredo::{PostgresRedoManager, ProcessKind},
+};
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
    sync::Arc,
@@ -60,32 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};

 fn bench(c: &mut Criterion) {
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("short");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::short_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
+    for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
+        {
+            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+            for nclients in nclients {
+                let mut group = c.benchmark_group(format!("{process_kind}-short"));
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(nclients),
+                    &nclients,
+                    |b, nclients| {
+                        let redo_work = Arc::new(Request::short_input());
+                        b.iter_custom(|iters| {
+                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
+                        });
+                    },
+                );
+            }
        }
-    }
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("medium");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::medium_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
+
+        {
+            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+            for nclients in nclients {
+                let mut group = c.benchmark_group(format!("{process_kind}-medium"));
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(nclients),
+                    &nclients,
+                    |b, nclients| {
+                        let redo_work = Arc::new(Request::medium_input());
+                        b.iter_custom(|iters| {
+                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
+                        });
+                    },
+                );
+            }
        }
    }
 }
@@ -93,10 +120,16 @@ criterion::criterion_group!(benches, bench);
 criterion::criterion_main!(benches);

 // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
-fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
+fn bench_impl(
+    process_kind: ProcessKind,
+    redo_work: Arc<Request>,
+    n_redos: u64,
+    nclients: u64,
+) -> Duration {
    let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();

-    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
+    let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
+    conf.walredo_process_kind = process_kind;
    let conf = Box::leak(Box::new(conf));
    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());

@@ -125,13 +158,27 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
        });
    }

-    rt.block_on(async move {
+    let elapsed = rt.block_on(async move {
        let mut total_wallclock_time = Duration::ZERO;
        while let Some(res) = tasks.join_next().await {
            total_wallclock_time += res.unwrap();
        }
        total_wallclock_time
-    })
+    });
+
+    // consistency check to ensure process kind setting worked
+    if nredos_per_client > 0 {
+        assert_eq!(
+            manager
+                .status()
+                .process
+                .map(|p| p.kind)
+                .expect("the benchmark work causes a walredo process to be spawned"),
+            std::borrow::Cow::Borrowed(process_kind.into())
+        );
+    }
+
+    elapsed
 }

 async fn client(
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,12 +1,8 @@
-use std::collections::HashMap;
-
-use bytes::Bytes;
 use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
    http::error::HttpErrorBody,
    id::{TenantId, TimelineId},
-    lsn::Lsn,
 };

 pub mod util;
@@ -490,18 +486,6 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

-    pub async fn top_tenant_shards(
-        &self,
-        request: TopTenantShardsRequest,
-    ) -> Result<TopTenantShardsResponse> {
-        let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint);
-        self.request(Method::POST, uri, request)
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
    pub async fn layer_map_info(
        &self,
        tenant_shard_id: TenantShardId,
@@ -565,57 +549,4 @@ impl Client {
            }),
        }
    }
-
-    pub async fn ingest_aux_files(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        aux_files: HashMap<String, String>,
-    ) -> Result<bool> {
-        let uri = format!(
-            "{}/v1/tenant/{}/timeline/{}/ingest_aux_files",
-            self.mgmt_api_endpoint, tenant_shard_id, timeline_id
-        );
-        let resp = self
-            .request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files })
-            .await?;
-        match resp.status() {
-            StatusCode::OK => Ok(true),
-            status => Err(match resp.json::<HttpErrorBody>().await {
-                Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
-                Err(_) => {
-                    Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
-                }
-            }),
-        }
-    }
-
-    pub async fn list_aux_files(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        lsn: Lsn,
-    ) -> Result<HashMap<String, Bytes>> {
-        let uri = format!(
-            "{}/v1/tenant/{}/timeline/{}/list_aux_files",
-            self.mgmt_api_endpoint, tenant_shard_id, timeline_id
-        );
-        let resp = self
-            .request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn })
-            .await?;
-        match resp.status() {
-            StatusCode::OK => {
-                let resp: HashMap<String, Bytes> = resp.json().await.map_err(|e| {
-                    Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}"))
-                })?;
-                Ok(resp)
-            }
-            status => Err(match resp.json::<HttpErrorBody>().await {
-                Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
-                Err(_) => {
-                    Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
-                }
-            }),
-        }
-    }
 }
--- a/pageserver/compaction/src/bin/compaction-simulator.rs
+++ b/pageserver/compaction/src/bin/compaction-simulator.rs
@@ -1,5 +1,4 @@
 use clap::{Parser, Subcommand};
-use pageserver_compaction::helpers::PAGE_SZ;
 use pageserver_compaction::simulator::MockTimeline;
 use rand::Rng;
 use std::io::Write;
@@ -52,7 +51,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
    let mut executor = MockTimeline::new();

    // Convert the logical size in MB into a key range.
-    let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ);
+    let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
    //let key_range = u64::MIN..u64::MAX;
    println!(
        "starting simulation with key range {:016X}-{:016X}",
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -25,7 +25,7 @@ use std::collections::{HashSet, VecDeque};
 use std::ops::Range;

 use crate::helpers::{
-    accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ,
+    accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with,
 };
 use crate::interface::*;
 use utils::lsn::Lsn;
@@ -379,7 +379,7 @@ where
                .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
                .await?,
            &self.shard_identity,
-        ) * PAGE_SZ;
+        ) * 8192;

        let wal_size = job
            .input_layers
@@ -441,7 +441,7 @@ where
        let mut window = KeyspaceWindow::new(
            E::Key::MIN..E::Key::MAX,
            keyspace,
-            self.target_file_size / PAGE_SZ,
+            self.target_file_size / 8192,
        );
        while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
            new_jobs.push(CompactionJob::<E> {
@@ -530,6 +530,8 @@ where
        // If we have accumulated only a narrow band of keyspace, create an
        // image layer. Otherwise write a delta layer.

+        // FIXME: deal with the case of lots of values for same key
+
        // FIXME: we are ignoring images here. Did we already divide the work
        // so that we won't encounter them here?

@@ -548,93 +550,38 @@ where
        let mut new_jobs = Vec::new();

        // Slide a window through the keyspace
-        let mut key_accum =
-            std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size));
+        let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
        let mut all_in_window: bool = false;
        let mut window = Window::new();
-
-        // Helper function to create a job for a new delta layer with given key-lsn
-        // rectangle.
-        let create_delta_job = |key_range, lsn_range: &Range<Lsn>, new_jobs: &mut Vec<_>| {
-            // The inputs for the job are all the input layers of the original job that
-            // overlap with the rectangle.
-            let batch_layers: Vec<LayerId> = job
-                .input_layers
-                .iter()
-                .filter(|layer_id| {
-                    overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
-                })
-                .cloned()
-                .collect();
-            assert!(!batch_layers.is_empty());
-            new_jobs.push(CompactionJob {
-                key_range,
-                lsn_range: lsn_range.clone(),
-                strategy: CompactionStrategy::CreateDelta,
-                input_layers: batch_layers,
-                completed: false,
-            });
-        };
-
        loop {
-            if all_in_window && window.is_empty() {
+            if all_in_window && window.elems.is_empty() {
                // All done!
                break;
            }
-
-            // If we now have enough keyspace for next delta layer in the window, create a
-            // new delta layer
            if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
            {
-                create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
-                continue;
-            }
-            assert!(!all_in_window);
-
-            // Process next key in the key space
-            match key_accum.next().await.transpose()? {
-                None => {
-                    all_in_window = true;
-                }
-                Some(next_key) if next_key.partition_lsns.is_empty() => {
-                    // Normal case: extend the window by the key
+                let batch_layers: Vec<LayerId> = job
+                    .input_layers
+                    .iter()
+                    .filter(|layer_id| {
+                        overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
+                    })
+                    .cloned()
+                    .collect();
+                assert!(!batch_layers.is_empty());
+                new_jobs.push(CompactionJob {
+                    key_range,
+                    lsn_range: job.lsn_range.clone(),
+                    strategy: CompactionStrategy::CreateDelta,
+                    input_layers: batch_layers,
+                    completed: false,
+                });
+            } else {
+                assert!(!all_in_window);
+                if let Some(next_key) = key_accum.next().await.transpose()? {
                    window.feed(next_key.key, next_key.size);
-                }
-                Some(next_key) => {
-                    // A key with too large size impact for a single delta layer. This
-                    // case occurs if you make a huge number of updates for a single key.
-                    //
-                    // Drain the window with has_more = false to make a clean cut before
-                    // the key, and then make dedicated delta layers for the single key.
-                    //
-                    // We cannot cluster the key with the others, because we don't want
-                    // layer files to overlap with each other in the lsn,key space (no
-                    // overlaps for the rectangles).
-                    let key = next_key.key;
-                    debug!("key {key} with size impact larger than the layer size");
-                    while !window.is_empty() {
-                        let has_more = false;
-                        let key_range = window.choose_next_delta(self.target_file_size, has_more)
-                            .expect("with has_more==false, choose_next_delta always returns something for a non-empty Window");
-                        create_delta_job(key_range, &job.lsn_range, &mut new_jobs);
-                    }
-
-                    // Not really required: but here for future resilience:
-                    // We make a "gap" here, so any structure the window holds should
-                    // probably be reset.
-                    window = Window::new();
-
-                    let mut prior_lsn = job.lsn_range.start;
-                    let mut lsn_ranges = Vec::new();
-                    for (lsn, _size) in next_key.partition_lsns.iter() {
-                        lsn_ranges.push(prior_lsn..*lsn);
-                        prior_lsn = *lsn;
-                    }
-                    lsn_ranges.push(prior_lsn..job.lsn_range.end);
-                    for lsn_range in lsn_ranges {
-                        let key_range = key..key.next();
-                        create_delta_job(key_range, &lsn_range, &mut new_jobs);
-                    }
+                } else {
+                    all_in_window = true;
                }
            }
        }
@@ -663,8 +610,8 @@ where
    }
 }

-/// Sliding window through keyspace and values for image layer
-/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points
+// Sliding window through keyspace and values
+// This is used by cover_with_images to decide on good split points
 struct KeyspaceWindow<K> {
    head: KeyspaceWindowHead<K>,

@@ -804,9 +751,9 @@ struct WindowElement<K> {
    accum_size: u64,
 }

-/// Sliding window through keyspace and values for delta layer tiling
-///
-/// This is used to decide which delta layer to write next.
+// Sliding window through keyspace and values
+//
+// This is used to decide what layer to write next, from the beginning of the window.
 struct Window<K> {
    elems: VecDeque<WindowElement<K>>,

@@ -830,13 +777,11 @@ where
    fn feed(&mut self, key: K, size: u64) {
        let last_size;
        if let Some(last) = self.elems.back_mut() {
-            // We require the keys to be strictly increasing for the window.
-            // Keys should already have been deduplicated by `accum_key_values`
-            assert!(
-                last.last_key < key,
-                "last_key(={}) >= key(={key})",
-                last.last_key
-            );
+            assert!(last.last_key <= key);
+            if key == last.last_key {
+                last.accum_size += size;
+                return;
+            }
            last_size = last.accum_size;
        } else {
            last_size = 0;
@@ -858,10 +803,6 @@ where
        self.elems.front().unwrap().accum_size - self.splitoff_size
    }

-    fn is_empty(&self) -> bool {
-        self.elems.is_empty()
-    }
-
    fn commit_upto(&mut self, mut upto: usize) {
        while upto > 1 {
            let popped = self.elems.pop_front().unwrap();
@@ -924,7 +865,7 @@ where
        // If we're willing to stretch it up to 1.25 target size, could we
        // gobble up the rest of the work? This avoids creating very small
        // "tail" layers at the end of the keyspace
-        if !has_more && self.remain_size() < target_size * 5 / 4 {
+        if !has_more && self.remain_size() < target_size * 5 / 3 {
            self.commit_upto(self.elems.len());
        } else {
            let delta_split_at = self.find_size_split(target_size);
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -16,8 +16,6 @@ use std::pin::Pin;
 use std::task::{ready, Poll};
 use utils::lsn::Lsn;

-pub const PAGE_SZ: u64 = 8192;
-
 pub fn keyspace_total_size<K>(
    keyspace: &CompactionKeySpace<K>,
    shard_identity: &ShardIdentity,
@@ -237,14 +235,9 @@ pub struct KeySize<K> {
    pub key: K,
    pub num_values: u64,
    pub size: u64,
-    /// The lsns to partition at (if empty then no per-lsn partitioning)
-    pub partition_lsns: Vec<(Lsn, u64)>,
 }

-pub fn accum_key_values<'a, I, K, D, E>(
-    input: I,
-    target_size: u64,
-) -> impl Stream<Item = Result<KeySize<K>, E>>
+pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
 where
    K: Eq + PartialOrd + Display + Copy,
    I: Stream<Item = Result<D, E>>,
@@ -256,35 +249,25 @@ where

        if let Some(first) = input.next().await {
            let first = first?;
-            let mut part_size = first.size();
            let mut accum: KeySize<K> = KeySize {
                key: first.key(),
                num_values: 1,
-                size: part_size,
-                partition_lsns: Vec::new(),
+                size: first.size(),
            };
            let mut last_key = accum.key;
            while let Some(this) = input.next().await {
                let this = this?;
                if this.key() == accum.key {
-                    let add_size = this.size();
-                    if part_size + add_size > target_size {
-                        accum.partition_lsns.push((this.lsn(), part_size));
-                        part_size = 0;
-                    }
-                    part_size += add_size;
-                    accum.size += add_size;
+                    accum.size += this.size();
                    accum.num_values += 1;
                } else {
                    assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
                    last_key = accum.key;
                    yield accum;
-                    part_size = this.size();
                    accum = KeySize {
                        key: this.key(),
                        num_values: 1,
-                        size: part_size,
-                        partition_lsns: Vec::new(),
+                        size: this.size(),
                    };
                }
            }
--- a/pageserver/compaction/src/identify_levels.rs
+++ b/pageserver/compaction/src/identify_levels.rs
@@ -184,12 +184,6 @@ impl<L> Level<L> {
        }
        let mut events: Vec<Event<K>> = Vec::new();
        for (idx, l) in self.layers.iter().enumerate() {
-            let key_range = l.key_range();
-            if key_range.end == key_range.start.next() && l.is_delta() {
-                // Ignore single-key delta layers as they can be stacked on top of each other
-                // as that is the only way to cut further.
-                continue;
-            }
            events.push(Event {
                key: l.key_range().start,
                layer_idx: idx,
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -14,7 +14,6 @@ use std::ops::Range;
 use std::sync::Arc;
 use std::sync::Mutex;

-use crate::helpers::PAGE_SZ;
 use crate::helpers::{merge_delta_keys, overlaps_with};

 use crate::interface;
@@ -380,8 +379,8 @@ impl interface::CompactionLayer<Key> for MockLayer {
    }
    fn file_size(&self) -> u64 {
        match self {
-            MockLayer::Delta(this) => this.file_size,
-            MockLayer::Image(this) => this.file_size,
+            MockLayer::Delta(this) => this.file_size(),
+            MockLayer::Image(this) => this.file_size(),
        }
    }
    fn short_id(&self) -> String {
@@ -510,7 +509,7 @@ impl interface::CompactionJobExecutor for MockTimeline {
        let new_layer = Arc::new(MockImageLayer {
            key_range: key_range.clone(),
            lsn_range: lsn..lsn,
-            file_size: accum_size * PAGE_SZ,
+            file_size: accum_size * 8192,
            deleted: Mutex::new(false),
        });
        info!(
--- a/pageserver/compaction/tests/tests.rs
+++ b/pageserver/compaction/tests/tests.rs
@@ -20,6 +20,10 @@ pub(crate) fn setup_logging() {
 /// even if we produce an extremely narrow delta layer, spanning just that one
 /// key, we still too many records to fit in the target file size. We need to
 /// split in the LSN dimension too in that case.
+///
+/// TODO: The code to avoid this problem has not been implemented yet! So the
+/// assertion currently fails, but we need to make it not fail.
+#[ignore]
 #[tokio::test]
 async fn test_many_updates_for_single_key() {
    setup_logging();
@@ -39,9 +43,9 @@ async fn test_many_updates_for_single_key() {
    }
    for l in executor.live_layers.iter() {
        assert!(l.file_size() < executor.target_file_size * 2);
-        // Sanity check that none of the delta layers are empty either.
+        // sanity check that none of the delta layers are stupidly small either
        if l.is_delta() {
-            assert!(l.file_size() > 0);
+            assert!(l.file_size() > executor.target_file_size / 2);
        }
    }
 }
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -52,6 +52,7 @@

 use anyhow::{Context, Result};
 use pageserver::repository::Key;
+use pageserver::METADATA_FILE_NAME;
 use std::cmp::Ordering;
 use std::io::{self, BufRead};
 use std::path::PathBuf;
@@ -82,11 +83,6 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
    let split: Vec<&str> = name.split("__").collect();
    let keys: Vec<&str> = split[0].split('-').collect();
    let mut lsns: Vec<&str> = split[1].split('-').collect();
-
-    if lsns.last().expect("should").len() == 8 {
-        lsns.pop();
-    }
-
    if lsns.len() == 1 {
        lsns.push(lsns[0]);
    }
@@ -158,6 +154,10 @@ pub fn main() -> Result<()> {
        let line = PathBuf::from_str(&line).unwrap();
        let filename = line.file_name().unwrap();
        let filename = filename.to_str().unwrap();
+        if filename == METADATA_FILE_NAME {
+            // Don't try to parse "metadata" like a key-lsn range
+            continue;
+        }
        let (key_range, lsn_range) = parse_filename(filename);
        files.push(Layer {
            filename: filename.to_owned(),
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;

 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
 use utils::lsn::Lsn;
@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
            #[derive(serde::Serialize)]
            struct Output<'a> {
-                layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
+                layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
                disk_consistent_lsn: Lsn,
                timeline_metadata: &'a TimelineMetadata,
            }
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -219,7 +219,6 @@ fn handle_metadata(
    let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
    println!("Current metadata:\n{meta:?}");
    let mut update_meta = false;
-    // TODO: simplify this part
    if let Some(disk_consistent_lsn) = disk_consistent_lsn {
        meta = TimelineMetadata::new(
            *disk_consistent_lsn,
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -1,98 +0,0 @@
-use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
-use pageserver_api::shard::TenantShardId;
-use utils::id::TenantTimelineId;
-use utils::lsn::Lsn;
-
-use std::collections::HashMap;
-use std::sync::Arc;
-
-/// Ingest aux files into the pageserver.
-#[derive(clap::Parser)]
-pub(crate) struct Args {
-    #[clap(long, default_value = "http://localhost:9898")]
-    mgmt_api_endpoint: String,
-    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
-    page_service_connstring: String,
-    #[clap(long)]
-    pageserver_jwt: Option<String>,
-
-    targets: Option<Vec<TenantTimelineId>>,
-}
-
-pub(crate) fn main(args: Args) -> anyhow::Result<()> {
-    let rt = tokio::runtime::Builder::new_multi_thread()
-        .enable_all()
-        .build()
-        .unwrap();
-
-    let main_task = rt.spawn(main_impl(args));
-    rt.block_on(main_task).unwrap()
-}
-
-async fn main_impl(args: Args) -> anyhow::Result<()> {
-    let args: &'static Args = Box::leak(Box::new(args));
-
-    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
-        args.mgmt_api_endpoint.clone(),
-        args.pageserver_jwt.as_deref(),
-    ));
-
-    // discover targets
-    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
-        &mgmt_api_client,
-        crate::util::cli::targets::Spec {
-            limit_to_first_n_targets: None,
-            targets: {
-                if let Some(targets) = &args.targets {
-                    if targets.len() != 1 {
-                        anyhow::bail!("must specify exactly one target");
-                    }
-                    Some(targets.clone())
-                } else {
-                    None
-                }
-            },
-        },
-    )
-    .await?;
-
-    let timeline = timelines[0];
-    let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
-    let timeline_id = timeline.timeline_id;
-
-    println!("operating on timeline {}", timeline);
-
-    mgmt_api_client
-        .tenant_config(&TenantConfigRequest {
-            tenant_id: timeline.tenant_id,
-            config: TenantConfig {
-                switch_aux_file_policy: Some(AuxFilePolicy::V2),
-                ..Default::default()
-            },
-        })
-        .await?;
-
-    for batch in 0..100 {
-        let items = (0..100)
-            .map(|id| {
-                (
-                    format!("pg_logical/mappings/{:03}.{:03}", batch, id),
-                    format!("{:08}", id),
-                )
-            })
-            .collect::<HashMap<_, _>>();
-        let file_cnt = items.len();
-        mgmt_api_client
-            .ingest_aux_files(tenant_shard_id, timeline_id, items)
-            .await?;
-        println!("ingested {file_cnt} files");
-    }
-
-    let files = mgmt_api_client
-        .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1))
-        .await?;
-
-    println!("{} files found", files.len());
-
-    anyhow::Ok(())
-}
--- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
+++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
@@ -2,11 +2,9 @@ use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};

 use pageserver_client::mgmt_api;
 use rand::seq::SliceRandom;
-use tokio_util::sync::CancellationToken;
 use tracing::{debug, info};
 use utils::id::{TenantTimelineId, TimelineId};

-use std::{f64, sync::Arc};
 use tokio::{
    sync::{mpsc, OwnedSemaphorePermit},
    task::JoinSet,
@@ -14,7 +12,10 @@ use tokio::{

 use std::{
    num::NonZeroUsize,
-    sync::atomic::{AtomicU64, Ordering},
+    sync::{
+        atomic::{AtomicU64, Ordering},
+        Arc,
+    },
    time::{Duration, Instant},
 };

@@ -50,31 +51,19 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
    Ok(())
 }

-#[derive(serde::Serialize)]
-struct Output {
-    downloads_count: u64,
-    downloads_bytes: u64,
-    evictions_count: u64,
-    timeline_restarts: u64,
-    #[serde(with = "humantime_serde")]
-    runtime: Duration,
-}
-
 #[derive(Debug, Default)]
 struct LiveStats {
-    evictions_count: AtomicU64,
-    downloads_count: AtomicU64,
-    downloads_bytes: AtomicU64,
+    evictions: AtomicU64,
+    downloads: AtomicU64,
    timeline_restarts: AtomicU64,
 }

 impl LiveStats {
    fn eviction_done(&self) {
-        self.evictions_count.fetch_add(1, Ordering::Relaxed);
+        self.evictions.fetch_add(1, Ordering::Relaxed);
    }
-    fn download_done(&self, size: u64) {
-        self.downloads_count.fetch_add(1, Ordering::Relaxed);
-        self.downloads_bytes.fetch_add(size, Ordering::Relaxed);
+    fn download_done(&self) {
+        self.downloads.fetch_add(1, Ordering::Relaxed);
    }
    fn timeline_restart_done(&self) {
        self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
@@ -103,49 +92,28 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    )
    .await?;

-    let token = CancellationToken::new();
    let mut tasks = JoinSet::new();

-    let periodic_stats = Arc::new(LiveStats::default());
-    let total_stats = Arc::new(LiveStats::default());
-
-    let start = Instant::now();
+    let live_stats = Arc::new(LiveStats::default());
    tasks.spawn({
-        let periodic_stats = Arc::clone(&periodic_stats);
-        let total_stats = Arc::clone(&total_stats);
-        let cloned_token = token.clone();
+        let live_stats = Arc::clone(&live_stats);
        async move {
            let mut last_at = Instant::now();
            loop {
-                if cloned_token.is_cancelled() {
-                    return;
-                }
                tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
                let now = Instant::now();
                let delta: Duration = now - last_at;
                last_at = now;

                let LiveStats {
-                    evictions_count,
-                    downloads_count,
-                    downloads_bytes,
+                    evictions,
+                    downloads,
                    timeline_restarts,
-                } = &*periodic_stats;
-                let evictions_count = evictions_count.swap(0, Ordering::Relaxed);
-                let downloads_count = downloads_count.swap(0, Ordering::Relaxed);
-                let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed);
+                } = &*live_stats;
+                let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
+                let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
                let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
-
-                total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed);
-                total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed);
-                total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed);
-                total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed);
-
-                let evictions_per_s = evictions_count as f64 / delta.as_secs_f64();
-                let downloads_per_s = downloads_count as f64 / delta.as_secs_f64();
-                let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64);
-
-                info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}");
+                info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
            }
        }
    });
@@ -156,42 +124,14 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
                args,
                Arc::clone(&mgmt_api_client),
                tl,
-                Arc::clone(&periodic_stats),
-                token.clone(),
+                Arc::clone(&live_stats),
            ));
        }
    }
-    if let Some(runtime) = args.runtime {
-        tokio::spawn(async move {
-            tokio::time::sleep(runtime.into()).await;
-            token.cancel();
-        });
-    }

    while let Some(res) = tasks.join_next().await {
        res.unwrap();
    }
-    let end = Instant::now();
-    let duration: Duration = end - start;
-
-    let output = {
-        let LiveStats {
-            evictions_count,
-            downloads_count,
-            downloads_bytes,
-            timeline_restarts,
-        } = &*total_stats;
-        Output {
-            downloads_count: downloads_count.load(Ordering::Relaxed),
-            downloads_bytes: downloads_bytes.load(Ordering::Relaxed),
-            evictions_count: evictions_count.load(Ordering::Relaxed),
-            timeline_restarts: timeline_restarts.load(Ordering::Relaxed),
-            runtime: duration,
-        }
-    };
-    let output = serde_json::to_string_pretty(&output).unwrap();
-    println!("{output}");
-
    Ok(())
 }

@@ -200,7 +140,6 @@ async fn timeline_actor(
    mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
    timeline: TenantTimelineId,
    live_stats: Arc<LiveStats>,
-    token: CancellationToken,
 ) {
    // TODO: support sharding
    let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
@@ -210,7 +149,7 @@ async fn timeline_actor(
        layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
        concurrency: Arc<tokio::sync::Semaphore>,
    }
-    while !token.is_cancelled() {
+    loop {
        debug!("restarting timeline");
        let layer_map_info = mgmt_api_client
            .layer_map_info(tenant_shard_id, timeline.timeline_id)
@@ -246,7 +185,7 @@ async fn timeline_actor(

        live_stats.timeline_restart_done();

-        while !token.is_cancelled() {
+        loop {
            assert!(!timeline.joinset.is_empty());
            if let Some(res) = timeline.joinset.try_join_next() {
                debug!(?res, "a layer actor exited, should not happen");
@@ -316,7 +255,7 @@ async fn layer_actor(
                    .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
                    .await
                    .unwrap();
-                live_stats.download_done(layer.layer_file_size());
+                live_stats.download_done();
                did_it
            }
        };
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -14,7 +14,6 @@ mod util {

 /// The pagebench CLI sub-commands, dispatched in [`main`] below.
 mod cmd {
-    pub(super) mod aux_files;
    pub(super) mod basebackup;
    pub(super) mod getpage_latest_lsn;
    pub(super) mod ondemand_download_churn;
@@ -28,7 +27,6 @@ enum Args {
    GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
    TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
    OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
-    AuxFiles(cmd::aux_files::Args),
 }

 fn main() {
@@ -48,7 +46,6 @@ fn main() {
            cmd::trigger_initial_size_calculation::main(args)
        }
        Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
-        Args::AuxFiles(args) => cmd::aux_files::main(args),
    }
    .unwrap()
 }
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -1,39 +1,15 @@
-use std::sync::Arc;
-
-use ::metrics::IntGauge;
 use bytes::{Buf, BufMut, Bytes};
 use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
 use tracing::warn;

-// BEGIN Copyright (c) 2017 Servo Contributors
-
-/// Const version of FNV hash.
-#[inline]
-#[must_use]
-pub const fn fnv_hash(bytes: &[u8]) -> u128 {
-    const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d;
-    const PRIME: u128 = 0x0000000001000000000000000000013B;
-
-    let mut hash = INITIAL_STATE;
-    let mut i = 0;
-    while i < bytes.len() {
-        hash ^= bytes[i] as u128;
-        hash = hash.wrapping_mul(PRIME);
-        i += 1;
-    }
-    hash
-}
-
-// END Copyright (c) 2017 Servo Contributors
-
-/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash].
+/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
 fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
-    let mut key: [u8; 16] = [0; METADATA_KEY_SIZE];
-    let hash = fnv_hash(data).to_be_bytes();
+    let mut key = [0; METADATA_KEY_SIZE];
+    let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
    key[0] = AUX_KEY_PREFIX;
    key[1] = dir_level1;
    key[2] = dir_level2;
-    key[3..16].copy_from_slice(&hash[3..16]);
+    key[3..16].copy_from_slice(&hash[0..13]);
    Key::from_metadata_key_fixed_size(&key)
 }

@@ -164,55 +140,6 @@ pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
    Ok(encoded)
 }

-/// An estimation of the size of aux files.
-pub struct AuxFileSizeEstimator {
-    aux_file_size_gauge: IntGauge,
-    size: Arc<std::sync::Mutex<Option<isize>>>,
-}
-
-impl AuxFileSizeEstimator {
-    pub fn new(aux_file_size_gauge: IntGauge) -> Self {
-        Self {
-            aux_file_size_gauge,
-            size: Arc::new(std::sync::Mutex::new(None)),
-        }
-    }
-
-    pub fn on_base_backup(&self, new_size: usize) {
-        let mut guard = self.size.lock().unwrap();
-        *guard = Some(new_size as isize);
-        self.report(new_size as isize);
-    }
-
-    pub fn on_add(&self, file_size: usize) {
-        let mut guard = self.size.lock().unwrap();
-        if let Some(size) = &mut *guard {
-            *size += file_size as isize;
-            self.report(*size);
-        }
-    }
-
-    pub fn on_remove(&self, file_size: usize) {
-        let mut guard = self.size.lock().unwrap();
-        if let Some(size) = &mut *guard {
-            *size -= file_size as isize;
-            self.report(*size);
-        }
-    }
-
-    pub fn on_update(&self, old_size: usize, new_size: usize) {
-        let mut guard = self.size.lock().unwrap();
-        if let Some(size) = &mut *guard {
-            *size += new_size as isize - old_size as isize;
-            self.report(*size);
-        }
-    }
-
-    pub fn report(&self, size: isize) {
-        self.aux_file_size_gauge.set(size as i64);
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -221,19 +148,15 @@ mod tests {
    fn test_hash_portable() {
        // AUX file encoding requires the hash to be portable across all platforms. This test case checks
        // if the algorithm produces the same hash across different environments.
-
        assert_eq!(
-            265160408618497461376862998434862070044,
-            super::fnv_hash("test1".as_bytes())
+            305317690835051308206966631765527126151,
+            twox_hash::xxh3::hash128("test1".as_bytes())
        );
        assert_eq!(
-            295486155126299629456360817749600553988,
-            super::fnv_hash("test/test2".as_bytes())
-        );
-        assert_eq!(
-            144066263297769815596495629667062367629,
-            super::fnv_hash("".as_bytes())
+            85104974691013376326742244813280798847,
+            twox_hash::xxh3::hash128("test/test2".as_bytes())
        );
+        assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
    }

    #[test]
@@ -241,28 +164,28 @@ mod tests {
        // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions
        // of the page server.
        assert_eq!(
-            "62000001017F8B83D94F7081693471ABF91C",
-            encode_aux_file_key("pg_logical/mappings/test1").to_string(),
+            "6200000101E5B20C5F8DD5AA3289D6D9EAFA",
+            encode_aux_file_key("pg_logical/mappings/test1").to_string()
        );
        assert_eq!(
-            "62000001027F8E83D94F7081693471ABFCCD",
-            encode_aux_file_key("pg_logical/snapshots/test2").to_string(),
+            "620000010239AAC544893139B26F501B97E6",
+            encode_aux_file_key("pg_logical/snapshots/test2").to_string()
        );
        assert_eq!(
-            "62000001032E07BB014262B821756295C58D",
-            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(),
+            "620000010300000000000000000000000000",
+            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
        );
        assert_eq!(
-            "62000001FF4F38E1C74754E7D03C1A660178",
-            encode_aux_file_key("pg_logical/unsupported").to_string(),
+            "62000001FF8635AF2134B7266EC5B4189FD6",
+            encode_aux_file_key("pg_logical/unsupported").to_string()
        );
        assert_eq!(
-            "62000002017F8D83D94F7081693471ABFB92",
+            "6200000201772D0E5D71DE14DA86142A1619",
            encode_aux_file_key("pg_replslot/test3").to_string()
        );
        assert_eq!(
-            "620000FFFF2B6ECC8AEF93F643DC44F15E03",
-            encode_aux_file_key("other_file_not_supported").to_string(),
+            "620000FFFF1866EBEB53B807B26A2416F317",
+            encode_aux_file_key("other_file_not_supported").to_string()
        );
    }

--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -284,6 +284,7 @@ fn start_pageserver(
    ))
    .unwrap();
    pageserver::preinitialize_metrics();
+    pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);

    // If any failpoints were set from FAILPOINTS environment variable,
    // print them to the log for debugging purposes
@@ -382,7 +383,7 @@ fn start_pageserver(
    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();

    // Set up remote storage client
-    let remote_storage = create_remote_storage_client(conf)?;
+    let remote_storage = Some(create_remote_storage_client(conf)?);

    // Set up deletion queue
    let (deletion_queue, deletion_workers) = DeletionQueue::new(
@@ -515,12 +516,16 @@ fn start_pageserver(
        }
    });

-    let secondary_controller = secondary::spawn_tasks(
-        tenant_manager.clone(),
-        remote_storage.clone(),
-        background_jobs_barrier.clone(),
-        shutdown_pageserver.clone(),
-    );
+    let secondary_controller = if let Some(remote_storage) = &remote_storage {
+        secondary::spawn_tasks(
+            tenant_manager.clone(),
+            remote_storage.clone(),
+            background_jobs_barrier.clone(),
+            shutdown_pageserver.clone(),
+        )
+    } else {
+        secondary::null_controller()
+    };

    // shared state between the disk-usage backed eviction background task and the http endpoint
    // that allows triggering disk-usage based eviction manually. note that the http endpoint
@@ -528,13 +533,15 @@ fn start_pageserver(
    // been configured.
    let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();

-    launch_disk_usage_global_eviction_task(
-        conf,
-        remote_storage.clone(),
-        disk_usage_eviction_state.clone(),
-        tenant_manager.clone(),
-        background_jobs_barrier.clone(),
-    )?;
+    if let Some(remote_storage) = &remote_storage {
+        launch_disk_usage_global_eviction_task(
+            conf,
+            remote_storage.clone(),
+            disk_usage_eviction_state.clone(),
+            tenant_manager.clone(),
+            background_jobs_barrier.clone(),
+        )?;
+    }

    // Start up the service to handle HTTP mgmt API request. We created the
    // listener earlier already.
@@ -647,20 +654,17 @@ fn start_pageserver(
            None,
            "libpq endpoint listener",
            true,
-            {
-                let tenant_manager = tenant_manager.clone();
-                async move {
-                    page_service::libpq_listener_main(
-                        tenant_manager,
-                        broker_client,
-                        pg_auth,
-                        pageserver_listener,
-                        conf.pg_auth_type,
-                        libpq_ctx,
-                        task_mgr::shutdown_token(),
-                    )
-                    .await
-                }
+            async move {
+                page_service::libpq_listener_main(
+                    conf,
+                    broker_client,
+                    pg_auth,
+                    pageserver_listener,
+                    conf.pg_auth_type,
+                    libpq_ctx,
+                    task_mgr::shutdown_token(),
+                )
+                .await
            },
        );
    }
@@ -689,7 +693,14 @@ fn start_pageserver(
            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
            // The plan is to change that over time.
            shutdown_pageserver.take();
-            pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
+            let bg_remote_storage = remote_storage.clone();
+            let bg_deletion_queue = deletion_queue.clone();
+            pageserver::shutdown_pageserver(
+                &tenant_manager,
+                bg_remote_storage.map(|_| bg_deletion_queue),
+                0,
+            )
+            .await;
            unreachable!()
        })
    }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -5,7 +5,7 @@
 //! See also `settings.md` for better description on every parameter.

 use anyhow::{anyhow, bail, ensure, Context, Result};
-use pageserver_api::{models::CompactionAlgorithm, shard::TenantShardId};
+use pageserver_api::shard::TenantShardId;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use serde;
 use serde::de::IntoDeserializer;
@@ -15,7 +15,7 @@ use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
 use utils::logging::SecretString;

-use once_cell::sync::{Lazy, OnceCell};
+use once_cell::sync::OnceCell;
 use reqwest::Url;
 use std::num::NonZeroUsize;
 use std::str::FromStr;
@@ -99,7 +99,7 @@ pub mod defaults {

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

-    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "async";
+    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";

    ///
    /// Default built-in configuration file.
@@ -1067,19 +1067,6 @@ impl PageServerConf {

        conf.default_tenant_conf = t_conf.merge(TenantConf::default());

-        {
-            const VAR_NAME: &str = "NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM";
-            static VAR: Lazy<Option<bool>> = Lazy::new(|| utils::env::var(VAR_NAME));
-            if VAR.unwrap_or(false)
-                && conf.default_tenant_conf.compaction_algorithm.kind
-                    == CompactionAlgorithm::NotSpecified
-            {
-                panic!(
-                        "Unspecified compaction algorithm in default tenant configuration. \
-                        Set the algorithm explicitly in the pageserver.toml's `tenant_config` field or unset the environment variable {VAR_NAME}");
-            }
-        }
-
        Ok(conf)
    }

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -632,7 +632,7 @@ impl DeletionQueue {
    ///
    /// If remote_storage is None, then the returned workers will also be None.
    pub fn new<C>(
-        remote_storage: GenericRemoteStorage,
+        remote_storage: Option<GenericRemoteStorage>,
        control_plane_client: Option<C>,
        conf: &'static PageServerConf,
    ) -> (Self, Option<DeletionQueueWorkers<C>>)
@@ -658,6 +658,23 @@ impl DeletionQueue {
        // longer to flush after Tenants have all been torn down.
        let cancel = CancellationToken::new();

+        let remote_storage = match remote_storage {
+            None => {
+                return (
+                    Self {
+                        client: DeletionQueueClient {
+                            tx,
+                            executor_tx,
+                            lsn_table: lsn_table.clone(),
+                        },
+                        cancel,
+                    },
+                    None,
+                )
+            }
+            Some(r) => r,
+        };
+
        (
            Self {
                client: DeletionQueueClient {
@@ -748,7 +765,7 @@ mod test {
        /// Simulate a pageserver restart by destroying and recreating the deletion queue
        async fn restart(&mut self) {
            let (deletion_queue, workers) = DeletionQueue::new(
-                self.storage.clone(),
+                Some(self.storage.clone()),
                Some(self.mock_control_plane.clone()),
                self.harness.conf,
            );
@@ -858,7 +875,7 @@ mod test {
        let mock_control_plane = MockControlPlane::new();

        let (deletion_queue, worker) = DeletionQueue::new(
-            storage.clone(),
+            Some(storage.clone()),
            Some(mock_control_plane.clone()),
            harness.conf,
        );
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -534,12 +534,18 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    });
                }
                EvictionLayer::Secondary(layer) => {
-                    let file_size = layer.metadata.file_size;
+                    let file_size = layer.metadata.file_size();
+                    let tenant_manager = tenant_manager.clone();

                    js.spawn(async move {
                        layer
                            .secondary_tenant
-                            .evict_layer(layer.timeline_id, layer.name)
+                            .evict_layer(
+                                tenant_manager.get_conf(),
+                                layer.timeline_id,
+                                layer.name,
+                                layer.metadata,
+                            )
                            .await;
                        Ok(file_size)
                    });
@@ -641,7 +647,7 @@ impl EvictionLayer {
    pub(crate) fn get_file_size(&self) -> u64 {
        match self {
            Self::Attached(l) => l.layer_desc().file_size,
-            Self::Secondary(sl) => sl.metadata.file_size,
+            Self::Secondary(sl) => sl.metadata.file_size(),
        }
    }
 }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -257,37 +257,6 @@ paths:
              schema:
                $ref: "#/components/schemas/LsnByTimestampResponse"

-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-    post:
-      description: Obtain lease for the given LSN
-      parameters:
-        - name: lsn
-          in: query
-          required: true
-          schema:
-            type: string
-            format: hex
-          description: A LSN to obtain the lease for
-      responses:
-        "200":
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/LsnLease"
-
  /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
    parameters:
      - name: tenant_id
@@ -612,80 +581,6 @@ paths:
              schema:
                $ref: "#/components/schemas/Error"

-  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
-    parameters:
-      - name: tenant_shard_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        ŕequired: true
-        schema:
-          type: string
-
-    put:
-      description: |
-        Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`.
-        Current implementation might not be retryable across failure cases, but will be enhanced in future.
-        Detaching should be expected to be expensive operation. Timeouts should be retried.
-      responses:
-        "200":
-          description: |
-            The timeline has been detached from it's ancestor (now or earlier), and at least the returned timelines have been reparented.
-            If any timelines were deleted after reparenting, they might not be on this list.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/AncestorDetached"
-
-        "400":
-          description: |
-            Number of early checks meaning the timeline cannot be detached now:
-              - the ancestor of timeline has an ancestor: not supported, see RFC
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-
-        "404":
-          description: Tenant or timeline not found.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-
-        "409":
-          description: |
-            The timeline can never be detached:
-              - timeline has no ancestor, implying that the timeline has never had an ancestor
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ConflictError"
-
-        "500":
-          description: |
-            Transient error, for example, pageserver shutdown happened while
-            processing the request but we were unable to distinguish that. Must
-            be retried.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-
-        "503":
-          description: |
-            Temporarily unavailable, please retry. Possible reasons:
-              - another timeline detach for the same tenant is underway, please retry later
-              - detected shutdown error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
-
  /v1/tenant/:
    get:
      description: Get tenants list
@@ -1085,15 +980,6 @@ components:
          type: string
          enum: [past, present, future, nodata]

-    LsnLease:
-      type: object
-      required:
-        - valid_until
-      properties:
-        valid_until:
-          type: string
-          format: date-time
-
    PageserverUtilization:
      type: object
      required:
@@ -1151,19 +1037,6 @@ components:
          format: int64
          description: How many bytes of layer content were in the latest layer heatmap

-    AncestorDetached:
-      type: object
-      required:
-        - reparented_timelines
-      properties:
-        reparented_timelines:
-          type: array
-          description: Set of reparented timeline ids
-          properties:
-            type: string
-            format: hex
-            description: TimelineId
-

    Error:
      type: object
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1,8 +1,6 @@
 //!
 //! Management HTTP API
 //!
-use std::cmp::Reverse;
-use std::collections::BinaryHeap;
 use std::collections::HashMap;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -16,9 +14,6 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
-use pageserver_api::models::AuxFilePolicy;
-use pageserver_api::models::IngestAuxFilesRequest;
-use pageserver_api::models::ListAuxFilesRequest;
 use pageserver_api::models::LocationConfig;
 use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::ShardParameters;
@@ -29,11 +24,7 @@ use pageserver_api::models::TenantScanRemoteStorageShard;
 use pageserver_api::models::TenantShardLocation;
 use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
-use pageserver_api::models::TenantSorting;
 use pageserver_api::models::TenantState;
-use pageserver_api::models::TopTenantShardItem;
-use pageserver_api::models::TopTenantShardsRequest;
-use pageserver_api::models::TopTenantShardsResponse;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
    TenantLoadRequest, TenantLocationConfigRequest,
@@ -75,7 +66,6 @@ use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
-use crate::tenant::GetTimelineError;
 use crate::tenant::SpawnMode;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::{config::PageServerConf, tenant::mgr};
@@ -114,7 +104,7 @@ pub struct State {
    tenant_manager: Arc<TenantManager>,
    auth: Option<Arc<SwappableJwtAuth>>,
    allowlist_routes: Vec<Uri>,
-    remote_storage: GenericRemoteStorage,
+    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
    deletion_queue_client: DeletionQueueClient,
@@ -128,7 +118,7 @@ impl State {
        conf: &'static PageServerConf,
        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<SwappableJwtAuth>>,
-        remote_storage: GenericRemoteStorage,
+        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
        deletion_queue_client: DeletionQueueClient,
@@ -281,13 +271,6 @@ impl From<GetTenantError> for ApiError {
    }
 }

-impl From<GetTimelineError> for ApiError {
-    fn from(gte: GetTimelineError) -> Self {
-        // Rationale: tenant is activated only after eligble timelines activate
-        ApiError::NotFound(gte.into())
-    }
-}
-
 impl From<GetActiveTenantError> for ApiError {
    fn from(e: GetActiveTenantError) -> ApiError {
        match e {
@@ -395,7 +378,7 @@ async fn build_timeline_info_common(
        let guard = timeline.last_received_wal.lock().unwrap();
        if let Some(info) = guard.as_ref() {
            (
-                Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
+                Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only.
                Some(info.last_received_msg_lsn),
                Some(info.last_received_msg_ts),
            )
@@ -450,8 +433,6 @@ async fn build_timeline_info_common(
        state,

        walreceiver_status,
-
-        last_aux_file_policy: timeline.last_aux_file_policy.load(),
    };
    Ok(info)
 }
@@ -652,7 +633,9 @@ async fn timeline_preserve_initdb_handler(
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;

-        let timeline = tenant.get_timeline(timeline_id, false)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, false)
+            .map_err(|e| ApiError::NotFound(e.into()))?;

        timeline
            .preserve_initdb_archive()
@@ -694,7 +677,9 @@ async fn timeline_detail_handler(

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-        let timeline = tenant.get_timeline(timeline_id, false)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, false)
+            .map_err(|e| ApiError::NotFound(e.into()))?;

        let timeline_info = build_timeline_info(
            &timeline,
@@ -828,6 +813,12 @@ async fn tenant_attach_handler(

    let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;

+    if state.remote_storage.is_none() {
+        return Err(ApiError::BadRequest(anyhow!(
+            "attach_tenant is not possible because pageserver was configured without remote storage"
+        )));
+    }
+
    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
    let shard_params = ShardParameters::default();
    let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
@@ -1652,6 +1643,12 @@ async fn tenant_time_travel_remote_storage_handler(
        )));
    }

+    let Some(storage) = state.remote_storage.as_ref() else {
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            "remote storage not configured, cannot run time travel"
+        )));
+    };
+
    if timestamp > done_if_after {
        return Err(ApiError::BadRequest(anyhow!(
            "The done_if_after timestamp comes before the timestamp to recover to"
@@ -1661,7 +1658,7 @@ async fn tenant_time_travel_remote_storage_handler(
    tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}");

    remote_timeline_client::upload::time_travel_recover_tenant(
-        &state.remote_storage,
+        storage,
        &tenant_shard_id,
        timestamp,
        done_if_after,
@@ -1706,32 +1703,6 @@ async fn handle_tenant_break(
    json_response(StatusCode::OK, ())
 }

-// Obtains an lsn lease on the given timeline.
-async fn lsn_lease_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let lsn: Lsn = parse_query_param(&request, "lsn")?
-        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-
-    let state = get_state(&request);
-
-    let timeline =
-        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
-            .await?;
-    let result = timeline
-        .make_lsn_lease(lsn, &ctx)
-        .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
-
-    json_response(StatusCode::OK, result)
-}
-
 // Run GC immediately on given timeline.
 async fn timeline_gc_handler(
    mut request: Request<Body>,
@@ -1744,7 +1715,12 @@ async fn timeline_gc_handler(
    let gc_req: TimelineGcRequest = json_request(&mut request).await?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
+    let wait_task_done = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)?;
+    let gc_result = wait_task_done
+        .await
+        .context("wait for gc task")
+        .map_err(ApiError::InternalServerError)?
+        .map_err(ApiError::InternalServerError)?;

    json_response(StatusCode::OK, gc_result)
 }
@@ -1767,8 +1743,6 @@ async fn timeline_compact_handler(
    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
        flags |= CompactFlags::ForceImageLayerCreation;
    }
-    let wait_until_uploaded =
-        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1777,9 +1751,6 @@ async fn timeline_compact_handler(
            .compact(&cancel, flags, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;
-        if wait_until_uploaded {
-            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
-        }
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1804,8 +1775,6 @@ async fn timeline_checkpoint_handler(
    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
        flags |= CompactFlags::ForceImageLayerCreation;
    }
-    let wait_until_uploaded =
-        parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -1819,10 +1788,6 @@ async fn timeline_checkpoint_handler(
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;

-        if wait_until_uploaded {
-            timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
-        }
-
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -1906,11 +1871,14 @@ async fn timeline_detach_ancestor_handler(
        let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
        let ctx = &ctx;

-        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, true)
+            .map_err(|e| ApiError::NotFound(e.into()))?;

        let (_guard, prepared) = timeline
            .prepare_to_detach_from_ancestor(&tenant, options, ctx)
-            .await?;
+            .await
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        let res = state
            .tenant_manager
@@ -1940,6 +1908,11 @@ async fn deletion_queue_flush(
 ) -> Result<Response<Body>, ApiError> {
    let state = get_state(&r);

+    if state.remote_storage.is_none() {
+        // Nothing to do if remote storage is disabled.
+        return json_response(StatusCode::OK, ());
+    }
+
    let execute = parse_query_param(&r, "execute")?.unwrap_or(false);

    let flush = async {
@@ -2044,7 +2017,9 @@ async fn active_timeline_of_active_tenant(

    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-    Ok(tenant.get_timeline(timeline_id, true)?)
+    tenant
+        .get_timeline(timeline_id, true)
+        .map_err(|e| ApiError::NotFound(e.into()))
 }

 async fn always_panic_handler(
@@ -2102,11 +2077,18 @@ async fn disk_usage_eviction_run(
    };

    let state = get_state(&r);
+
+    let Some(storage) = state.remote_storage.as_ref() else {
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            "remote storage not configured, cannot run eviction iteration"
+        )));
+    };
+
    let eviction_state = state.disk_usage_eviction_state.clone();

    let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
        &eviction_state,
-        &state.remote_storage,
+        storage,
        usage,
        &state.tenant_manager,
        config.eviction_order,
@@ -2143,23 +2125,29 @@ async fn tenant_scan_remote_handler(
    let state = get_state(&request);
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;

+    let Some(remote_storage) = state.remote_storage.as_ref() else {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "Remote storage not configured"
+        )));
+    };
+
    let mut response = TenantScanRemoteStorageResponse::default();

    let (shards, _other_keys) =
-        list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone())
+        list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone())
            .await
            .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

    for tenant_shard_id in shards {
        let (timeline_ids, _other_keys) =
-            list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone())
+            list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone())
                .await
                .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

        let mut generation = Generation::none();
        for timeline_id in timeline_ids {
            match download_index_part(
-                &state.remote_storage,
+                remote_storage,
                &tenant_shard_id,
                &timeline_id,
                Generation::MAX,
@@ -2308,31 +2296,6 @@ async fn post_tracing_event_handler(
    json_response(StatusCode::OK, ())
 }

-async fn force_aux_policy_switch_handler(
-    mut r: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&r, None)?;
-    let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?;
-    let policy: AuxFilePolicy = json_request(&mut r).await?;
-
-    let state = get_state(&r);
-
-    let tenant = state
-        .tenant_manager
-        .get_attached_tenant_shard(tenant_shard_id)?;
-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-    let timeline =
-        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
-            .await?;
-    timeline
-        .do_switch_aux_policy(policy)
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn put_io_engine_handler(
    mut r: Request<Body>,
    _cancel: CancellationToken,
@@ -2395,150 +2358,6 @@ async fn get_utilization(
        .map_err(ApiError::InternalServerError)
 }

-async fn list_aux_files(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let body: ListAuxFilesRequest = json_request(&mut request).await?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let state = get_state(&request);
-
-    let timeline =
-        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
-            .await?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let files = timeline.list_aux_files(body.lsn, &ctx).await?;
-    json_response(StatusCode::OK, files)
-}
-
-async fn ingest_aux_files(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    let body: IngestAuxFilesRequest = json_request(&mut request).await?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let state = get_state(&request);
-
-    let timeline =
-        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
-            .await?;
-
-    let mut modification = timeline.begin_modification(
-        Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */
-    );
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    for (fname, content) in body.aux_files {
-        modification
-            .put_file(&fname, content.as_bytes(), &ctx)
-            .await
-            .map_err(ApiError::InternalServerError)?;
-    }
-    modification
-        .commit(&ctx)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
-/// Report on the largest tenants on this pageserver, for the storage controller to identify
-/// candidates for splitting
-async fn post_top_tenants(
-    mut r: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&r, None)?;
-    let request: TopTenantShardsRequest = json_request(&mut r).await?;
-    let state = get_state(&r);
-
-    fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 {
-        match order_by {
-            TenantSorting::ResidentSize => sizes.resident_size,
-            TenantSorting::MaxLogicalSize => sizes.max_logical_size,
-        }
-    }
-
-    #[derive(Eq, PartialEq)]
-    struct HeapItem {
-        metric: u64,
-        sizes: TopTenantShardItem,
-    }
-
-    impl PartialOrd for HeapItem {
-        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-            Some(self.cmp(other))
-        }
-    }
-
-    /// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which
-    /// supports popping the greatest item but not the smallest.
-    impl Ord for HeapItem {
-        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-            Reverse(self.metric).cmp(&Reverse(other.metric))
-        }
-    }
-
-    let mut top_n: BinaryHeap<HeapItem> = BinaryHeap::with_capacity(request.limit);
-
-    // FIXME: this is a lot of clones to take this tenant list
-    for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() {
-        if let Some(shards_lt) = request.where_shards_lt {
-            // Ignore tenants which already have >= this many shards
-            if tenant_shard_id.shard_count >= shards_lt {
-                continue;
-            }
-        }
-
-        let sizes = match tenant_slot {
-            TenantSlot::Attached(tenant) => tenant.get_sizes(),
-            TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
-                continue;
-            }
-        };
-        let metric = get_size_metric(&sizes, &request.order_by);
-
-        if let Some(gt) = request.where_gt {
-            // Ignore tenants whose metric is <= the lower size threshold, to do less sorting work
-            if metric <= gt {
-                continue;
-            }
-        };
-
-        match top_n.peek() {
-            None => {
-                // Top N list is empty: candidate becomes first member
-                top_n.push(HeapItem { metric, sizes });
-            }
-            Some(i) if i.metric > metric && top_n.len() < request.limit => {
-                // Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end
-                top_n.push(HeapItem { metric, sizes });
-            }
-            Some(i) if i.metric > metric => {
-                // List is at limit and lowest value is greater than our candidate, drop it.
-            }
-            Some(_) => top_n.push(HeapItem { metric, sizes }),
-        }
-
-        while top_n.len() > request.limit {
-            top_n.pop();
-        }
-    }
-
-    json_response(
-        StatusCode::OK,
-        TopTenantShardsResponse {
-            shards: top_n.into_iter().map(|i| i.sizes).collect(),
-        },
-    )
-}
-
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2751,10 +2570,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
            |r| api_handler(r, get_timestamp_of_lsn_handler),
        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
-            |r| api_handler(r, lsn_lease_handler),
-        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
            |r| api_handler(r, timeline_gc_handler),
@@ -2828,19 +2643,6 @@ pub fn make_router(
            |r| api_handler(r, timeline_collect_keyspace),
        )
        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
-        .put(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
-            |r| api_handler(r, force_aux_policy_switch_handler),
-        )
        .get("/v1/utilization", |r| api_handler(r, get_utilization))
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
-            |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
-        )
-        .post(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files",
-            |r| testing_api_handler("list_aux_files", r, list_aux_files),
-        )
-        .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -57,7 +57,7 @@ pub use crate::metrics::preinitialize_metrics;
 #[tracing::instrument(skip_all, fields(%exit_code))]
 pub async fn shutdown_pageserver(
    tenant_manager: &TenantManager,
-    mut deletion_queue: DeletionQueue,
+    deletion_queue: Option<DeletionQueue>,
    exit_code: i32,
 ) {
    use std::time::Duration;
@@ -89,7 +89,9 @@ pub async fn shutdown_pageserver(
    .await;

    // Best effort to persist any outstanding deletions, to avoid leaking objects
-    deletion_queue.shutdown(Duration::from_secs(5)).await;
+    if let Some(mut deletion_queue) = deletion_queue {
+        deletion_queue.shutdown(Duration::from_secs(5)).await;
+    }

    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
@@ -112,6 +114,10 @@ pub async fn shutdown_pageserver(
    std::process::exit(exit_code);
 }

+/// The name of the metadata file pageserver creates per timeline.
+/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
+pub const METADATA_FILE_NAME: &str = "metadata";
+
 /// Per-tenant configuration file.
 /// Full path: `tenants/<tenant_id>/config`.
 pub(crate) const TENANT_CONFIG_NAME: &str = "config";
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -525,15 +525,6 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "pageserver_standby_horizon",
-        "Standby apply LSN for which GC is hold off, by timeline.",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_resident_physical_size",
@@ -594,15 +585,6 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define current logical size metric")
 });

-static AUX_FILE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "pageserver_aux_file_estimated_size",
-        "The size of all aux files for a timeline in aux file v2 store.",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) mod initial_logical_size {
    use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
    use once_cell::sync::Lazy;
@@ -1867,6 +1849,7 @@ pub(crate) struct WalIngestMetrics {
    pub(crate) records_received: IntCounter,
    pub(crate) records_committed: IntCounter,
    pub(crate) records_filtered: IntCounter,
+    pub(crate) time_spent_on_ingest: Histogram,
 }

 pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
@@ -1890,6 +1873,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
        "Number of WAL records filtered out due to sharding"
    )
    .expect("failed to define a metric"),
+    time_spent_on_ingest: register_histogram!(
+        "pageserver_wal_ingest_put_value_seconds",
+        "Actual time spent on ingesting a record",
+        redo_histogram_time_buckets!(),
+    )
+    .expect("failed to define a metric"),
 });

 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
@@ -2001,6 +1990,29 @@ impl Default for WalRedoProcessCounters {
 pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
    Lazy::new(WalRedoProcessCounters::default);

+#[cfg(not(test))]
+pub mod wal_redo {
+    use super::*;
+
+    static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
+        let gauge_vec = register_uint_gauge_vec!(
+            "pageserver_wal_redo_process_kind",
+            "The configured process kind for walredo",
+            &["kind"],
+        )
+        .expect("failed to define a metric");
+        std::sync::Mutex::new(gauge_vec)
+    });
+
+    /// Clear the gauge vec, then set the gauge labelled with `kind` to 1.
+    pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
+        // hold the lock across both steps so concurrent callers cannot interleave them
+        let guard = PROCESS_KIND.lock().unwrap();
+        guard.reset();
+        guard.with_label_values(&[&kind.to_string()]).set(1);
+    }
+}
+
 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
 pub(crate) struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
@@ -2100,11 +2112,9 @@ pub(crate) struct TimelineMetrics {
    pub garbage_collect_histo: StorageTimeMetrics,
    pub find_gc_cutoffs_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
-    pub standby_horizon_gauge: IntGauge,
-    pub resident_physical_size_gauge: UIntGauge,
+    resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
-    pub aux_file_size_gauge: IntGauge,
    pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
    pub evictions: IntCounter,
    pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
@@ -2170,9 +2180,6 @@ impl TimelineMetrics {
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
-        let standby_horizon_gauge = STANDBY_HORIZON
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -2180,9 +2187,6 @@ impl TimelineMetrics {
        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
-        let aux_file_size_gauge = AUX_FILE_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
        let directory_entries_count_gauge_closure = {
            let tenant_shard_id = *tenant_shard_id;
@@ -2218,10 +2222,8 @@ impl TimelineMetrics {
            find_gc_cutoffs_histo,
            load_layer_map_histo,
            last_record_gauge,
-            standby_horizon_gauge,
            resident_physical_size_gauge,
            current_logical_size_gauge,
-            aux_file_size_gauge,
            directory_entries_count_gauge,
            evictions,
            evictions_with_low_residence_duration: std::sync::RwLock::new(
@@ -2253,7 +2255,6 @@ impl TimelineMetrics {
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
-        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
@@ -2263,7 +2264,6 @@ impl TimelineMetrics {
            let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
        let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
-        let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);

        self.evictions_with_low_residence_duration
            .write()
@@ -2320,7 +2320,6 @@ use pin_project_lite::pin_project;
 use std::collections::HashMap;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
-use std::sync::atomic::AtomicU64;
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::{Duration, Instant};
@@ -2330,35 +2329,35 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;

 /// Maintain a per timeline gauge in addition to the global gauge.
-pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
-    last_set: AtomicU64,
+struct PerTimelineRemotePhysicalSizeGauge {
+    last_set: u64,
    gauge: UIntGauge,
 }

 impl PerTimelineRemotePhysicalSizeGauge {
    fn new(per_timeline_gauge: UIntGauge) -> Self {
        Self {
-            last_set: AtomicU64::new(0),
+            last_set: per_timeline_gauge.get(),
            gauge: per_timeline_gauge,
        }
    }
-    pub(crate) fn set(&self, sz: u64) {
+    fn set(&mut self, sz: u64) {
        self.gauge.set(sz);
-        let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed);
-        if sz < prev {
-            REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz);
+        if sz < self.last_set {
+            REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
        } else {
-            REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev);
+            REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
        };
+        self.last_set = sz;
    }
-    pub(crate) fn get(&self) -> u64 {
+    fn get(&self) -> u64 {
        self.gauge.get()
    }
 }

 impl Drop for PerTimelineRemotePhysicalSizeGauge {
    fn drop(&mut self) {
-        REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed));
+        REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
    }
 }

@@ -2366,7 +2365,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
    tenant_id: String,
    shard_id: String,
    timeline_id: String,
-    pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge,
+    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
    calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -2374,27 +2373,38 @@ pub(crate) struct RemoteTimelineClientMetrics {

 impl RemoteTimelineClientMetrics {
    pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
-        let tenant_id_str = tenant_shard_id.tenant_id.to_string();
-        let shard_id_str = format!("{}", tenant_shard_id.shard_slug());
-        let timeline_id_str = timeline_id.to_string();
-
-        let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new(
-            REMOTE_PHYSICAL_SIZE
-                .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
-                .unwrap(),
-        );
-
        RemoteTimelineClientMetrics {
-            tenant_id: tenant_id_str,
-            shard_id: shard_id_str,
-            timeline_id: timeline_id_str,
+            tenant_id: tenant_shard_id.tenant_id.to_string(),
+            shard_id: format!("{}", tenant_shard_id.shard_slug()),
+            timeline_id: timeline_id.to_string(),
            calls: Mutex::new(HashMap::default()),
            bytes_started_counter: Mutex::new(HashMap::default()),
            bytes_finished_counter: Mutex::new(HashMap::default()),
-            remote_physical_size_gauge,
+            remote_physical_size_gauge: Mutex::new(None),
        }
    }

+    /// Record `sz` as this timeline's remote physical size; the per-timeline gauge is
+    /// created lazily on the first call and reused afterwards.
+    pub(crate) fn remote_physical_size_set(&self, sz: u64) {
+        let labels: [&str; 3] = [&self.tenant_id, &self.shard_id, &self.timeline_id];
+        let mut slot = self.remote_physical_size_gauge.lock().unwrap();
+        // the slot starts out as `None`; the labelled gauge is looked up only to fill it
+        slot.get_or_insert_with(|| {
+            let labelled = REMOTE_PHYSICAL_SIZE
+                .get_metric_with_label_values(&labels)
+                .unwrap();
+            PerTimelineRemotePhysicalSizeGauge::new(labelled)
+        })
+        .set(sz);
+    }
+
+    /// Current value of the per-timeline gauge, or 0 if no size has been set yet.
+    pub(crate) fn remote_physical_size_get(&self) -> u64 {
+        let slot = self.remote_physical_size_gauge.lock().unwrap();
+        slot.as_ref().map_or(0, |gauge| gauge.get())
+    }
+
    pub fn remote_operation_time(
        &self,
        file_kind: &RemoteOpFileKind,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -19,7 +19,6 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::ShardIndex;
 use pageserver_api::shard::ShardNumber;
-use pageserver_api::shard::TenantShardId;
 use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
@@ -33,8 +32,6 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
-use std::time::Instant;
-use std::time::SystemTime;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
@@ -52,6 +49,7 @@ use utils::{
 use crate::auth::check_permission;
 use crate::basebackup;
 use crate::basebackup::BasebackupError;
+use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
@@ -61,15 +59,13 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::mgr;
+use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
-use crate::tenant::mgr::GetTenantError;
-use crate::tenant::mgr::ShardResolveResult;
 use crate::tenant::mgr::ShardSelector;
-use crate::tenant::mgr::TenantManager;
 use crate::tenant::timeline::WaitLsnError;
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
-use crate::tenant::Tenant;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;
 use pageserver_api::key::rel_block_to_key;
@@ -139,7 +135,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 /// Listens for connections, and launches a new handler task for each.
 ///
 pub async fn libpq_listener_main(
-    tenant_manager: Arc<TenantManager>,
+    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    listener: TcpListener,
@@ -184,7 +180,7 @@ pub async fn libpq_listener_main(
                    "serving compute connection task",
                    false,
                    page_service_conn_main(
-                        tenant_manager.clone(),
+                        conf,
                        broker_client.clone(),
                        local_auth,
                        socket,
@@ -207,7 +203,7 @@ pub async fn libpq_listener_main(

 #[instrument(skip_all, fields(peer_addr))]
 async fn page_service_conn_main(
-    tenant_manager: Arc<TenantManager>,
+    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
@@ -260,14 +256,11 @@ async fn page_service_conn_main(
    socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
    let socket = std::pin::pin!(socket);

-    fail::fail_point!("ps::connection-start::pre-login");
-
    // XXX: pgbackend.run() should take the connection_ctx,
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler =
-        PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx);
+    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
@@ -298,12 +291,11 @@ struct HandlerTimeline {
 }

 struct PageServerHandler {
+    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

-    tenant_manager: Arc<TenantManager>,
-
    /// The context created for the lifetime of the connection
    /// services by this PageServerHandler.
    /// For each query received over the connection,
@@ -389,13 +381,13 @@ impl From<WaitLsnError> for QueryError {

 impl PageServerHandler {
    pub fn new(
-        tenant_manager: Arc<TenantManager>,
+        conf: &'static PageServerConf,
        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<SwappableJwtAuth>>,
        connection_ctx: RequestContext,
    ) -> Self {
        PageServerHandler {
-            tenant_manager,
+            _conf: conf,
            broker_client,
            auth,
            claims: None,
@@ -560,9 +552,13 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();

-        let tenant = self
-            .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
-            .await?;
+        let tenant = mgr::get_active_tenant_with_timeout(
+            tenant_id,
+            ShardSelector::First,
+            ACTIVE_TENANT_TIMEOUT,
+            &task_mgr::shutdown_token(),
+        )
+        .await?;

        // Make request tracer if needed
        let mut tracer = if tenant.get_trace_read_requests() {
@@ -605,7 +601,6 @@ impl PageServerHandler {
            };

            trace!("query: {copy_data_bytes:?}");
-            fail::fail_point!("ps::handle-pagerequest-message");

            // Trace request if needed
            if let Some(t) = tracer.as_mut() {
@@ -620,7 +615,6 @@ impl PageServerHandler {

            let (response, span) = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::exists");
                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
@@ -630,7 +624,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::Nblocks(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::nblocks");
                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
@@ -640,7 +633,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetPage(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::getpage");
                    // shard_id is filled in by the handler
                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
                    (
@@ -651,7 +643,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::DbSize(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::dbsize");
                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
                    (
                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
@@ -661,7 +652,6 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetSlruSegment(req) => {
-                    fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
                    (
                        self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
@@ -736,9 +726,13 @@ impl PageServerHandler {

        // Create empty timeline
        info!("creating new timeline");
-        let tenant = self
-            .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT)
-            .await?;
+        let tenant = get_active_tenant_with_timeout(
+            tenant_id,
+            ShardSelector::Zero,
+            ACTIVE_TENANT_TIMEOUT,
+            &task_mgr::shutdown_token(),
+        )
+        .await?;
        let timeline = tenant
            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
            .await?;
@@ -915,39 +909,6 @@ impl PageServerHandler {
        }
    }

-    #[instrument(skip_all, fields(shard_id, %lsn))]
-    async fn handle_make_lsn_lease<IO>(
-        &self,
-        pgb: &mut PostgresBackend<IO>,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<(), QueryError>
-    where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
-    {
-        let shard_selector = ShardSelector::Known(tenant_shard_id.to_index());
-        let timeline = self
-            .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
-            .await?;
-        let lease = timeline.make_lsn_lease(lsn, ctx)?;
-        let valid_until = lease
-            .valid_until
-            .duration_since(SystemTime::UNIX_EPOCH)
-            .map_err(|e| QueryError::Other(e.into()))?;
-
-        pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
-            b"valid_until",
-        )]))?
-        .write_message_noflush(&BeMessage::DataRow(&[Some(
-            &valid_until.as_millis().to_be_bytes(),
-        )]))?
-        .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-
-        Ok(())
-    }
-
    #[instrument(skip_all, fields(shard_id))]
    async fn handle_get_rel_exists_request(
        &mut self,
@@ -1409,69 +1370,18 @@ impl PageServerHandler {
        timeline_id: TimelineId,
        selector: ShardSelector,
    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
-        let tenant = self
-            .get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT)
-            .await
-            .map_err(GetActiveTimelineError::Tenant)?;
+        let tenant = get_active_tenant_with_timeout(
+            tenant_id,
+            selector,
+            ACTIVE_TENANT_TIMEOUT,
+            &task_mgr::shutdown_token(),
+        )
+        .await
+        .map_err(GetActiveTimelineError::Tenant)?;
        let timeline = tenant.get_timeline(timeline_id, true)?;
        set_tracing_field_shard_id(&timeline);
        Ok(timeline)
    }
-
-    /// Get a shard's [`Tenant`] in its active state, if present.  If we don't find the shard and some
-    /// slots for this tenant are `InProgress` then we will wait.
-    /// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait.
-    ///
-    /// `timeout` is used as a total timeout for the whole wait operation.
-    async fn get_active_tenant_with_timeout(
-        &self,
-        tenant_id: TenantId,
-        shard_selector: ShardSelector,
-        timeout: Duration,
-    ) -> Result<Arc<Tenant>, GetActiveTenantError> {
-        let wait_start = Instant::now();
-        let deadline = wait_start + timeout;
-
-        // Resolve TenantId to TenantShardId.  This is usually a quick one-shot thing, the loop is
-        // for handling the rare case that the slot we're accessing is InProgress.
-        let tenant_shard = loop {
-            let resolved = self
-                .tenant_manager
-                .resolve_attached_shard(&tenant_id, shard_selector);
-            match resolved {
-                ShardResolveResult::Found(tenant_shard) => break tenant_shard,
-                ShardResolveResult::NotFound => {
-                    return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
-                        tenant_id,
-                    )));
-                }
-                ShardResolveResult::InProgress(barrier) => {
-                    // We can't authoritatively answer right now: wait for InProgress state
-                    // to end, then try again
-                    tokio::select! {
-                        _ = self.await_connection_cancelled() => {
-                            return Err(GetActiveTenantError::Cancelled)
-                        },
-                        _  = barrier.wait() => {
-                            // The barrier completed: proceed around the loop to try looking up again
-                        },
-                        _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
-                            return Err(GetActiveTenantError::WaitForActiveTimeout {
-                                latest_state: None,
-                                wait_time: timeout,
-                            });
-                        }
-                    }
-                }
-            };
-        };
-
-        tracing::debug!("Waiting for tenant to enter active state...");
-        tenant_shard
-            .wait_to_become_active(deadline.duration_since(Instant::now()))
-            .await?;
-        Ok(tenant_shard)
-    }
 }

 #[async_trait::async_trait]
@@ -1513,7 +1423,6 @@ where
        _pgb: &mut PostgresBackend<IO>,
        _sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
-        fail::fail_point!("ps::connection-start::startup-packet");
        Ok(())
    }

@@ -1528,12 +1437,11 @@ where
            Err(QueryError::SimulatedConnectionError)
        });

-        fail::fail_point!("ps::connection-start::process-query");
-
        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
-        let parts = query_string.split_whitespace().collect::<Vec<_>>();
-        if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
+        if query_string.starts_with("pagestream_v2 ") {
+            let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
+            let params = params_raw.split(' ').collect::<Vec<_>>();
            if params.len() != 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for pagestream command"
@@ -1558,7 +1466,9 @@ where
                ctx,
            )
            .await?;
-        } else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
+        } else if query_string.starts_with("pagestream ") {
+            let (_, params_raw) = query_string.split_at("pagestream ".len());
+            let params = params_raw.split(' ').collect::<Vec<_>>();
            if params.len() != 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for pagestream command"
@@ -1583,7 +1493,10 @@ where
                ctx,
            )
            .await?;
-        } else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
+        } else if query_string.starts_with("basebackup ") {
+            let (_, params_raw) = query_string.split_at("basebackup ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+
            if params.len() < 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for basebackup command"
@@ -1601,23 +1514,26 @@ where

            self.check_permission(Some(tenant_id))?;

-            let lsn = if let Some(lsn_str) = params.get(2) {
+            let lsn = if params.len() >= 3 {
                Some(
-                    Lsn::from_str(lsn_str)
-                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
+                    Lsn::from_str(params[2])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
                )
            } else {
                None
            };

-            let gzip = match params.get(3) {
-                Some(&"--gzip") => true,
-                None => false,
-                Some(third_param) => {
+            let gzip = if params.len() >= 4 {
+                if params[3] == "--gzip" {
+                    true
+                } else {
                    return Err(QueryError::Other(anyhow::anyhow!(
-                        "Parameter in position 3 unknown {third_param}",
-                    )))
+                        "Parameter in position 3 unknown {}",
+                        params[3],
+                    )));
                }
+            } else {
+                false
            };

            let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
@@ -1641,7 +1557,10 @@ where
            res?;
        }
        // return pair of prev_lsn and last_lsn
-        else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) {
+        else if query_string.starts_with("get_last_record_rlsn ") {
+            let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+
            if params.len() != 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for get_last_record_rlsn command"
@@ -1683,7 +1602,10 @@ where
            .await?;
        }
        // same as basebackup, but result includes relational data as well
-        else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
+        else if query_string.starts_with("fullbackup ") {
+            let (_, params_raw) = query_string.split_at("fullbackup ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
+
            if params.len() < 2 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for fullbackup command"
@@ -1700,18 +1622,18 @@ where
                .record("timeline_id", field::display(timeline_id));

            // The caller is responsible for providing correct lsn and prev_lsn.
-            let lsn = if let Some(lsn_str) = params.get(2) {
+            let lsn = if params.len() > 2 {
                Some(
-                    Lsn::from_str(lsn_str)
-                        .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
+                    Lsn::from_str(params[2])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
                )
            } else {
                None
            };
-            let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
+            let prev_lsn = if params.len() > 3 {
                Some(
-                    Lsn::from_str(prev_lsn_str)
-                        .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
+                    Lsn::from_str(params[3])
+                        .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?,
                )
            } else {
                None
@@ -1744,7 +1666,8 @@ where
            // 2. Run:
            // cat my_backup/base.tar | psql -h $PAGESERVER \
            //     -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
-            let params = &parts[2..];
+            let (_, params_raw) = query_string.split_at("import basebackup ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
            if params.len() != 5 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for import basebackup command"
@@ -1793,7 +1716,8 @@ where
            //
            // Files are scheduled to be persisted to remote storage, and the
            // caller should poll the http api to check when that is done.
-            let params = &parts[2..];
+            let (_, params_raw) = query_string.split_at("import wal ".len());
+            let params = params_raw.split_whitespace().collect::<Vec<_>>();
            if params.len() != 4 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for import wal command"
@@ -1831,45 +1755,10 @@ where
            // important because psycopg2 executes "SET datestyle TO 'ISO'"
            // on connect
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
-        } else if query_string.starts_with("lease lsn ") {
-            let params = &parts[2..];
-            if params.len() != 3 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number {} for lease lsn command",
-                    params.len()
-                )));
-            }
-
-            let tenant_shard_id = TenantShardId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_shard_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_shard_id.tenant_id))?;
-
-            // The caller is responsible for providing correct lsn.
-            let lsn = Lsn::from_str(params[2])
-                .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
-
-            match self
-                .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
-                .await
-            {
-                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
-                Err(e) => {
-                    error!("error obtaining lsn lease for {lsn}: {e:?}");
-                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
-                        &e.to_string(),
-                        Some(e.pg_error_code()),
-                    ))?
-                }
-            };
-        } else if let Some(params) = parts.strip_prefix(&["show"]) {
+        } else if query_string.starts_with("show ") {
            // show <tenant_id>
+            let (_, params_raw) = query_string.split_at("show ".len());
+            let params = params_raw.split(' ').collect::<Vec<_>>();
            if params.len() != 1 {
                return Err(QueryError::Other(anyhow::anyhow!(
                    "invalid param number for config command"
@@ -1882,13 +1771,13 @@ where

            self.check_permission(Some(tenant_id))?;

-            let tenant = self
-                .get_active_tenant_with_timeout(
-                    tenant_id,
-                    ShardSelector::Zero,
-                    ACTIVE_TENANT_TIMEOUT,
-                )
-                .await?;
+            let tenant = get_active_tenant_with_timeout(
+                tenant_id,
+                ShardSelector::Zero,
+                ACTIVE_TENANT_TIMEOUT,
+                &task_mgr::shutdown_token(),
+            )
+            .await?;
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"checkpoint_distance"),
                RowDescriptor::int8_col(b"checkpoint_timeout"),
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,6 +9,7 @@
 use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
+use crate::metrics::WAL_INGEST;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
 use crate::{aux_file, repository::*};
@@ -34,16 +35,12 @@ use std::ops::ControlFlow;
 use std::ops::Range;
 use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, info, trace, warn};
+use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};

-/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
-pub const MAX_AUX_FILE_DELTAS: usize = 1024;
-
-/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
-pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
+const MAX_AUX_FILE_DELTAS: usize = 1024;

 #[derive(Debug)]
 pub enum LsnForTimestamp {
@@ -702,17 +699,13 @@ impl Timeline {
            .await
            .context("scan")?;
        let mut result = HashMap::new();
-        let mut sz = 0;
        for (_, v) in kv {
            let v = v.context("get value")?;
            let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
            for (fname, content) in v {
-                sz += fname.len();
-                sz += content.len();
                result.insert(fname, content);
            }
        }
-        self.aux_file_size_estimator.on_base_backup(sz);
        Ok(result)
    }

@@ -721,11 +714,10 @@ impl Timeline {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
-        let current_policy = self.last_aux_file_policy.load();
-        match current_policy {
-            Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await,
-            Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await,
-            Some(AuxFilePolicy::CrossValidation) => {
+        match self.get_switch_aux_file_policy() {
+            AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
+            AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
+            AuxFilePolicy::CrossValidation => {
                let v1_result = self.list_aux_files_v1(lsn, ctx).await;
                let v2_result = self.list_aux_files_v2(lsn, ctx).await;
                match (v1_result, v2_result) {
@@ -1473,40 +1465,7 @@ impl<'a> DatadirModification<'a> {
        content: &[u8],
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let switch_policy = self.tline.get_switch_aux_file_policy();
-
-        let policy = {
-            let current_policy = self.tline.last_aux_file_policy.load();
-            // Allowed switch path:
-            // * no aux files -> v1/v2/cross-validation
-            // * cross-validation->v2
-
-            let current_policy = if current_policy.is_none() {
-                // This path will only be hit once per tenant: we will decide the final policy in this code block.
-                // The next call to `put_file` will always have `last_aux_file_policy != None`.
-                let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
-                let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?;
-                if aux_files_key_v1.is_empty() {
-                    None
-                } else {
-                    self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?;
-                    Some(AuxFilePolicy::V1)
-                }
-            } else {
-                current_policy
-            };
-
-            if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) {
-                self.tline.do_switch_aux_policy(switch_policy)?;
-                info!(current=?current_policy, next=?switch_policy, "switching aux file policy");
-                switch_policy
-            } else {
-                // This branch handles non-valid migration path, and the case that switch_policy == current_policy.
-                // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit.
-                current_policy.unwrap_or(AuxFilePolicy::default_tenant_config())
-            }
-        };
-
+        let policy = self.tline.get_switch_aux_file_policy();
        if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
            let key = aux_file::encode_aux_file_key(path);
            // retrieve the key from the engine
@@ -1515,45 +1474,23 @@ impl<'a> DatadirModification<'a> {
                Err(PageReconstructError::MissingKey(_)) => None,
                Err(e) => return Err(e.into()),
            };
-            let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
+            let files = if let Some(ref old_val) = old_val {
                aux_file::decode_file_value(old_val)?
            } else {
                Vec::new()
            };
-            let mut other_files = Vec::with_capacity(files.len());
-            let mut modifying_file = None;
-            for file @ (p, content) in files {
-                if path == p {
-                    assert!(
-                        modifying_file.is_none(),
-                        "duplicated entries found for {}",
-                        path
-                    );
-                    modifying_file = Some(content);
-                } else {
-                    other_files.push(file);
-                }
-            }
-            let mut new_files = other_files;
-            match (modifying_file, content.is_empty()) {
-                (Some(old_content), false) => {
-                    self.tline
-                        .aux_file_size_estimator
-                        .on_update(old_content.len(), content.len());
-                    new_files.push((path, content));
-                }
-                (Some(old_content), true) => {
-                    self.tline
-                        .aux_file_size_estimator
-                        .on_remove(old_content.len());
-                    // not adding the file key to the final `new_files` vec.
-                }
-                (None, false) => {
-                    self.tline.aux_file_size_estimator.on_add(content.len());
-                    new_files.push((path, content));
-                }
-                (None, true) => anyhow::bail!("removing non-existing aux file: {}", path),
-            }
+            let new_files = if content.is_empty() {
+                files
+                    .into_iter()
+                    .filter(|(p, _)| &path != p)
+                    .collect::<Vec<_>>()
+            } else {
+                files
+                    .into_iter()
+                    .filter(|(p, _)| &path != p)
+                    .chain(std::iter::once((path, content)))
+                    .collect::<Vec<_>>()
+            };
            let new_val = aux_file::encode_file_value(&new_files)?;
            self.put(key, Value::Image(new_val.into()));
        }
@@ -1714,6 +1651,8 @@ impl<'a> DatadirModification<'a> {
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        let mut writer = self.tline.writer().await;

+        let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
+
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

@@ -1753,6 +1692,8 @@ impl<'a> DatadirModification<'a> {
            writer.update_directory_entries_count(kind, count as u64);
        }

+        timer.observe_duration();
+
        Ok(())
    }

@@ -1788,12 +1729,6 @@ impl<'a> DatadirModification<'a> {
        self.tline.get(key, lsn, ctx).await
    }

-    /// Only used during unit tests, force putting a key into the modification.
-    #[cfg(test)]
-    pub(crate) fn put_for_test(&mut self, key: Key, val: Value) {
-        self.put(key, val);
-    }
-
    fn put(&mut self, key: Key, val: Value) {
        let values = self.pending_updates.entry(key).or_default();
        // Replace the previous value if it exists at the same lsn
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -238,13 +238,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                        io_buf,
                        Err(Error::new(
                            ErrorKind::Other,
-                            format!("blob too large ({len} bytes)"),
+                            format!("blob too large ({} bytes)", len),
                        )),
                    );
                }
-                if len > 0x0fff_ffff {
-                    tracing::warn!("writing blob above future limit ({len} bytes)");
-                }
                let mut len_buf = (len as u32).to_be_bytes();
                len_buf[0] |= 0x80;
                io_buf.extend_from_slice(&len_buf[..]);
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,7 +11,6 @@
 use anyhow::bail;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::CompactionAlgorithm;
-use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
@@ -40,6 +39,8 @@ pub mod defaults {

    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
+    pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
+        super::CompactionAlgorithm::Legacy;

    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;

@@ -319,7 +320,7 @@ pub struct TenantConf {
    pub compaction_period: Duration,
    // Level0 delta layer threshold for compaction.
    pub compaction_threshold: usize,
-    pub compaction_algorithm: CompactionAlgorithmSettings,
+    pub compaction_algorithm: CompactionAlgorithm,
    // Determines how much history is retained, to allow
    // branching and read replicas at an older point in time.
    // The unit is #of bytes of WAL.
@@ -372,8 +373,6 @@ pub struct TenantConf {

    /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
-    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
-    /// file is written.
    pub switch_aux_file_policy: AuxFilePolicy,
 }

@@ -405,7 +404,7 @@ pub struct TenantConfOpt {

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
-    pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
+    pub compaction_algorithm: Option<CompactionAlgorithm>,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
@@ -496,9 +495,7 @@ impl TenantConfOpt {
                .unwrap_or(global_conf.compaction_threshold),
            compaction_algorithm: self
                .compaction_algorithm
-                .as_ref()
-                .unwrap_or(&global_conf.compaction_algorithm)
-                .clone(),
+                .unwrap_or(global_conf.compaction_algorithm),
            gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
            gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
            image_creation_threshold: self
@@ -551,15 +548,7 @@ impl Default for TenantConf {
            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
                .expect("cannot parse default compaction period"),
            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
-            compaction_algorithm: CompactionAlgorithmSettings {
-                kind: if cfg!(test) {
-                    // Rust tests rely on a valid implicit default (TODO: fix this)
-                    CompactionAlgorithm::Legacy
-                } else {
-                    // Python tests are subject to NotSpecified handling
-                    CompactionAlgorithm::NotSpecified
-                },
-            },
+            compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
            gc_horizon: DEFAULT_GC_HORIZON,
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                .expect("cannot parse default gc period"),
@@ -585,7 +574,7 @@ impl Default for TenantConf {
            lazy_slru_download: false,
            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
-            switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
+            switch_aux_file_policy: AuxFilePolicy::V1,
        }
    }
 }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -8,7 +8,7 @@ use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, instrument, Instrument};

-use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
+use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};

 use crate::{
    config::PageServerConf,
@@ -181,23 +181,25 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del

 async fn remove_tenant_remote_delete_mark(
    conf: &PageServerConf,
-    remote_storage: &GenericRemoteStorage,
+    remote_storage: Option<&GenericRemoteStorage>,
    tenant_shard_id: &TenantShardId,
    cancel: &CancellationToken,
 ) -> Result<(), DeleteTenantError> {
-    let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
-    backoff::retry(
-        || async { remote_storage.delete(&path, cancel).await },
-        TimeoutOrCancel::caused_by_cancel,
-        FAILED_UPLOAD_WARN_THRESHOLD,
-        FAILED_REMOTE_OP_RETRIES,
-        "remove_tenant_remote_delete_mark",
-        cancel,
-    )
-    .await
-    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
-    .and_then(|x| x)
-    .context("remove_tenant_remote_delete_mark")?;
+    if let Some(remote_storage) = remote_storage {
+        let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
+        backoff::retry(
+            || async { remote_storage.delete(&path, cancel).await },
+            TimeoutOrCancel::caused_by_cancel,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "remove_tenant_remote_delete_mark",
+            cancel,
+        )
+        .await
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
+        .and_then(|x| x)
+        .context("remove_tenant_remote_delete_mark")?;
+    }
    Ok(())
 }

@@ -295,7 +297,7 @@ impl DeleteTenantFlow {
    #[instrument(skip_all)]
    pub(crate) async fn run(
        conf: &'static PageServerConf,
-        remote_storage: GenericRemoteStorage,
+        remote_storage: Option<GenericRemoteStorage>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
        cancel: &CancellationToken,
@@ -306,7 +308,9 @@ impl DeleteTenantFlow {

        let mut guard = Self::prepare(&tenant).await?;

-        if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
+        if let Err(e) =
+            Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
+        {
            tenant.set_broken(format!("{e:#}")).await;
            return Err(e);
        }
@@ -323,7 +327,7 @@ impl DeleteTenantFlow {
    async fn run_inner(
        guard: &mut OwnedMutexGuard<Self>,
        conf: &'static PageServerConf,
-        remote_storage: &GenericRemoteStorage,
+        remote_storage: Option<&GenericRemoteStorage>,
        tenant: &Tenant,
        cancel: &CancellationToken,
    ) -> Result<(), DeleteTenantError> {
@@ -335,9 +339,14 @@ impl DeleteTenantFlow {
            ))?
        });

-        create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
-            .await
-            .context("remote_mark")?;
+        // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so won't contend.
+        // Though sounds scary, different mark name?
+        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
+        if let Some(remote_storage) = &remote_storage {
+            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
+                .await
+                .context("remote_mark")?
+        }

        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
            Err(anyhow::anyhow!(
@@ -474,7 +483,7 @@ impl DeleteTenantFlow {
    fn schedule_background(
        guard: OwnedMutexGuard<Self>,
        conf: &'static PageServerConf,
-        remote_storage: GenericRemoteStorage,
+        remote_storage: Option<GenericRemoteStorage>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
    ) {
@@ -503,7 +512,7 @@ impl DeleteTenantFlow {
    async fn background(
        mut guard: OwnedMutexGuard<Self>,
        conf: &PageServerConf,
-        remote_storage: GenericRemoteStorage,
+        remote_storage: Option<GenericRemoteStorage>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: &Arc<Tenant>,
    ) -> Result<(), DeleteTenantError> {
@@ -542,7 +551,7 @@ impl DeleteTenantFlow {

        remove_tenant_remote_delete_mark(
            conf,
-            &remote_storage,
+            remote_storage.as_ref(),
            &tenant.tenant_shard_id,
            &task_mgr::shutdown_token(),
        )
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -7,7 +7,7 @@ use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
 use pageserver_api::shard::{
-    ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId,
+    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
 };
 use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
@@ -16,9 +16,10 @@ use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashMap};
 use std::ops::Deref;
 use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 use sysinfo::SystemExt;
 use tokio::fs;
+use utils::timeout::{timeout_cancellable, TimeoutCancellableError};

 use anyhow::Context;
 use once_cell::sync::Lazy;
@@ -46,7 +47,7 @@ use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
 use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
-use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext::PathExt;
@@ -118,7 +119,6 @@ pub(crate) enum TenantsMapRemoveResult {

 /// When resolving a TenantId to a shard, we may be looking for the 0th
 /// shard, or we might be looking for whichever shard holds a particular page.
-#[derive(Copy, Clone)]
 pub(crate) enum ShardSelector {
    /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
    /// ignore it.
@@ -127,8 +127,6 @@ pub(crate) enum ShardSelector {
    First,
    /// Pick the shard that holds this key
    Page(Key),
-    /// The shard ID is known: pick the given shard
-    Known(ShardIndex),
 }

 /// A convenience for use with the re_attach ControlPlaneClient function: rather
@@ -171,14 +169,6 @@ impl TenantStartupMode {
    }
 }

-/// Result type for looking up a TenantId to a specific shard
-pub(crate) enum ShardResolveResult {
-    NotFound,
-    Found(Arc<Tenant>),
-    // Wait for this barrrier, then query again
-    InProgress(utils::completion::Barrier),
-}
-
 impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
@@ -192,6 +182,51 @@ impl TenantsMap {
        }
    }

+    /// Resolve a bare TenantId, as sent by a page service client, to the TenantShardId of an
+    /// attached shard chosen by `selector`. Returns None if no attached shard qualifies.
+    fn resolve_attached_shard(
+        &self,
+        tenant_id: &TenantId,
+        selector: ShardSelector,
+    ) -> Option<TenantShardId> {
+        let mut want_shard = None;
+        match self {
+            TenantsMap::Initializing => None,
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
+                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
+                    // Skip every slot that does not hold an attached tenant
+                    let tenant = match &slot.1 {
+                        TenantSlot::Attached(t) => t,
+                        _ => continue,
+                    };
+
+                    match selector {
+                        ShardSelector::First => return Some(*slot.0),
+                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
+                            return Some(*slot.0)
+                        }
+                        ShardSelector::Page(key) => {
+                            // On the first attached slot seen for this tenant, compute the shard
+                            // number expected to hold the key once and cache it in `want_shard`,
+                            // so later slots are compared against it without hashing again.
+                            if want_shard.is_none() {
+                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                            }
+
+                            if Some(tenant.shard_identity.number) == want_shard {
+                                return Some(*slot.0);
+                            }
+                        }
+                        _ => continue, // ShardSelector::Zero, but this slot is not shard 0
+                    }
+                }
+
+                // Fall through: no attached slot satisfied the selector
+                None
+            }
+        }
+    }
+
    /// Only for use from DeleteTenantFlow.  This method directly removes a TenantSlot from the map.
    ///
    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
@@ -356,17 +391,22 @@ async fn init_load_generations(
    // deletion list entries may still be valid.  We provide that by pushing a recovery operation into
    // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions
    // are processed, even though we don't block on recovery completing here.
-    let attached_tenants = generations
-        .iter()
-        .flat_map(|(id, start_mode)| {
-            match start_mode {
-                TenantStartupMode::Attached((_mode, generation)) => Some(generation),
-                TenantStartupMode::Secondary => None,
-            }
-            .map(|gen| (*id, *gen))
-        })
-        .collect();
-    resources.deletion_queue_client.recover(attached_tenants)?;
+    //
+    // Must only do this if remote storage is enabled, otherwise deletion queue
+    // is not running and channel push will fail.
+    if resources.remote_storage.is_some() {
+        let attached_tenants = generations
+            .iter()
+            .flat_map(|(id, start_mode)| {
+                match start_mode {
+                    TenantStartupMode::Attached((_mode, generation)) => Some(generation),
+                    TenantStartupMode::Secondary => None,
+                }
+                .map(|gen| (*id, *gen))
+            })
+            .collect();
+        resources.deletion_queue_client.recover(attached_tenants)?;
+    }

    Ok(Some(generations))
 }
@@ -420,6 +460,53 @@ fn load_tenant_config(
        }
    };

+    // Clean up legacy `metadata` files.
+    // Doing it here because every single tenant directory is visited here.
+    // In any later code, there's different treatment of tenant dirs
+    // ... depending on whether the tenant is in re-attach response or not
+    // ... depending on whether the tenant is ignored or not
+    assert_eq!(
+        &conf.tenant_path(&tenant_shard_id),
+        &tenant_dir_path,
+        "later use of conf....path() methods would be dubious"
+    );
+    let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
+        Ok(iter) => {
+            let mut timelines = Vec::new();
+            for res in iter {
+                let p = res?;
+                let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
+                    // skip any entries that aren't TimelineId, such as
+                    // - *.___temp dirs
+                    // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
+                    continue;
+                };
+                timelines.push(timeline_id);
+            }
+            timelines
+        }
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
+        Err(e) => return Err(anyhow::anyhow!(e)),
+    };
+    for timeline_id in timelines {
+        let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
+        let metadata_path = timeline_path.join(METADATA_FILE_NAME);
+        match std::fs::remove_file(&metadata_path) {
+            Ok(()) => {
+                crashsafe::fsync(timeline_path)
+                    .context("fsync timeline dir after removing legacy metadata file")?;
+                info!("removed legacy metadata file at {metadata_path}");
+            }
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                // something removed the file earlier, or it was never there
+                // We don't care, this software version doesn't write it again, so, we're good.
+            }
+            Err(e) => {
+                anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
+            }
+        }
+    }
+
    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
    if tenant_ignore_mark_file.exists() {
        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
@@ -524,7 +611,6 @@ pub async fn init_tenant_mgr(
                    TenantSlot::Attached(Tenant::create_broken_tenant(
                        conf,
                        tenant_shard_id,
-                        resources.remote_storage.clone(),
                        format!("{}", e),
                    )),
                );
@@ -717,7 +803,6 @@ fn tenant_spawn(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

-    let remote_storage = resources.remote_storage.clone();
    let tenant = match Tenant::spawn(
        conf,
        tenant_shard_id,
@@ -732,7 +817,7 @@ fn tenant_spawn(
        Ok(tenant) => tenant,
        Err(e) => {
            error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
-            Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}"))
+            Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}"))
        }
    };

@@ -2018,77 +2103,6 @@ impl TenantManager {

        Ok(reparented)
    }
-
-    /// A page service client sends a TenantId, and to look up the correct Tenant we must
-    /// resolve this to a fully qualified TenantShardId.
-    ///
-    /// During shard splits: we shall see parent shards in InProgress state and skip them, and
-    /// instead match on child shards which should appear in Attached state.  Very early in a shard
-    /// split, or in other cases where a shard is InProgress, we will return our own InProgress result
-    /// to instruct the caller to wait for that to finish before querying again.
-    pub(crate) fn resolve_attached_shard(
-        &self,
-        tenant_id: &TenantId,
-        selector: ShardSelector,
-    ) -> ShardResolveResult {
-        let tenants = self.tenants.read().unwrap();
-        let mut want_shard = None;
-        let mut any_in_progress = None;
-
-        match &*tenants {
-            TenantsMap::Initializing => ShardResolveResult::NotFound,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
-                    // Ignore all slots that don't contain an attached tenant
-                    let tenant = match &slot.1 {
-                        TenantSlot::Attached(t) => t,
-                        TenantSlot::InProgress(barrier) => {
-                            // We might still find a usable shard, but in case we don't, remember that
-                            // we saw at least one InProgress slot, so that we can distinguish this case
-                            // from a simple NotFound in our return value.
-                            any_in_progress = Some(barrier.clone());
-                            continue;
-                        }
-                        _ => continue,
-                    };
-
-                    match selector {
-                        ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
-                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
-                            return ShardResolveResult::Found(tenant.clone())
-                        }
-                        ShardSelector::Page(key) => {
-                            // First slot we see for this tenant, calculate the expected shard number
-                            // for the key: we will use this for checking if this and subsequent
-                            // slots contain the key, rather than recalculating the hash each time.
-                            if want_shard.is_none() {
-                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                            }
-
-                            if Some(tenant.shard_identity.number) == want_shard {
-                                return ShardResolveResult::Found(tenant.clone());
-                            }
-                        }
-                        ShardSelector::Known(shard)
-                            if tenant.shard_identity.shard_index() == shard =>
-                        {
-                            return ShardResolveResult::Found(tenant.clone());
-                        }
-                        _ => continue,
-                    }
-                }
-
-                // Fall through: we didn't find a slot that was in Attached state & matched our selector.  If
-                // we found one or more InProgress slot, indicate to caller that they should retry later.  Otherwise
-                // this requested shard simply isn't found.
-                if let Some(barrier) = any_in_progress {
-                    ShardResolveResult::InProgress(barrier)
-                } else {
-                    ShardResolveResult::NotFound
-                }
-            }
-        }
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -2137,6 +2151,105 @@ pub(crate) enum GetActiveTenantError {
    Broken(String),
 }

+/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`]
+/// state, then wait for up to `timeout`.  If the [`Tenant`] is not currently in [`TenantState::Active`],
+/// then wait for up to `timeout` (minus however long we waited for the slot).
+pub(crate) async fn get_active_tenant_with_timeout(
+    tenant_id: TenantId,
+    shard_selector: ShardSelector,
+    timeout: Duration,
+    cancel: &CancellationToken,
+) -> Result<Arc<Tenant>, GetActiveTenantError> {
+    enum WaitFor {
+        Barrier(utils::completion::Barrier),
+        Tenant(Arc<Tenant>),
+    }
+
+    let wait_start = Instant::now();
+    let deadline = wait_start + timeout;
+
+    let (wait_for, tenant_shard_id) = {
+        let locked = TENANTS.read().unwrap();
+
+        // Resolve TenantId to TenantShardId
+        let tenant_shard_id = locked
+            .resolve_attached_shard(&tenant_id, shard_selector)
+            .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
+                tenant_id,
+            )))?;
+
+        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
+            .map_err(GetTenantError::MapState)?;
+        match peek_slot {
+            Some(TenantSlot::Attached(tenant)) => {
+                match tenant.current_state() {
+                    TenantState::Active => {
+                        // Fast path: we don't need to do any async waiting.
+                        return Ok(tenant.clone());
+                    }
+                    _ => {
+                        tenant.activate_now();
+                        (WaitFor::Tenant(tenant.clone()), tenant_shard_id)
+                    }
+                }
+            }
+            Some(TenantSlot::Secondary(_)) => {
+                return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
+                    tenant_shard_id,
+                )))
+            }
+            Some(TenantSlot::InProgress(barrier)) => {
+                (WaitFor::Barrier(barrier.clone()), tenant_shard_id)
+            }
+            None => {
+                return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
+                    tenant_id,
+                )))
+            }
+        }
+    };
+
+    let tenant = match wait_for {
+        WaitFor::Barrier(barrier) => {
+            tracing::debug!("Waiting for tenant InProgress state to pass...");
+            timeout_cancellable(
+                deadline.duration_since(Instant::now()),
+                cancel,
+                barrier.wait(),
+            )
+            .await
+            .map_err(|e| match e {
+                TimeoutCancellableError::Timeout => GetActiveTenantError::WaitForActiveTimeout {
+                    latest_state: None,
+                    wait_time: wait_start.elapsed(),
+                },
+                TimeoutCancellableError::Cancelled => GetActiveTenantError::Cancelled,
+            })?;
+            {
+                let locked = TENANTS.read().unwrap();
+                let peek_slot =
+                    tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
+                        .map_err(GetTenantError::MapState)?;
+                match peek_slot {
+                    Some(TenantSlot::Attached(tenant)) => tenant.clone(),
+                    _ => {
+                        return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
+                            tenant_shard_id,
+                        )))
+                    }
+                }
+            }
+        }
+        WaitFor::Tenant(tenant) => tenant,
+    };
+
+    tracing::debug!("Waiting for tenant to enter active state...");
+    tenant
+        .wait_to_become_active(deadline.duration_since(Instant::now()))
+        .await?;
+    Ok(tenant)
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum DeleteTimelineError {
    #[error("Tenant {0}")]
@@ -2163,7 +2276,7 @@ pub(crate) async fn load_tenant(
    tenant_id: TenantId,
    generation: Generation,
    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: GenericRemoteStorage,
+    remote_storage: Option<GenericRemoteStorage>,
    deletion_queue_client: DeletionQueueClient,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
@@ -2767,73 +2880,86 @@ use {
    utils::http::error::ApiError,
 };

-#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
-pub(crate) async fn immediate_gc(
+pub(crate) fn immediate_gc(
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    gc_req: TimelineGcRequest,
    cancel: CancellationToken,
    ctx: &RequestContext,
-) -> Result<GcResult, ApiError> {
-    let tenant = {
-        let guard = TENANTS.read().unwrap();
-        guard
-            .get(&tenant_shard_id)
-            .cloned()
-            .with_context(|| format!("tenant {tenant_shard_id}"))
-            .map_err(|e| ApiError::NotFound(e.into()))?
-    };
+) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
+    let guard = TENANTS.read().unwrap();
+
+    let tenant = guard
+        .get(&tenant_shard_id)
+        .cloned()
+        .with_context(|| format!("tenant {tenant_shard_id}"))
+        .map_err(|e| ApiError::NotFound(e.into()))?;

    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
    // Use tenant's pitr setting
    let pitr = tenant.get_pitr_interval();

-    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
    // Run in task_mgr to avoid race with tenant_detach operation
-    let ctx: RequestContext =
-        ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+    let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+    let span = info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);

-    let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
+    // TODO: spawning is redundant now, need to hold the gate
+    task_mgr::spawn(
+        &tokio::runtime::Handle::current(),
+        TaskKind::GarbageCollector,
+        Some(tenant_shard_id),
+        Some(timeline_id),
+        &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
+        false,
+        async move {
+            fail::fail_point!("immediate_gc_task_pre");

-    fail::fail_point!("immediate_gc_task_pre");
+            #[allow(unused_mut)]
+            let mut result = tenant
+                .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
+                .await;
+                // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
+                // better once the types support it.

-    #[allow(unused_mut)]
-    let mut result = tenant
-        .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
-        .await;
-    // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
-    // better once the types support it.
+            #[cfg(feature = "testing")]
+            {
+                // we need to synchronize with drop completion for python tests without polling for
+                // log messages
+                if let Ok(result) = result.as_mut() {
+                    let mut js = tokio::task::JoinSet::new();
+                    for layer in std::mem::take(&mut result.doomed_layers) {
+                        js.spawn(layer.wait_drop());
+                    }
+                    tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped");
+                    while let Some(res) = js.join_next().await {
+                        res.expect("wait_drop should not panic");
+                    }
+                }

-    #[cfg(feature = "testing")]
-    {
-        // we need to synchronize with drop completion for python tests without polling for
-        // log messages
-        if let Ok(result) = result.as_mut() {
-            let mut js = tokio::task::JoinSet::new();
-            for layer in std::mem::take(&mut result.doomed_layers) {
-                js.spawn(layer.wait_drop());
+                let timeline = tenant.get_timeline(timeline_id, false).ok();
+                let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
+
+                if let Some(rtc) = rtc {
+                    // layer drops schedule actions on remote timeline client to actually do the
+                    // deletions; don't care about the shutdown error, just exit fast
+                    drop(rtc.wait_completion().await);
+                }
            }
-            tracing::info!(
-                total = js.len(),
-                "starting to wait for the gc'd layers to be dropped"
-            );
-            while let Some(res) = js.join_next().await {
-                res.expect("wait_drop should not panic");
+
+            match task_done.send(result) {
+                Ok(_) => (),
+                Err(result) => error!("failed to send gc result: {result:?}"),
            }
+            Ok(())
        }
+        .instrument(span)
+    );

-        let timeline = tenant.get_timeline(timeline_id, false).ok();
-        let rtc = timeline.as_ref().map(|x| &x.remote_client);
+    // only drop the guard after we've spawned the task, so that timeline shutdown will wait for the task
+    drop(guard);

-        if let Some(rtc) = rtc {
-            // layer drops schedule actions on remote timeline client to actually do the
-            // deletions; don't care about the shutdown error, just exit fast
-            drop(rtc.wait_completion().await);
-        }
-    }
-
-    result.map_err(ApiError::InternalServerError)
+    Ok(wait_task_done)
 }

 #[cfg(test)]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -189,7 +189,6 @@ use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};

 pub(crate) use download::download_initdb_tar_zst;
-use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
@@ -197,7 +196,6 @@ pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
-use utils::pausable_failpoint;

 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
@@ -319,7 +317,7 @@ pub struct RemoteTimelineClient {

    upload_queue: Mutex<UploadQueue>,

-    pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
+    metrics: Arc<RemoteTimelineClientMetrics>,

    storage_impl: GenericRemoteStorage,

@@ -463,11 +461,11 @@ impl RemoteTimelineClient {
        } else {
            0
        };
-        self.metrics.remote_physical_size_gauge.set(size);
+        self.metrics.remote_physical_size_set(size);
    }

    pub fn get_remote_physical_size(&self) -> u64 {
-        self.metrics.remote_physical_size_gauge.get()
+        self.metrics.remote_physical_size_get()
    }

    //
@@ -520,7 +518,6 @@ impl RemoteTimelineClient {
        &self,
        layer_file_name: &LayerName,
        layer_metadata: &LayerFileMetadata,
-        local_path: &Utf8Path,
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<u64> {
@@ -539,7 +536,6 @@ impl RemoteTimelineClient {
                self.timeline_id,
                layer_file_name,
                layer_metadata,
-                local_path,
                cancel,
                ctx,
            )
@@ -613,17 +609,6 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated.
-    pub(crate) fn schedule_index_upload_for_aux_file_policy_update(
-        self: &Arc<Self>,
-        last_aux_file_policy: Option<AuxFilePolicy>,
-    ) -> anyhow::Result<()> {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-        upload_queue.last_aux_file_policy = last_aux_file_policy;
-        self.schedule_index_upload(upload_queue);
-        Ok(())
-    }
    ///
    /// Launch an index-file upload operation in the background, if necessary.
    ///
@@ -1142,11 +1127,6 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    pub(crate) fn is_deleting(&self) -> bool {
-        let mut locked = self.upload_queue.lock().unwrap();
-        locked.stopped_mut().is_ok()
-    }
-
    pub(crate) async fn preserve_initdb_archive(
        self: &Arc<Self>,
        tenant_id: &TenantId,
@@ -1193,7 +1173,7 @@ impl RemoteTimelineClient {
                    &self.storage_impl,
                    uploaded.local_path(),
                    &remote_path,
-                    uploaded.metadata().file_size,
+                    uploaded.metadata().file_size(),
                    cancel,
                )
                .await
@@ -1574,7 +1554,7 @@ impl RemoteTimelineClient {
                        &self.storage_impl,
                        local_path,
                        &remote_path,
-                        layer_metadata.file_size,
+                        layer_metadata.file_size(),
                        &self.cancel,
                    )
                    .measure_remote_op(
@@ -1769,7 +1749,7 @@ impl RemoteTimelineClient {
            UploadOp::UploadLayer(_, m) => (
                RemoteOpFileKind::Layer,
                RemoteOpKind::Upload,
-                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()),
            ),
            UploadOp::UploadMetadata(_, _) => (
                RemoteOpFileKind::Index,
@@ -1864,7 +1844,6 @@ impl RemoteTimelineClient {
                        dangling_files: HashMap::default(),
                        shutting_down: false,
                        shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
-                        last_aux_file_policy: initialized.last_aux_file_policy,
                    };

                    let upload_queue = std::mem::replace(
@@ -2153,7 +2132,7 @@ mod tests {
            tenant_ctx: _tenant_ctx,
        } = test_setup;

-        let client = &timeline.remote_client;
+        let client = timeline.remote_client.as_ref().unwrap();

        // Download back the index.json, and check that the list of files is correct
        let initial_index_part = match client
@@ -2344,7 +2323,7 @@ mod tests {
            timeline,
            ..
        } = TestSetup::new("metrics").await.unwrap();
-        let client = &timeline.remote_client;
+        let client = timeline.remote_client.as_ref().unwrap();

        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let local_path = local_layer_path(
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -21,6 +21,7 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
+use crate::tenant::storage_layer::layer::local_layer_path;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
@@ -49,13 +50,19 @@ pub async fn download_layer_file<'a>(
    timeline_id: TimelineId,
    layer_file_name: &'a LayerName,
    layer_metadata: &'a LayerFileMetadata,
-    local_path: &Utf8Path,
    cancel: &CancellationToken,
    ctx: &RequestContext,
 ) -> Result<u64, DownloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id);
+    let local_path = local_layer_path(
+        conf,
+        &tenant_shard_id,
+        &timeline_id,
+        layer_file_name,
+        &layer_metadata.generation,
+    );

    let remote_path = remote_layer_path(
        &tenant_shard_id.tenant_id,
@@ -75,7 +82,7 @@ pub async fn download_layer_file<'a>(
    // For more context about durable_rename check this email from postgres mailing list:
    // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
    // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
-    let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
+    let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);

    let bytes_amount = download_retry(
        || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
@@ -84,7 +91,7 @@ pub async fn download_layer_file<'a>(
    )
    .await?;

-    let expected = layer_metadata.file_size;
+    let expected = layer_metadata.file_size();
    if expected != bytes_amount {
        return Err(DownloadError::Other(anyhow!(
            "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -5,7 +5,6 @@
 use std::collections::HashMap;

 use chrono::NaiveDateTime;
-use pageserver_api::models::AuxFilePolicy;
 use serde::{Deserialize, Serialize};
 use utils::id::TimelineId;

@@ -17,6 +16,46 @@ use pageserver_api::shard::ShardIndex;

 use utils::lsn::Lsn;

+/// Metadata gathered for each of the layer files.
+///
+/// Fields have to be `Option`s because remote [`IndexPart`]s can be from different versions, which
+/// might have less or more metadata depending on whether we are upgrading or rolling back an upgrade.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+//#[cfg_attr(test, derive(Default))]
+pub struct LayerFileMetadata {
+    file_size: u64,
+
+    pub(crate) generation: Generation,
+
+    pub(crate) shard: ShardIndex,
+}
+
+impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
+    fn from(other: &IndexLayerMetadata) -> Self {
+        LayerFileMetadata {
+            file_size: other.file_size,
+            generation: other.generation,
+            shard: other.shard,
+        }
+    }
+}
+
+impl LayerFileMetadata {
+    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
+        LayerFileMetadata {
+            file_size,
+            generation,
+            shard,
+        }
+    }
+
+    pub fn file_size(&self) -> u64 {
+        self.file_size
+    }
+}
+
+// TODO seems like another part of the remote storage file format
+// compatibility issue, see https://github.com/neondatabase/neon/issues/3072
 /// In-memory representation of an `index_part.json` file
 ///
 /// Contains the data about all files in the timeline, present remotely and its metadata.
@@ -37,7 +76,7 @@ pub struct IndexPart {
    ///
    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
    /// that latest version stores.
-    pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,
+    pub layer_metadata: HashMap<LayerName, IndexLayerMetadata>,

    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
    // It's duplicated for convenience when reading the serialized structure, but is
@@ -49,16 +88,6 @@ pub struct IndexPart {

    #[serde(default)]
    pub(crate) lineage: Lineage,
-
-    /// Describes the kind of aux files stored in the timeline.
-    ///
-    /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable.
-    /// A V1 setting after V2 files have been committed is not accepted.
-    ///
-    /// None means no aux files have been written to the storage before the point
-    /// when this flag is introduced.
-    #[serde(skip_serializing_if = "Option::is_none", default)]
-    pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
 }

 impl IndexPart {
@@ -72,11 +101,10 @@ impl IndexPart {
    ///      is always generated from the keys of `layer_metadata`)
    /// - 4: timeline_layers is fully removed.
    /// - 5: lineage was added
-    /// - 6: last_aux_file_policy is added.
-    const LATEST_VERSION: usize = 6;
+    const LATEST_VERSION: usize = 5;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5];

    pub const FILE_NAME: &'static str = "index_part.json";

@@ -85,9 +113,11 @@ impl IndexPart {
        disk_consistent_lsn: Lsn,
        metadata: TimelineMetadata,
        lineage: Lineage,
-        last_aux_file_policy: Option<AuxFilePolicy>,
    ) -> Self {
-        let layer_metadata = layers_and_metadata.clone();
+        let layer_metadata = layers_and_metadata
+            .iter()
+            .map(|(k, v)| (k.to_owned(), IndexLayerMetadata::from(v)))
+            .collect();

        Self {
            version: Self::LATEST_VERSION,
@@ -96,7 +126,6 @@ impl IndexPart {
            metadata,
            deleted_at: None,
            lineage,
-            last_aux_file_policy,
        }
    }

@@ -126,13 +155,8 @@ impl IndexPart {
            example_metadata.disk_consistent_lsn(),
            example_metadata,
            Default::default(),
-            Some(AuxFilePolicy::V1),
        )
    }
-
-    pub(crate) fn last_aux_file_policy(&self) -> Option<AuxFilePolicy> {
-        self.last_aux_file_policy
-    }
 }

 impl From<&UploadQueueInitialized> for IndexPart {
@@ -141,22 +165,13 @@ impl From<&UploadQueueInitialized> for IndexPart {
        let metadata = uq.latest_metadata.clone();
        let lineage = uq.latest_lineage.clone();

-        Self::new(
-            &uq.latest_files,
-            disk_consistent_lsn,
-            metadata,
-            lineage,
-            uq.last_aux_file_policy,
-        )
+        Self::new(&uq.latest_files, disk_consistent_lsn, metadata, lineage)
    }
 }

-/// Metadata gathered for each of the layer files.
-///
-/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
-/// might have less or more metadata depending if upgrading or rolling back an upgrade.
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
-pub struct LayerFileMetadata {
+/// Serialized form of [`LayerFileMetadata`].
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
+pub struct IndexLayerMetadata {
    pub file_size: u64,

    #[serde(default = "Generation::none")]
@@ -168,12 +183,12 @@ pub struct LayerFileMetadata {
    pub shard: ShardIndex,
 }

-impl LayerFileMetadata {
-    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
-        LayerFileMetadata {
-            file_size,
-            generation,
-            shard,
+impl From<&LayerFileMetadata> for IndexLayerMetadata {
+    fn from(other: &LayerFileMetadata) -> Self {
+        IndexLayerMetadata {
+            file_size: other.file_size,
+            generation: other.generation,
+            shard: other.shard,
        }
    }
 }
@@ -267,12 +282,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -284,7 +299,6 @@ mod tests {
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
            lineage: Lineage::default(),
-            last_aux_file_policy: None,
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -309,12 +323,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -326,7 +340,6 @@ mod tests {
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: None,
            lineage: Lineage::default(),
-            last_aux_file_policy: None,
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -352,12 +365,12 @@ mod tests {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 2,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -370,7 +383,6 @@ mod tests {
            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
            lineage: Lineage::default(),
-            last_aux_file_policy: None,
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -416,7 +428,6 @@ mod tests {
            .unwrap(),
            deleted_at: None,
            lineage: Lineage::default(),
-            last_aux_file_policy: None,
        };

        let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
@@ -440,12 +451,12 @@ mod tests {
        let expected = IndexPart {
            version: 4,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
                    generation: Generation::none(),
                    shard: ShardIndex::unsharded()
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
@@ -457,7 +468,6 @@ mod tests {
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            lineage: Lineage::default(),
-            last_aux_file_policy: None,
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
@@ -482,12 +492,12 @@ mod tests {
        let expected = IndexPart {
            version: 5,
            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), IndexLayerMetadata {
                    file_size: 23289856,
                    generation: Generation::new(1),
                    shard: ShardIndex::unsharded(),
                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata {
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), IndexLayerMetadata {
                    file_size: 1015808,
                    generation: Generation::new(1),
                    shard: ShardIndex::unsharded(),
@@ -501,57 +511,6 @@ mod tests {
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
            },
-            last_aux_file_policy: None,
-        };
-
-        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
-        assert_eq!(part, expected);
-    }
-
-    #[test]
-    fn v6_indexpart_is_parsed() {
-        let example = r#"{
-            "version":6,
-            "layer_metadata":{
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
-            },
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
-            "deleted_at": "2023-07-31T09:00:00.123",
-            "lineage":{
-                "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"],
-                "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"]
-            },
-            "last_aux_file_policy": "V2"
-        }"#;
-
-        let expected = IndexPart {
-            version: 6,
-            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
-                    file_size: 25600000,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
-                    // serde_json should always parse this but this might be a double with jq for
-                    // example.
-                    file_size: 9007199254741001,
-                    generation: Generation::none(),
-                    shard: ShardIndex::unsharded()
-                })
-            ]),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
-            lineage: Lineage {
-                reparenting_history_truncated: false,
-                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
-                original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))),
-            },
-            last_aux_file_policy: Some(AuxFilePolicy::V2),
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -9,7 +9,7 @@ use std::time::SystemTime;
 use tokio::fs::{self, File};
 use tokio::io::AsyncSeekExt;
 use tokio_util::sync::CancellationToken;
-use utils::{backoff, pausable_failpoint};
+use utils::backoff;

 use super::Generation;
 use crate::tenant::remote_timeline_client::{
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -6,9 +6,11 @@ mod scheduler;
 use std::{sync::Arc, time::SystemTime};

 use crate::{
+    config::PageServerConf,
    context::RequestContext,
    disk_usage_eviction_task::DiskUsageEvictionInfo,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    virtual_file::MaybeFatalIo,
 };

 use self::{
@@ -19,8 +21,9 @@ use self::{
 use super::{
    config::{SecondaryLocationConfig, TenantConfOpt},
    mgr::TenantManager,
+    remote_timeline_client::LayerFileMetadata,
    span::debug_assert_current_span_has_tenant_id,
-    storage_layer::LayerName,
+    storage_layer::{layer::local_layer_path, LayerName},
 };

 use pageserver_api::{
@@ -175,7 +178,13 @@ impl SecondaryTenant {

    /// Cancellation safe, but on cancellation the eviction will go through
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
-    pub(crate) async fn evict_layer(self: &Arc<Self>, timeline_id: TimelineId, name: LayerName) {
+    pub(crate) async fn evict_layer(
+        self: &Arc<Self>,
+        conf: &PageServerConf,
+        timeline_id: TimelineId,
+        name: LayerName,
+        metadata: LayerFileMetadata,
+    ) {
        debug_assert_current_span_has_tenant_id();

        let guard = match self.gate.enter() {
@@ -188,11 +197,41 @@ impl SecondaryTenant {

        let now = SystemTime::now();

+        let local_path = local_layer_path(
+            conf,
+            &self.tenant_shard_id,
+            &timeline_id,
+            &name,
+            &metadata.generation,
+        );
+
        let this = self.clone();

        // spawn it to be cancellation safe
        tokio::task::spawn_blocking(move || {
            let _guard = guard;
+            // We tolerate ENOENT, because between planning eviction and executing
+            // it, the secondary downloader could have seen an updated heatmap that
+            // resulted in a layer being deleted.
+            // Other local I/O errors are process-fatal: these should never happen.
+            let deleted = std::fs::remove_file(local_path);
+
+            let not_found = deleted
+                .as_ref()
+                .is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
+
+            let deleted = if not_found {
+                false
+            } else {
+                deleted
+                    .map(|()| true)
+                    .fatal_err("Deleting layer during eviction")
+            };
+
+            if !deleted {
+                // Nothing was deleted: skip updating accounting and recording a possibly later eviction timestamp.
+                return;
+            }

            // Update the timeline's state.  This does not have to be synchronized with
            // the download process, because:
@@ -211,15 +250,8 @@ impl SecondaryTenant {
            // of the cache.
            let mut detail = this.detail.lock().unwrap();
            if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
-                let removed = timeline_detail.on_disk_layers.remove(&name);
-
-                // We might race with removal of the same layer during downloads, if it was removed
-                // from the heatmap.  If we see that the OnDiskState is gone, then no need to
-                // do a physical deletion or store in evicted_at.
-                if let Some(removed) = removed {
-                    removed.remove_blocking();
-                    timeline_detail.evicted_at.insert(name, now);
-                }
+                timeline_detail.on_disk_layers.remove(&name);
+                timeline_detail.evicted_at.insert(name, now);
            }
        })
        .await
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -26,7 +26,7 @@ use crate::{
        tasks::{warn_when_period_overrun, BackgroundLoopKind},
    },
    virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
-    TEMP_FILE_SUFFIX,
+    METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
 };

 use super::{
@@ -62,10 +62,14 @@ use super::{
    CommandRequest, DownloadCommand,
 };

-/// For each tenant, default period for how long must have passed since the last download_tenant call before
-/// calling it again.  This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first
-/// download, if the uploader populated it.
-const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);
+/// For each tenant, how long must have passed since the last download_tenant call before
+/// calling it again.  This is approximately the time by which local data is allowed
+/// to fall behind remote data.
+///
+/// TODO: this should just be a default, and the actual period should be controlled
+/// via the heatmap itself
+/// `<https://github.com/neondatabase/neon/issues/6200>`
+const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);

 pub(super) async fn downloader_task(
    tenant_manager: Arc<TenantManager>,
@@ -86,7 +90,7 @@ pub(super) async fn downloader_task(

    scheduler
        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("secondary_download_scheduler"))
+        .instrument(info_span!("secondary_downloads"))
        .await
 }

@@ -100,7 +104,6 @@ struct SecondaryDownloader {
 pub(super) struct OnDiskState {
    metadata: LayerFileMetadata,
    access_time: SystemTime,
-    local_path: Utf8PathBuf,
 }

 impl OnDiskState {
@@ -111,26 +114,12 @@ impl OnDiskState {
        _ame: LayerName,
        metadata: LayerFileMetadata,
        access_time: SystemTime,
-        local_path: Utf8PathBuf,
    ) -> Self {
        Self {
            metadata,
            access_time,
-            local_path,
        }
    }
-
-    // This is infallible, because all errors are either acceptable (ENOENT), or totally
-    // unexpected (fatal).
-    pub(super) fn remove_blocking(&self) {
-        // We tolerate ENOENT, because between planning eviction and executing
-        // it, the secondary downloader could have seen an updated heatmap that
-        // resulted in a layer being deleted.
-        // Other local I/O errors are process-fatal: these should never happen.
-        std::fs::remove_file(&self.local_path)
-            .or_else(fs_ext::ignore_not_found)
-            .fatal_err("Deleting secondary layer")
-    }
 }

 #[derive(Debug, Clone, Default)]
@@ -141,22 +130,14 @@ pub(super) struct SecondaryDetailTimeline {
    pub(super) evicted_at: HashMap<LayerName, SystemTime>,
 }

-// Aspects of a heatmap that we remember after downloading it
-#[derive(Clone, Debug)]
-struct DownloadSummary {
-    etag: Etag,
-    #[allow(unused)]
-    mtime: SystemTime,
-    upload_period: Duration,
-}
-
 /// This state is written by the secondary downloader, it is opaque
 /// to TenantManager
 #[derive(Debug)]
 pub(super) struct SecondaryDetail {
    pub(super) config: SecondaryLocationConfig,

-    last_download: Option<DownloadSummary>,
+    last_download: Option<Instant>,
+    last_etag: Option<Etag>,
    next_download: Option<Instant>,
    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
 }
@@ -186,6 +167,7 @@ impl SecondaryDetail {
        Self {
            config,
            last_download: None,
+            last_etag: None,
            next_download: None,
            timelines: HashMap::new(),
        }
@@ -239,8 +221,9 @@ impl SecondaryDetail {

 struct PendingDownload {
    secondary_state: Arc<SecondaryTenant>,
-    last_download: Option<DownloadSummary>,
+    last_download: Option<Instant>,
    target_time: Option<Instant>,
+    period: Option<Duration>,
 }

 impl scheduler::PendingJob for PendingDownload {
@@ -290,17 +273,10 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo

        tracing::debug!("Secondary tenant download completed");

+        // Update next_download even if there was an error: we don't want errored tenants to implicitly
+        // take priority to run again.
        let mut detail = secondary_state.detail.lock().unwrap();
-
-        let period = detail
-            .last_download
-            .as_ref()
-            .map(|d| d.upload_period)
-            .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
-
-        // We advance next_download irrespective of errors: we don't want error cases to result in
-        // expensive busy-polling.
-        detail.next_download = Some(Instant::now() + period_jitter(period, 5));
+        detail.next_download = Some(Instant::now() + period_jitter(DOWNLOAD_FRESHEN_INTERVAL, 5));
    }

    async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
@@ -333,11 +309,11 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                    if detail.next_download.is_none() {
                        // Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times.  Subsequent
                        // rounds will use a smaller jitter to avoid accidentally synchronizing later.
-                        detail.next_download = Some(now.checked_add(period_warmup(DEFAULT_DOWNLOAD_INTERVAL)).expect(
+                        detail.next_download = Some(now.checked_add(period_warmup(DOWNLOAD_FRESHEN_INTERVAL)).expect(
                        "Using our constant, which is known to be small compared with clock range",
                    ));
                    }
-                    (detail.last_download.clone(), detail.next_download.unwrap())
+                    (detail.last_download, detail.next_download.unwrap())
                };

                if now > next_download {
@@ -345,6 +321,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                        secondary_state: secondary_tenant,
                        last_download,
                        target_time: Some(next_download),
+                        period: Some(DOWNLOAD_FRESHEN_INTERVAL),
                    })
                } else {
                    None
@@ -370,6 +347,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo

        Ok(PendingDownload {
            target_time: None,
+            period: None,
            last_download: None,
            secondary_state: tenant,
        })
@@ -386,6 +364,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
            secondary_state,
            last_download,
            target_time,
+            period,
        } = job;

        let (completion, barrier) = utils::completion::channel();
@@ -407,7 +386,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                    tracing::warn!("Insufficient space while downloading.  Will retry later.");
                }
                Err(UpdateError::Cancelled) => {
-                    tracing::info!("Shut down while downloading");
+                    tracing::debug!("Shut down while downloading");
                },
                Err(UpdateError::Deserialize(e)) => {
                    tracing::error!("Corrupt content while downloading tenant: {e}");
@@ -422,15 +401,20 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo

            // If the job had a target execution time, we may check our final execution
            // time against that for observability purposes.
-            if let (Some(target_time), Some(last_download)) = (target_time, last_download) {
-                // Elapsed time includes any scheduling lag as well as the execution of the job
-                let elapsed = Instant::now().duration_since(target_time);
+            if let (Some(target_time), Some(period)) = (target_time, period) {
+                // Only track execution lag if this isn't our first download: otherwise, it is expected
+                // that execution will have taken longer than our configured interval, for example
+                // when starting up a pageserver and downloading a tenant's layers for the first time.
+                if last_download.is_some() {
+                    // Elapsed time includes any scheduling lag as well as the execution of the job
+                    let elapsed = Instant::now().duration_since(target_time);

-                warn_when_period_overrun(
-                    elapsed,
-                    last_download.upload_period,
-                    BackgroundLoopKind::SecondaryDownload,
-                );
+                    warn_when_period_overrun(
+                        elapsed,
+                        period,
+                        BackgroundLoopKind::SecondaryDownload,
+                    );
+                }
            }

            CompleteDownload {
@@ -519,12 +503,12 @@ impl<'a> TenantDownloader<'a> {
        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();

        // We will use the etag from last successful download to make the download conditional on changes
-        let last_download = self
+        let last_etag = self
            .secondary_state
            .detail
            .lock()
            .unwrap()
-            .last_download
+            .last_etag
            .clone();

        // Download the tenant's heatmap
@@ -533,7 +517,7 @@ impl<'a> TenantDownloader<'a> {
            etag: heatmap_etag,
            bytes: heatmap_bytes,
        } = match tokio::select!(
-            bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?},
+            bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
            _ = self.secondary_state.cancel.cancelled() => return Ok(())
        ) {
            HeatMapDownload::Unmodified => {
@@ -562,39 +546,6 @@ impl<'a> TenantDownloader<'a> {
            heatmap.timelines.len()
        );

-        // Get or initialize the local disk state for the timelines we will update
-        let mut timeline_states = HashMap::new();
-        for timeline in &heatmap.timelines {
-            let timeline_state = self
-                .secondary_state
-                .detail
-                .lock()
-                .unwrap()
-                .timelines
-                .get(&timeline.timeline_id)
-                .cloned();
-
-            let timeline_state = match timeline_state {
-                Some(t) => t,
-                None => {
-                    // We have no existing state: need to scan local disk for layers first.
-                    let timeline_state =
-                        init_timeline_state(self.conf, tenant_shard_id, timeline).await;
-
-                    // Re-acquire detail lock now that we're done with async load from local FS
-                    self.secondary_state
-                        .detail
-                        .lock()
-                        .unwrap()
-                        .timelines
-                        .insert(timeline.timeline_id, timeline_state.clone());
-                    timeline_state
-                }
-            };
-
-            timeline_states.insert(timeline.timeline_id, timeline_state);
-        }
-
        // Clean up any local layers that aren't in the heatmap.  We do this first for all timelines, on the general
        // principle that deletions should be done before writes wherever possible, and so that we can use this
        // phase to initialize our SecondaryProgress.
@@ -605,10 +556,6 @@ impl<'a> TenantDownloader<'a> {

        // Download the layers in the heatmap
        for timeline in heatmap.timelines {
-            let timeline_state = timeline_states
-                .remove(&timeline.timeline_id)
-                .expect("Just populated above");
-
            if self.secondary_state.cancel.is_cancelled() {
                tracing::debug!(
                    "Cancelled before downloading timeline {}",
@@ -618,7 +565,7 @@ impl<'a> TenantDownloader<'a> {
            }

            let timeline_id = timeline.timeline_id;
-            self.download_timeline(timeline, timeline_state, ctx)
+            self.download_timeline(timeline, ctx)
                .instrument(tracing::info_span!(
                    "secondary_download_timeline",
                    tenant_id=%tenant_shard_id.tenant_id,
@@ -630,30 +577,7 @@ impl<'a> TenantDownloader<'a> {

        // Only update last_etag after a full successful download: this way will not skip
        // the next download, even if the heatmap's actual etag is unchanged.
-        self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
-            etag: heatmap_etag,
-            mtime: heatmap_mtime,
-            upload_period: heatmap
-                .upload_period_ms
-                .map(|ms| Duration::from_millis(ms as u64))
-                .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
-        });
-
-        // Robustness: we should have updated progress properly, but in case we didn't, make sure
-        // we don't leave the tenant in a state where we claim to have successfully downloaded
-        // everything, but our progress is incomplete.  The invariant here should be that if
-        // we have set `last_download` to this heatmap's etag, then the next time we see that
-        // etag we can safely do no work (i.e. we must be complete).
-        let mut progress = self.secondary_state.progress.lock().unwrap();
-        debug_assert!(progress.layers_downloaded == progress.layers_total);
-        debug_assert!(progress.bytes_downloaded == progress.bytes_total);
-        if progress.layers_downloaded != progress.layers_total
-            || progress.bytes_downloaded != progress.bytes_total
-        {
-            tracing::warn!("Correcting drift in progress stats ({progress:?})");
-            progress.layers_downloaded = progress.layers_total;
-            progress.bytes_downloaded = progress.bytes_total;
-        }
+        self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);

        Ok(())
    }
@@ -709,7 +633,7 @@ impl<'a> TenantDownloader<'a> {
                let mut layer_byte_count: u64 = timeline_state
                    .on_disk_layers
                    .values()
-                    .map(|l| l.metadata.file_size)
+                    .map(|l| l.metadata.file_size())
                    .sum();

                // Remove on-disk layers that are no longer present in heatmap
@@ -720,7 +644,7 @@ impl<'a> TenantDownloader<'a> {
                        .get(layer_file_name)
                        .unwrap()
                        .metadata
-                        .file_size;
+                        .file_size();

                    let local_path = local_layer_path(
                        self.conf,
@@ -830,7 +754,6 @@ impl<'a> TenantDownloader<'a> {
    async fn download_timeline(
        &self,
        timeline: HeatMapTimeline,
-        timeline_state: SecondaryDetailTimeline,
        ctx: &RequestContext,
    ) -> Result<(), UpdateError> {
        debug_assert_current_span_has_tenant_and_timeline_id();
@@ -839,6 +762,34 @@ impl<'a> TenantDownloader<'a> {
        // Accumulate updates to the state
        let mut touched = Vec::new();

+        // Clone a view of what layers already exist on disk
+        let timeline_state = self
+            .secondary_state
+            .detail
+            .lock()
+            .unwrap()
+            .timelines
+            .get(&timeline.timeline_id)
+            .cloned();
+
+        let timeline_state = match timeline_state {
+            Some(t) => t,
+            None => {
+                // We have no existing state: need to scan local disk for layers first.
+                let timeline_state =
+                    init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
+
+                // Re-acquire detail lock now that we're done with async load from local FS
+                self.secondary_state
+                    .detail
+                    .lock()
+                    .unwrap()
+                    .timelines
+                    .insert(timeline.timeline_id, timeline_state.clone());
+                timeline_state
+            }
+        };
+
        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());

        // Download heatmap layers that are not present on local disk, or update their
@@ -856,12 +807,20 @@ impl<'a> TenantDownloader<'a> {
                if cfg!(debug_assertions) {
                    // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think
                    // are already present on disk are really there.
-                    match tokio::fs::metadata(&on_disk.local_path).await {
+                    let local_path = local_layer_path(
+                        self.conf,
+                        tenant_shard_id,
+                        &timeline.timeline_id,
+                        &layer.name,
+                        &layer.metadata.generation,
+                    );
+
+                    match tokio::fs::metadata(&local_path).await {
                        Ok(meta) => {
                            tracing::debug!(
                                "Layer {} present at {}, size {}",
                                layer.name,
-                                on_disk.local_path,
+                                local_path,
                                meta.len(),
                            );
                        }
@@ -869,7 +828,7 @@ impl<'a> TenantDownloader<'a> {
                            tracing::warn!(
                                "Layer {} not found at {} ({})",
                                layer.name,
-                                on_disk.local_path,
+                                local_path,
                                e
                            );
                            debug_assert!(false);
@@ -877,7 +836,9 @@ impl<'a> TenantDownloader<'a> {
                    }
                }

-                if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time {
+                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
+                    || on_disk.access_time != layer.access_time
+                {
                    // We already have this layer on disk.  Update its access time.
                    tracing::debug!(
                        "Access time updated for layer {}: {} -> {}",
@@ -913,16 +874,67 @@ impl<'a> TenantDownloader<'a> {
                }
            }

-            match self
-                .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
-                .await?
+            // Failpoint for simulating slow remote storage
+            failpoint_support::sleep_millis_async!(
+                "secondary-layer-download-sleep",
+                &self.secondary_state.cancel
+            );
+
+            // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
+            let downloaded_bytes = match download_layer_file(
+                self.conf,
+                self.remote_storage,
+                *tenant_shard_id,
+                timeline.timeline_id,
+                &layer.name,
+                &LayerFileMetadata::from(&layer.metadata),
+                &self.secondary_state.cancel,
+                ctx,
+            )
+            .await
            {
-                Some(layer) => touched.push(layer),
-                None => {
-                    // Not an error but we didn't download it: remote layer is missing.  Don't add it to the list of
-                    // things to consider touched.
+                Ok(bytes) => bytes,
+                Err(DownloadError::NotFound) => {
+                    // A heatmap might be out of date and refer to a layer that doesn't exist any more.
+                    // This is harmless: continue to download the next layer. It is expected during
+                    // compaction and GC.
+                    tracing::debug!(
+                        "Skipped downloading missing layer {}, raced with compaction/gc?",
+                        layer.name
+                    );
+                    continue;
                }
+                Err(e) => return Err(e.into()),
+            };
+
+            if downloaded_bytes != layer.metadata.file_size {
+                let local_path = local_layer_path(
+                    self.conf,
+                    tenant_shard_id,
+                    &timeline.timeline_id,
+                    &layer.name,
+                    &layer.metadata.generation,
+                );
+
+                tracing::warn!(
+                    "Downloaded layer {} with unexpected size {} != {}.  Removing download.",
+                    layer.name,
+                    downloaded_bytes,
+                    layer.metadata.file_size
+                );
+
+                tokio::fs::remove_file(&local_path)
+                    .await
+                    .or_else(fs_ext::ignore_not_found)?;
+            } else {
+                tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
+                let mut progress = self.secondary_state.progress.lock().unwrap();
+                progress.bytes_downloaded += downloaded_bytes;
+                progress.layers_downloaded += 1;
+                // Only record layers whose file was kept: a removed download is not on disk.
+                touched.push(layer);
             }
+            SECONDARY_MODE.download_layer.inc();
        }

        // Write updates to state to record layers we just downloaded or touched.
@@ -939,21 +951,13 @@ impl<'a> TenantDownloader<'a> {
                        v.get_mut().access_time = t.access_time;
                    }
                    Entry::Vacant(e) => {
-                        let local_path = local_layer_path(
-                            self.conf,
-                            tenant_shard_id,
-                            &timeline.timeline_id,
-                            &t.name,
-                            &t.metadata.generation,
-                        );
                        e.insert(OnDiskState::new(
                            self.conf,
                            tenant_shard_id,
                            &timeline.timeline_id,
                            t.name,
-                            t.metadata.clone(),
+                            LayerFileMetadata::from(&t.metadata),
                            t.access_time,
-                            local_path,
                        ));
                    }
                }
@@ -962,99 +966,6 @@ impl<'a> TenantDownloader<'a> {

        Ok(())
    }
-
-    async fn download_layer(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-        layer: HeatMapLayer,
-        ctx: &RequestContext,
-    ) -> Result<Option<HeatMapLayer>, UpdateError> {
-        // Failpoint for simulating slow remote storage
-        failpoint_support::sleep_millis_async!(
-            "secondary-layer-download-sleep",
-            &self.secondary_state.cancel
-        );
-
-        let local_path = local_layer_path(
-            self.conf,
-            tenant_shard_id,
-            timeline_id,
-            &layer.name,
-            &layer.metadata.generation,
-        );
-
-        // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
-        tracing::info!(
-            "Starting download of layer {}, size {}",
-            layer.name,
-            layer.metadata.file_size
-        );
-        let downloaded_bytes = match download_layer_file(
-            self.conf,
-            self.remote_storage,
-            *tenant_shard_id,
-            *timeline_id,
-            &layer.name,
-            &layer.metadata,
-            &local_path,
-            &self.secondary_state.cancel,
-            ctx,
-        )
-        .await
-        {
-            Ok(bytes) => bytes,
-            Err(DownloadError::NotFound) => {
-                // A heatmap might be out of date and refer to a layer that doesn't exist any more.
-                // This is harmless: continue to download the next layer. It is expected during compaction
-                // GC.
-                tracing::debug!(
-                    "Skipped downloading missing layer {}, raced with compaction/gc?",
-                    layer.name
-                );
-
-                // If the layer is 404, adjust the progress statistics to reflect that we will not download it.
-                let mut progress = self.secondary_state.progress.lock().unwrap();
-                progress.layers_total = progress.layers_total.saturating_sub(1);
-                progress.bytes_total = progress
-                    .bytes_total
-                    .saturating_sub(layer.metadata.file_size);
-
-                return Ok(None);
-            }
-            Err(e) => return Err(e.into()),
-        };
-
-        if downloaded_bytes != layer.metadata.file_size {
-            let local_path = local_layer_path(
-                self.conf,
-                tenant_shard_id,
-                timeline_id,
-                &layer.name,
-                &layer.metadata.generation,
-            );
-
-            tracing::warn!(
-                "Downloaded layer {} with unexpected size {} != {}.  Removing download.",
-                layer.name,
-                downloaded_bytes,
-                layer.metadata.file_size
-            );
-
-            tokio::fs::remove_file(&local_path)
-                .await
-                .or_else(fs_ext::ignore_not_found)?;
-        } else {
-            tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
-            let mut progress = self.secondary_state.progress.lock().unwrap();
-            progress.bytes_downloaded += downloaded_bytes;
-            progress.layers_downloaded += 1;
-        }
-
-        SECONDARY_MODE.download_layer.inc();
-
-        Ok(Some(layer))
-    }
 }

 /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
@@ -1104,7 +1015,11 @@ async fn init_timeline_state(
            .fatal_err(&format!("Read metadata on {}", file_path));

        let file_name = file_path.file_name().expect("created it from the dentry");
-        if crate::is_temporary(&file_path)
+        if file_name == METADATA_FILE_NAME {
+            // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+            warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
+            continue;
+        } else if crate::is_temporary(&file_path)
            || is_temp_download_file(&file_path)
            || is_ephemeral_file(file_name)
        {
@@ -1144,9 +1059,8 @@ async fn init_timeline_state(
                                    tenant_shard_id,
                                    &heatmap.timeline_id,
                                    name,
-                                    remote_meta.metadata.clone(),
+                                    LayerFileMetadata::from(&remote_meta.metadata),
                                    remote_meta.access_time,
-                                    file_path,
                                ),
                            );
                        }
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,6 +1,6 @@
 use std::time::SystemTime;

-use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};
+use crate::tenant::{remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerName};

 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
@@ -15,14 +15,6 @@ pub(super) struct HeatMapTenant {
    pub(super) generation: Generation,

    pub(super) timelines: Vec<HeatMapTimeline>,
-
-    /// Uploaders provide their own upload period in the heatmap, as a hint to downloaders
-    /// of how frequently it is worthwhile to check for updates.
-    ///
-    /// This is optional for backward compat, and because we sometimes might upload
-    /// a heatmap explicitly via API for a tenant that has no periodic upload configured.
-    #[serde(default)]
-    pub(super) upload_period_ms: Option<u128>,
 }

 #[serde_as]
@@ -38,7 +30,7 @@ pub(crate) struct HeatMapTimeline {
 #[derive(Serialize, Deserialize)]
 pub(crate) struct HeatMapLayer {
    pub(super) name: LayerName,
-    pub(super) metadata: LayerFileMetadata,
+    pub(super) metadata: IndexLayerMetadata,

    #[serde_as(as = "TimestampSeconds<i64>")]
    pub(super) access_time: SystemTime,
@@ -49,7 +41,7 @@ pub(crate) struct HeatMapLayer {
 impl HeatMapLayer {
    pub(crate) fn new(
        name: LayerName,
-        metadata: LayerFileMetadata,
+        metadata: IndexLayerMetadata,
        access_time: SystemTime,
    ) -> Self {
        Self {
@@ -89,21 +81,4 @@ impl HeatMapTenant {

        stats
    }
-
-    pub(crate) fn strip_atimes(self) -> Self {
-        Self {
-            timelines: self
-                .timelines
-                .into_iter()
-                .map(|mut tl| {
-                    for layer in &mut tl.layers {
-                        layer.access_time = SystemTime::UNIX_EPOCH;
-                    }
-                    tl
-                })
-                .collect(),
-            generation: self.generation,
-            upload_period_ms: self.upload_period_ms,
-        }
-    }
 }
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -53,7 +53,7 @@ pub(super) async fn heatmap_uploader_task(

    scheduler
        .run(command_queue, background_jobs_can_start, cancel)
-        .instrument(info_span!("heatmap_upload_scheduler"))
+        .instrument(info_span!("heatmap_uploader"))
        .await
 }

@@ -80,7 +80,7 @@ impl RunningJob for WriteInProgress {

 struct UploadPending {
    tenant: Arc<Tenant>,
-    last_upload: Option<LastUploadState>,
+    last_digest: Option<md5::Digest>,
    target_time: Option<Instant>,
    period: Option<Duration>,
 }
@@ -94,7 +94,7 @@ impl scheduler::PendingJob for UploadPending {
 struct WriteComplete {
    tenant_shard_id: TenantShardId,
    completed_at: Instant,
-    uploaded: Option<LastUploadState>,
+    digest: Option<md5::Digest>,
    next_upload: Option<Instant>,
 }

@@ -115,7 +115,10 @@ struct UploaderTenantState {
    tenant: Weak<Tenant>,

    /// Digest of the serialized heatmap that we last successfully uploaded
-    last_upload_state: Option<LastUploadState>,
+    ///
+    /// md5 is generally a bad hash.  We use it because it's convenient for interop with AWS S3's ETag,
+    /// which is also an md5sum.
+    last_digest: Option<md5::Digest>,

    /// When the last upload attempt completed (may have been successful or failed)
    last_upload: Option<Instant>,
@@ -184,7 +187,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                    tenant: Arc::downgrade(&tenant),
                    last_upload: None,
                    next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)),
-                    last_upload_state: None,
+                    last_digest: None,
                });

            // Decline to do the upload if insufficient time has passed
@@ -192,10 +195,10 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                return;
            }

-            let last_upload = state.last_upload_state.clone();
+            let last_digest = state.last_digest;
            result.jobs.push(UploadPending {
                tenant,
-                last_upload,
+                last_digest,
                target_time: state.next_upload,
                period: Some(period),
            });
@@ -215,7 +218,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
    ) {
        let UploadPending {
            tenant,
-            last_upload,
+            last_digest,
            target_time,
            period,
        } = job;
@@ -228,16 +231,16 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            let _completion = completion;

            let started_at = Instant::now();
-            let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await {
-                Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => {
+            let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await {
+                Ok(UploadHeatmapOutcome::Uploaded(digest)) => {
                    let duration = Instant::now().duration_since(started_at);
                    SECONDARY_MODE
                        .upload_heatmap_duration
                        .observe(duration.as_secs_f64());
                    SECONDARY_MODE.upload_heatmap.inc();
-                    Some(uploaded)
+                    Some(digest)
                }
-                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload,
+                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
                Err(UploadHeatmapError::Upload(e)) => {
                    tracing::warn!(
                        "Failed to upload heatmap for tenant {}: {e:#}",
@@ -248,11 +251,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                        .upload_heatmap_duration
                        .observe(duration.as_secs_f64());
                    SECONDARY_MODE.upload_heatmap_errors.inc();
-                    last_upload
+                    last_digest
                }
                Err(UploadHeatmapError::Cancelled) => {
                    tracing::info!("Cancelled heatmap upload, shutting down");
-                    last_upload
+                    last_digest
                }
            };

@@ -274,7 +277,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            WriteComplete {
                    tenant_shard_id: *tenant.get_tenant_shard_id(),
                    completed_at: now,
-                    uploaded,
+                    digest,
                    next_upload,
                }
        }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
@@ -296,7 +299,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>

        Ok(UploadPending {
            // Ignore our state for last digest: this forces an upload even if nothing has changed
-            last_upload: None,
+            last_digest: None,
            tenant,
            target_time: None,
            period: None,
@@ -309,7 +312,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
        let WriteComplete {
            tenant_shard_id,
            completed_at,
-            uploaded,
+            digest,
            next_upload,
        } = completion;
        use std::collections::hash_map::Entry;
@@ -319,7 +322,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            }
            Entry::Occupied(mut entry) => {
                entry.get_mut().last_upload = Some(completed_at);
-                entry.get_mut().last_upload_state = uploaded;
+                entry.get_mut().last_digest = digest;
                entry.get_mut().next_upload = next_upload
            }
        }
@@ -328,7 +331,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>

 enum UploadHeatmapOutcome {
    /// We successfully wrote to remote storage, with this digest.
-    Uploaded(LastUploadState),
+    Uploaded(md5::Digest),
    /// We did not upload because the heatmap digest was unchanged since the last upload
    NoChange,
    /// We skipped the upload for some reason, such as tenant/timeline not ready
@@ -344,25 +347,12 @@ enum UploadHeatmapError {
    Upload(#[from] anyhow::Error),
 }

-/// Digests describing the heatmap we most recently uploaded successfully.
-///
-/// md5 is generally a bad hash.  We use it because it's convenient for interop with AWS S3's ETag,
-/// which is also an md5sum.
-#[derive(Clone)]
-struct LastUploadState {
-    // Digest of json-encoded HeatMapTenant
-    uploaded_digest: md5::Digest,
-
-    // Digest without atimes set.
-    layers_only_digest: md5::Digest,
-}
-
 /// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
 /// of the object we would have uploaded.
 async fn upload_tenant_heatmap(
    remote_storage: GenericRemoteStorage,
    tenant: &Arc<Tenant>,
-    last_upload: Option<LastUploadState>,
+    last_digest: Option<md5::Digest>,
 ) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
    debug_assert_current_span_has_tenant_id();

@@ -378,7 +368,6 @@ async fn upload_tenant_heatmap(
    let mut heatmap = HeatMapTenant {
        timelines: Vec::new(),
        generation,
-        upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()),
    };
    let timelines = tenant.timelines.lock().unwrap().clone();

@@ -407,31 +396,15 @@ async fn upload_tenant_heatmap(

    // Serialize the heatmap
    let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
+    let bytes = bytes::Bytes::from(bytes);
+    let size = bytes.len();

    // Drop out early if nothing changed since our last upload
    let digest = md5::compute(&bytes);
-    if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) {
+    if Some(digest) == last_digest {
        return Ok(UploadHeatmapOutcome::NoChange);
    }

-    // Calculate a digest that omits atimes, so that we can distinguish actual changes in
-    // layers from changes only in atimes.
-    let heatmap_size_bytes = heatmap.get_stats().bytes;
-    let layers_only_bytes =
-        serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?;
-    let layers_only_digest = md5::compute(&layers_only_bytes);
-    if heatmap_size_bytes < tenant.get_checkpoint_distance() {
-        // For small tenants, skip upload if only atimes changed. This avoids doing frequent
-        // uploads from long-idle tenants whose atimes are just incremented by periodic
-        // size calculations.
-        if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) {
-            return Ok(UploadHeatmapOutcome::NoChange);
-        }
-    }
-
-    let bytes = bytes::Bytes::from(bytes);
-    let size = bytes.len();
-
    let path = remote_heatmap_path(tenant.get_tenant_shard_id());

    let cancel = &tenant.cancel;
@@ -463,8 +436,5 @@ async fn upload_tenant_heatmap(

    tracing::info!("Successfully uploaded {size} byte heatmap to {path}");

-    Ok(UploadHeatmapOutcome::Uploaded(LastUploadState {
-        uploaded_digest: digest,
-        layers_only_digest,
-    }))
+    Ok(UploadHeatmapOutcome::Uploaded(digest))
 }
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -179,13 +179,6 @@ where
            // Schedule some work, if concurrency limit permits it
            self.spawn_pending();

-            // This message is printed every scheduling iteration as proof of liveness when looking at logs
-            tracing::info!(
-                "Status: {} tasks running, {} pending",
-                self.running.len(),
-                self.pending.len()
-            );
-
            // Between scheduling iterations, we will:
            //  - Drain any complete tasks and spawn pending tasks
            //  - Handle incoming administrative commands
@@ -265,11 +258,7 @@ where

        self.tasks.spawn(fut);

-        let replaced = self.running.insert(tenant_shard_id, in_progress);
-        debug_assert!(replaced.is_none());
-        if replaced.is_some() {
-            tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running")
-        }
+        self.running.insert(tenant_shard_id, in_progress);
    }

    /// For all pending tenants that are elegible for execution, spawn their task.
@@ -279,9 +268,7 @@ where
        while !self.pending.is_empty() && self.running.len() < self.concurrency {
            // unwrap: loop condition includes !is_empty()
            let pending = self.pending.pop_front().unwrap();
-            if !self.running.contains_key(pending.get_tenant_shard_id()) {
-                self.do_spawn(pending);
-            }
+            self.do_spawn(pending);
        }
    }

@@ -334,8 +321,7 @@ where

        let tenant_shard_id = job.get_tenant_shard_id();
        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
-            tracing::info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                           "Command already running, waiting for it");
+            tracing::info!("Command already running, waiting for it");
            barrier
        } else {
            let running = self.spawn_now(job);
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -113,20 +113,12 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
    }
 }

-/// Bag of data accumulated during a vectored get..
+/// Bag of data accumulated during a vectored get
 pub(crate) struct ValuesReconstructState {
-    /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
-    /// should not expect to get anything from this hashmap.
    pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
-    /// The keys which are already retrieved
+
    keys_done: KeySpaceRandomAccum,
-
-    /// The keys covered by the image layers
-    keys_with_image_coverage: Option<Range<Key>>,
-
-    // Statistics that are still accessible as a caller of `get_vectored_impl`.
    layers_visited: u32,
-    delta_layers_visited: u32,
 }

 impl ValuesReconstructState {
@@ -134,9 +126,7 @@ impl ValuesReconstructState {
        Self {
            keys: HashMap::new(),
            keys_done: KeySpaceRandomAccum::new(),
-            keys_with_image_coverage: None,
            layers_visited: 0,
-            delta_layers_visited: 0,
        }
    }

@@ -150,17 +140,8 @@ impl ValuesReconstructState {
        }
    }

-    pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
+    pub(crate) fn on_layer_visited(&mut self) {
        self.layers_visited += 1;
-        if let ReadableLayer::PersistentLayer(layer) = layer {
-            if layer.layer_desc().is_delta() {
-                self.delta_layers_visited += 1;
-            }
-        }
-    }
-
-    pub(crate) fn get_delta_layers_visited(&self) -> u32 {
-        self.delta_layers_visited
    }

    pub(crate) fn get_layers_visited(&self) -> u32 {
@@ -190,16 +171,6 @@ impl ValuesReconstructState {
        }
    }

-    /// On hitting image layer, we can mark all keys in this range as done, because
-    /// if the image layer does not contain a key, it is deleted/never added.
-    pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
-        let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
-        assert_eq!(
-            prev_val, None,
-            "should consume the keyspace before the next iteration"
-        );
-    }
-
    /// Update the state collected for a given key.
    /// Returns true if this was the last value needed for the key and false otherwise.
    ///
@@ -262,12 +233,8 @@ impl ValuesReconstructState {

    /// Returns the key space describing the keys that have
    /// been marked as completed since the last call to this function.
-    /// Returns individual keys done, and the image layer coverage.
-    pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
-        (
-            self.keys_done.consume_keyspace(),
-            self.keys_with_image_coverage.take(),
-        )
+    pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
+        self.keys_done.consume_keyspace()
    }
 }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -47,7 +47,7 @@ use hex;
 use itertools::Itertools;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
-use pageserver_api::shard::{ShardIdentity, TenantShardId};
+use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::File;
@@ -158,7 +158,6 @@ pub struct ImageLayerInner {
    index_start_blk: u32,
    index_root_blk: u32,

-    key_range: Range<Key>,
    lsn: Lsn,

    file: VirtualFile,
@@ -420,7 +419,6 @@ impl ImageLayerInner {
            file,
            file_id,
            max_vectored_read_bytes,
-            key_range: actual_summary.key_range,
        }))
    }

@@ -473,27 +471,19 @@ impl ImageLayerInner {
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        let reads = self
-            .plan_reads(keyspace, None, ctx)
+            .plan_reads(keyspace, ctx)
            .await
            .map_err(GetVectoredError::Other)?;

        self.do_reads_and_update_state(reads, reconstruct_state, ctx)
            .await;

-        reconstruct_state.on_image_layer_visited(&self.key_range);
-
        Ok(())
    }

-    /// Traverse the layer's index to build read operations on the overlap of the input keyspace
-    /// and the keys in this layer.
-    ///
-    /// If shard_identity is provided, it will be used to filter keys down to those stored on
-    /// this shard.
    async fn plan_reads(
        &self,
        keyspace: KeySpace,
-        shard_identity: Option<&ShardIdentity>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<VectoredRead>> {
        let mut planner = VectoredReadPlanner::new(
@@ -513,6 +503,7 @@ impl ImageLayerInner {

        for range in keyspace.ranges.iter() {
            let mut range_end_handled = false;
+
            let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
            range.start.write_to_byte_slice(&mut search_key);

@@ -525,22 +516,12 @@ impl ImageLayerInner {
                let key = Key::from_slice(&raw_key[..KEY_SIZE]);
                assert!(key >= range.start);

-                let flag = if let Some(shard_identity) = shard_identity {
-                    if shard_identity.is_key_disposable(&key) {
-                        BlobFlag::Ignore
-                    } else {
-                        BlobFlag::None
-                    }
-                } else {
-                    BlobFlag::None
-                };
-
                if key >= range.end {
                    planner.handle_range_end(offset);
                    range_end_handled = true;
                    break;
                } else {
-                    planner.handle(key, self.lsn, offset, flag);
+                    planner.handle(key, self.lsn, offset, BlobFlag::None);
                }
            }

@@ -553,50 +534,6 @@ impl ImageLayerInner {
        Ok(planner.finish())
    }

-    /// Given a key range, select the parts of that range that should be retained by the ShardIdentity,
-    /// then execute vectored GET operations, passing the results of all read keys into the writer.
-    pub(super) async fn filter(
-        &self,
-        shard_identity: &ShardIdentity,
-        writer: &mut ImageLayerWriter,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<usize> {
-        // Fragment the range into the regions owned by this ShardIdentity
-        let plan = self
-            .plan_reads(
-                KeySpace {
-                    // If asked for the total key space, plan_reads will give us all the keys in the layer
-                    ranges: vec![Key::MIN..Key::MAX],
-                },
-                Some(shard_identity),
-                ctx,
-            )
-            .await?;
-
-        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
-        let mut key_count = 0;
-        for read in plan.into_iter() {
-            let buf_size = read.size();
-
-            let buf = BytesMut::with_capacity(buf_size);
-            let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?;
-
-            let frozen_buf = blobs_buf.buf.freeze();
-
-            for meta in blobs_buf.blobs.iter() {
-                let img_buf = frozen_buf.slice(meta.start..meta.end);
-
-                key_count += 1;
-                writer
-                    .put_image(meta.meta.key, img_buf, ctx)
-                    .await
-                    .context(format!("Storing key {}", meta.meta.key))?;
-            }
-        }
-
-        Ok(key_count)
-    }
-
    async fn do_reads_and_update_state(
        &self,
        reads: Vec<VectoredRead>,
@@ -709,7 +646,7 @@ impl ImageLayerWriterInner {
                lsn,
            },
        );
-        trace!("creating image layer {}", path);
+        info!("new image layer {path}");
        let mut file = {
            VirtualFile::open_with_options(
                &path,
@@ -829,7 +766,7 @@ impl ImageLayerWriterInner {
        // FIXME: why not carry the virtualfile here, it supports renaming?
        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        info!("created image layer {}", layer.local_path());
+        trace!("created image layer {}", layer.local_path());

        Ok(layer)
    }
@@ -914,136 +851,3 @@ impl Drop for ImageLayerWriter {
        }
    }
 }
-
-#[cfg(test)]
-mod test {
-    use bytes::Bytes;
-    use pageserver_api::{
-        key::Key,
-        shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
-    };
-    use utils::{id::TimelineId, lsn::Lsn};
-
-    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
-
-    use super::ImageLayerWriter;
-
-    #[tokio::test]
-    async fn image_layer_rewrite() {
-        let harness = TenantHarness::create("test_image_layer_rewrite").unwrap();
-        let (tenant, ctx) = harness.load().await;
-
-        // The LSN at which we will create an image layer to filter
-        let lsn = Lsn(0xdeadbeef0000);
-
-        let timeline_id = TimelineId::generate();
-        let timeline = tenant
-            .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx)
-            .await
-            .unwrap();
-
-        // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
-        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
-        let range = input_start..input_end;
-
-        // Build an image layer to filter
-        let resident = {
-            let mut writer = ImageLayerWriter::new(
-                harness.conf,
-                timeline_id,
-                harness.tenant_shard_id,
-                &range,
-                lsn,
-                &ctx,
-            )
-            .await
-            .unwrap();
-
-            let foo_img = Bytes::from_static(&[1, 2, 3, 4]);
-            let mut key = range.start;
-            while key < range.end {
-                writer.put_image(key, foo_img.clone(), &ctx).await.unwrap();
-
-                key = key.next();
-            }
-            writer.finish(&timeline, &ctx).await.unwrap()
-        };
-        let original_size = resident.metadata().file_size;
-
-        // Filter for various shards: this exercises cases like values at start of key range, end of key
-        // range, middle of key range.
-        for shard_number in 0..4 {
-            let mut filtered_writer = ImageLayerWriter::new(
-                harness.conf,
-                timeline_id,
-                harness.tenant_shard_id,
-                &range,
-                lsn,
-                &ctx,
-            )
-            .await
-            .unwrap();
-
-            // TenantHarness gave us an unsharded tenant, but we'll use a sharded ShardIdentity
-            // to exercise filter()
-            let shard_identity = ShardIdentity::new(
-                ShardNumber(shard_number),
-                ShardCount::new(4),
-                ShardStripeSize(0x8000),
-            )
-            .unwrap();
-
-            let wrote_keys = resident
-                .filter(&shard_identity, &mut filtered_writer, &ctx)
-                .await
-                .unwrap();
-            let replacement = if wrote_keys > 0 {
-                Some(filtered_writer.finish(&timeline, &ctx).await.unwrap())
-            } else {
-                None
-            };
-
-            // This exact size and those below will need updating as/when the layer encoding changes, but
-            // should be deterministic for a given version of the format, as we used no randomness generating the input.
-            assert_eq!(original_size, 1597440);
-
-            match shard_number {
-                0 => {
-                    // We should have written out just one stripe for our shard identity
-                    assert_eq!(wrote_keys, 0x8000);
-                    let replacement = replacement.unwrap();
-
-                    // We should have dropped some of the data
-                    assert!(replacement.metadata().file_size < original_size);
-                    assert!(replacement.metadata().file_size > 0);
-
-                    // Assert that we dropped ~3/4 of the data.
-                    assert_eq!(replacement.metadata().file_size, 417792);
-                }
-                1 => {
-                    // Shard 1 has no keys in our input range
-                    assert_eq!(wrote_keys, 0x0);
-                    assert!(replacement.is_none());
-                }
-                2 => {
-                    // Shard 2 has one stripes in the input range
-                    assert_eq!(wrote_keys, 0x8000);
-                    let replacement = replacement.unwrap();
-                    assert!(replacement.metadata().file_size < original_size);
-                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 417792);
-                }
-                3 => {
-                    // Shard 3 has two stripes in the input range
-                    assert_eq!(wrote_keys, 0x10000);
-                    let replacement = replacement.unwrap();
-                    assert!(replacement.metadata().file_size < original_size);
-                    assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 811008);
-                }
-                _ => unreachable!(),
-            }
-        }
-    }
-}
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -4,7 +4,7 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::{
    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
 };
-use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
+use pageserver_api::shard::{ShardIndex, TenantShardId};
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
@@ -12,7 +12,7 @@ use std::time::{Duration, SystemTime};
 use tracing::Instrument;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
-use utils::sync::{gate, heavier_once_cell};
+use utils::sync::heavier_once_cell;

 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -23,10 +23,10 @@ use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

 use super::delta_layer::{self, DeltaEntry};
-use super::image_layer::{self};
+use super::image_layer;
 use super::{
-    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-    PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
+    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerName, PersistentLayerDesc,
+    ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };

 use utils::generation::Generation;
@@ -129,16 +129,19 @@ pub(crate) fn local_layer_path(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    layer_file_name: &LayerName,
-    generation: &Generation,
+    _generation: &Generation,
 ) -> Utf8PathBuf {
    let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id);

-    if generation.is_none() {
-        // Without a generation, we may only use legacy path style
-        timeline_path.join(layer_file_name.to_string())
-    } else {
-        timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
-    }
+    timeline_path.join(layer_file_name.to_string())
+
+    // TODO: switch to enabling new-style layer paths after next release
+    // if generation.is_none() {
+    //     // Without a generation, we may only use legacy path style
+    //     timeline_path.join(layer_file_name.to_string())
+    // } else {
+    //     timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix()))
+    // }
 }

 impl Layer {
@@ -161,7 +164,7 @@ impl Layer {
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size,
+            metadata.file_size(),
        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
@@ -194,7 +197,7 @@ impl Layer {
            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
-            metadata.file_size,
+            metadata.file_size(),
        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
@@ -227,7 +230,7 @@ impl Layer {

        timeline
            .metrics
-            .resident_physical_size_add(metadata.file_size);
+            .resident_physical_size_add(metadata.file_size());

        ResidentLayer { downloaded, owner }
    }
@@ -585,6 +588,9 @@ struct LayerInner {
    /// [`Timeline::gate`] at the same time.
    timeline: Weak<Timeline>,

+    /// Cached knowledge of [`Timeline::remote_client`] being `Some`.
+    have_remote_client: bool,
+
    access_stats: LayerAccessStats,

    /// This custom OnceCell is backed by std mutex, but only held for short time periods.
@@ -729,23 +735,23 @@ impl Drop for LayerInner {
            if removed {
                timeline.metrics.resident_physical_size_sub(file_size);
            }
-            let res = timeline
-                .remote_client
-                .schedule_deletion_of_unlinked(vec![(file_name, meta)]);
+            if let Some(remote_client) = timeline.remote_client.as_ref() {
+                let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);

-            if let Err(e) = res {
-                // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
-                // demonstrating this deadlock (without spawn_blocking): stop will drop
-                // queued items, which will have ResidentLayer's, and those drops would try
-                // to re-entrantly lock the RemoteTimelineClient inner state.
-                if !timeline.is_active() {
-                    tracing::info!("scheduling deletion on drop failed: {e:#}");
+                if let Err(e) = res {
+                    // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
+                    // demonstrating this deadlock (without spawn_blocking): stop will drop
+                    // queued items, which will have ResidentLayer's, and those drops would try
+                    // to re-entrantly lock the RemoteTimelineClient inner state.
+                    if !timeline.is_active() {
+                        tracing::info!("scheduling deletion on drop failed: {e:#}");
+                    } else {
+                        tracing::warn!("scheduling deletion on drop failed: {e:#}");
+                    }
+                    LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
                } else {
-                    tracing::warn!("scheduling deletion on drop failed: {e:#}");
+                    LAYER_IMPL_METRICS.inc_completed_deletes();
                }
-                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
-            } else {
-                LAYER_IMPL_METRICS.inc_completed_deletes();
            }
        });
    }
@@ -783,6 +789,7 @@ impl LayerInner {
            path: local_path,
            desc,
            timeline: Arc::downgrade(timeline),
+            have_remote_client: timeline.remote_client.is_some(),
            access_stats,
            wanted_deleted: AtomicBool::new(false),
            inner,
@@ -811,6 +818,8 @@ impl LayerInner {
    /// in a new attempt to evict OR join the previously started attempt.
    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))]
    pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
+        assert!(self.have_remote_client);
+
        let mut rx = self.status.as_ref().unwrap().subscribe();

        {
@@ -967,6 +976,10 @@ impl LayerInner {
            return Err(DownloadError::NotFile(ft));
        }

+        if timeline.remote_client.as_ref().is_none() {
+            return Err(DownloadError::NoRemoteStorage);
+        }
+
        if let Some(ctx) = ctx {
            self.check_expected_download(ctx)?;
        }
@@ -1103,12 +1116,15 @@ impl LayerInner {
        permit: heavier_once_cell::InitPermit,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<DownloadedLayer>> {
-        let result = timeline
+        let client = timeline
            .remote_client
+            .as_ref()
+            .expect("checked before download_init_and_wait");
+
+        let result = client
            .download_layer_file(
                &self.desc.layer_name(),
                &self.metadata(),
-                &self.path,
                &timeline.cancel,
                ctx,
            )
@@ -1264,7 +1280,6 @@ impl LayerInner {
                lsn_end: lsn_range.end,
                remote: !resident,
                access_stats,
-                l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
            }
        } else {
            let lsn = self.desc.image_layer_lsn();
@@ -1281,10 +1296,20 @@ impl LayerInner {

    /// `DownloadedLayer` is being dropped, so it calls this method.
    fn on_downloaded_layer_drop(self: Arc<LayerInner>, only_version: usize) {
+        let can_evict = self.have_remote_client;
+
        // we cannot know without inspecting LayerInner::inner if we should evict or not, even
        // though here it is very likely
        let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version);

+        if !can_evict {
+            // ideally this would be an assertion, but this runs while `DownloadedLayer` is being dropped, so only log an error
+            span.in_scope(|| {
+                tracing::error!("bug in struct Layer: ResidentOrWantedEvicted has been downgraded while we have no remote storage");
+            });
+            return;
+        }
+
        // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
        // drop while the `self.inner` is being locked, leading to a deadlock.

@@ -1333,7 +1358,7 @@ impl LayerInner {

        is_good_to_continue(&rx.borrow_and_update())?;

-        let Ok(gate) = timeline.gate.enter() else {
+        let Ok(_gate) = timeline.gate.enter() else {
            return Err(EvictionCancelled::TimelineGone);
        };

@@ -1421,7 +1446,7 @@ impl LayerInner {
        Self::spawn_blocking(move || {
            let _span = span.entered();

-            let res = self.evict_blocking(&timeline, &gate, &permit);
+            let res = self.evict_blocking(&timeline, &permit);

            let waiters = self.inner.initializer_count();

@@ -1447,7 +1472,6 @@ impl LayerInner {
    fn evict_blocking(
        &self,
        timeline: &Timeline,
-        _gate: &gate::GateGuard,
        _permit: &heavier_once_cell::InitPermit,
    ) -> Result<(), EvictionCancelled> {
        // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit`
@@ -1557,6 +1581,8 @@ pub(crate) enum EvictionError {
 pub(crate) enum DownloadError {
    #[error("timeline has already shutdown")]
    TimelineShutdown,
+    #[error("no remote storage configured")]
+    NoRemoteStorage,
    #[error("context denies downloading")]
    ContextAndConfigReallyDeniesDownloads,
    #[error("downloading is really required but not allowed by this method")]
@@ -1802,15 +1828,16 @@ impl ResidentLayer {
        use LayerKind::*;

        let owner = &self.owner.0;
+
        match self.downloaded.get(owner, ctx).await? {
            Delta(ref d) => {
-                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
-                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
-                // while it's being held.
                owner
                    .access_stats
                    .record_access(LayerAccessKind::KeyIter, ctx);

+                // this is valid because the DownloadedLayer::kind is a OnceCell, not a
+                // Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
+                // while it's being held.
                delta_layer::DeltaLayerInner::load_keys(d, ctx)
                    .await
                    .with_context(|| format!("Layer index is corrupted for {self}"))
@@ -1819,23 +1846,6 @@ impl ResidentLayer {
        }
    }

-    /// Read all they keys in this layer which match the ShardIdentity, and write them all to
-    /// the provided writer.  Return the number of keys written.
-    #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))]
-    pub(crate) async fn filter<'a>(
-        &'a self,
-        shard_identity: &ShardIdentity,
-        writer: &mut ImageLayerWriter,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<usize> {
-        use LayerKind::*;
-
-        match self.downloaded.get(&self.owner.0, ctx).await? {
-            Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")),
-            Image(i) => i.filter(shard_identity, writer, ctx).await,
-        }
-    }
-
    /// Returns the amount of keys and values written to the writer.
    pub(crate) async fn copy_delta_prefix(
        &self,
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -145,7 +145,7 @@ async fn smoke_test() {
        .await
        .expect("the local layer file still exists");

-    let rtc = &timeline.remote_client;
+    let rtc = timeline.remote_client.as_ref().unwrap();

    {
        let layers = &[layer];
@@ -761,7 +761,13 @@ async fn eviction_cancellation_on_drop() {
    timeline.freeze_and_flush().await.unwrap();

    // wait for the upload to complete so our Arc::strong_count assertion holds
-    timeline.remote_client.wait_completion().await.unwrap();
+    timeline
+        .remote_client
+        .as_ref()
+        .unwrap()
+        .wait_completion()
+        .await
+        .unwrap();

    let (evicted_layer, not_evicted) = {
        let mut layers = {
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -347,33 +347,37 @@ impl<'de> serde::de::Visitor<'de> for LayerNameVisitor {
 mod test {
    use super::*;
    #[test]
-    fn image_layer_parse() {
+    fn image_layer_parse() -> anyhow::Result<()> {
        let expected = LayerName::Image(ImageLayerName {
            key_range: Key::from_i128(0)
                ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
            lsn: Lsn::from_hex("00000000014FED58").unwrap(),
        });
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
        assert_eq!(parsed, expected,);

        // Omitting generation suffix is valid
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").map_err(|s| anyhow::anyhow!(s))?;
        assert_eq!(parsed, expected,);
+
+        Ok(())
    }

    #[test]
-    fn delta_layer_parse() {
+    fn delta_layer_parse() -> anyhow::Result<()> {
        let expected = LayerName::Delta(DeltaLayerName {
            key_range: Key::from_i128(0)
                ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(),
            lsn_range: Lsn::from_hex("00000000014FED58").unwrap()
                ..Lsn::from_hex("000000000154C481").unwrap(),
        });
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").map_err(|s| anyhow::anyhow!(s))?;
        assert_eq!(parsed, expected);

        // Omitting generation suffix is valid
-        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
+        let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").map_err(|s| anyhow::anyhow!(s))?;
        assert_eq!(parsed, expected);
+
+        Ok(())
    }
 }
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -17,7 +17,7 @@ use crate::tenant::{Tenant, TenantState};
 use rand::Rng;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::{backoff, completion, pausable_failpoint};
+use utils::{backoff, completion};

 static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
    once_cell::sync::Lazy::new(|| {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -9,10 +9,7 @@ use std::ops::{Deref, Range};
 use std::sync::Arc;

 use super::layer_manager::LayerManager;
-use super::{
-    CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
-    RecordedDuration, Timeline,
-};
+use super::{CompactFlags, DurationRecorder, ImageLayerCreationMode, RecordedDuration, Timeline};

 use anyhow::{anyhow, Context};
 use enumset::EnumSet;
@@ -25,13 +22,14 @@ use tracing::{debug, info, info_span, trace, warn, Instrument};
 use utils::id::TimelineId;

 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
-use crate::page_cache;
 use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
-use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
+use crate::tenant::timeline::{drop_rlock, is_rel_fsm_block_key, is_rel_vm_block_key, Hole};
 use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
 use crate::tenant::DeltaLayer;
+use crate::tenant::PageReconstructError;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
+use crate::{page_cache, ZERO_PAGE};

 use crate::keyspace::KeySpace;
 use crate::repository::Key;
@@ -118,13 +116,9 @@ impl Timeline {

                // 3. Create new image layers for partitions that have been modified
                // "enough".
-                let mut partitioning = dense_partitioning;
-                partitioning
-                    .parts
-                    .extend(sparse_partitioning.into_dense().parts);
-                let image_layers = self
+                let dense_layers = self
                    .create_image_layers(
-                        &partitioning,
+                        &dense_partitioning,
                        lsn,
                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
                            ImageLayerCreationMode::Force
@@ -136,8 +130,24 @@ impl Timeline {
                    .await
                    .map_err(anyhow::Error::from)?;

-                self.upload_new_image_layers(image_layers)?;
-                partitioning.parts.len()
+                // For now, the sparse partitioning is expected to produce no image layers (asserted below).
+                let sparse_layers = self
+                    .create_image_layers(
+                        &sparse_partitioning.clone().into_dense(),
+                        lsn,
+                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
+                            ImageLayerCreationMode::Force
+                        } else {
+                            ImageLayerCreationMode::Try
+                        },
+                        &image_ctx,
+                    )
+                    .await
+                    .map_err(anyhow::Error::from)?;
+                assert!(sparse_layers.is_empty());
+
+                self.upload_new_image_layers(dense_layers)?;
+                dense_partitioning.parts.len()
            }
            Err(err) => {
                // no partitioning? This is normal, if the timeline was just created
@@ -176,24 +186,13 @@ impl Timeline {
    async fn compact_shard_ancestors(
        self: &Arc<Self>,
        rewrite_max: usize,
-        ctx: &RequestContext,
+        _ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut drop_layers = Vec::new();
-        let mut layers_to_rewrite: Vec<Layer> = Vec::new();
+        let layers_to_rewrite: Vec<Layer> = Vec::new();

-        // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
-        // layer is behind this Lsn, it indicates that the layer is being retained beyond the
-        // pitr_interval, for example because a branchpoint references it.
-        //
-        // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
-        // are rewriting layers.
-        let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
-
-        tracing::info!(
-            "latest_gc_cutoff: {}, pitr cutoff {}",
-            *latest_gc_cutoff,
-            self.gc_info.read().unwrap().cutoffs.pitr
-        );
+        // We will use the PITR cutoff as a condition for rewriting layers.
+        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.pitr;

        let layers = self.layers.read().await;
        for layer_desc in layers.layer_map().iter_historic_layers() {
@@ -252,9 +251,9 @@ impl Timeline {

            // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually
            // without incurring the I/O cost of a rewrite.
-            if layer_desc.get_lsn_range().end >= *latest_gc_cutoff {
-                debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})",
-                    layer_desc.get_lsn_range().end, *latest_gc_cutoff);
+            if layer_desc.get_lsn_range().end >= pitr_cutoff {
+                debug!(%layer, "Skipping rewrite of layer still in PITR window ({} >= {})",
+                    layer_desc.get_lsn_range().end, pitr_cutoff);
                continue;
            }

@@ -264,10 +263,13 @@ impl Timeline {
                continue;
            }

-            // Only rewrite layers if their generations differ.  This guarantees:
-            //  - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one
-            //  - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage
-            if layer.metadata().generation == self.generation {
+            // Only rewrite layers if they would have different remote paths: either they belong to this
+            // shard but an old generation, or they belonged to another shard.  This also implicitly
+            // guarantees that the layer is persistent in remote storage (as only remote persistent
+            // layers are carried across shard splits, any local-only layer would be in the current generation)
+            if layer.metadata().generation == self.generation
+                && layer.metadata().shard.shard_count == self.shard_identity.count
+            {
                debug!(%layer, "Skipping rewrite, is not from old generation");
                continue;
            }
@@ -280,77 +282,26 @@ impl Timeline {
            }

            // Fall through: all our conditions for doing a rewrite passed.
-            layers_to_rewrite.push(layer);
+            // TODO: implement rewriting
+            tracing::debug!(%layer, "Would rewrite layer");
        }

-        // Drop read lock on layer map before we start doing time-consuming I/O
+        // Drop the layers read lock: we will acquire it for write in [`Self::rewrite_layers`]
        drop(layers);

-        let mut replace_image_layers = Vec::new();
-
-        for layer in layers_to_rewrite {
-            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
-            let mut image_layer_writer = ImageLayerWriter::new(
-                self.conf,
-                self.timeline_id,
-                self.tenant_shard_id,
-                &layer.layer_desc().key_range,
-                layer.layer_desc().image_layer_lsn(),
-                ctx,
-            )
-            .await?;
-
-            // Safety of layer rewrites:
-            // - We are writing to a different local file path than we are reading from, so the old Layer
-            //   cannot interfere with the new one.
-            // - In the page cache, contents for a particular VirtualFile are stored with a file_id that
-            //   is different for two layers with the same name (in `ImageLayerInner::new` we always
-            //   acquire a fresh id from [`crate::page_cache::next_file_id`].  So readers do not risk
-            //   reading the index from one layer file, and then data blocks from the rewritten layer file.
-            // - Any readers that have a reference to the old layer will keep it alive until they are done
-            //   with it. If they are trying to promote from remote storage, that will fail, but this is the same
-            //   as for compaction generally: compaction is allowed to delete layers that readers might be trying to use.
-            // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are:
-            //    - GC, which at worst witnesses us "undelete" a layer that they just deleted.
-            //    - ingestion, which only inserts layers, therefore cannot collide with us.
-            let resident = layer.download_and_keep_resident().await?;
-
-            let keys_written = resident
-                .filter(&self.shard_identity, &mut image_layer_writer, ctx)
-                .await?;
-
-            if keys_written > 0 {
-                let new_layer = image_layer_writer.finish(self, ctx).await?;
-                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
-                    layer.metadata().file_size,
-                    new_layer.metadata().file_size);
-
-                replace_image_layers.push((layer, new_layer));
-            } else {
-                // Drop the old layer.  Usually for this case we would already have noticed that
-                // the layer has no data for us with the ShardedRange check above, but
-                drop_layers.push(layer);
-            }
-        }
-
-        // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
-        // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
-        // to remote index) and be removed. This is inefficient but safe.
-        fail::fail_point!("compact-shard-ancestors-localonly");
+        // TODO: collect layers to rewrite
+        let replace_layers = Vec::new();

        // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage
-        self.rewrite_layers(replace_image_layers, drop_layers)
-            .await?;
+        self.rewrite_layers(replace_layers, drop_layers).await?;

-        fail::fail_point!("compact-shard-ancestors-enqueued");
-
-        // We wait for all uploads to complete before finishing this compaction stage.  This is not
-        // necessary for correctness, but it simplifies testing, and avoids proceeding with another
-        // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
-        // load.
-        self.remote_client.wait_completion().await?;
-
-        fail::fail_point!("compact-shard-ancestors-persistent");
+        if let Some(remote_client) = self.remote_client.as_ref() {
+            // We wait for all uploads to complete before finishing this compaction stage.  This is not
+            // necessary for correctness, but it simplifies testing, and avoids proceeding with another
+            // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
+            // load.
+            remote_client.wait_completion().await?;
+        }

        Ok(())
    }
@@ -550,11 +501,8 @@ impl Timeline {

        for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
            if let Some(prev_key) = prev {
-                // just first fast filter, do not create hole entries for metadata keys. The last hole in the
-                // compaction is the gap between data key and metadata keys.
-                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
-                    && !Key::is_metadata_key(&prev_key)
-                {
+                // just first fast filter
+                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
                    let key_range = prev_key..next_key;
                    // Measuring hole by just subtraction of i128 representation of key range boundaries
                    // has not so much sense, because largest holes will corresponds field1/field2 changes.
@@ -1213,10 +1161,10 @@ impl TimelineAdaptor {
        lsn: Lsn,
        key_range: &Range<Key>,
        ctx: &RequestContext,
-    ) -> Result<(), CreateImageLayersError> {
+    ) -> Result<(), PageReconstructError> {
        let timer = self.timeline.metrics.create_images_time_histo.start_timer();

-        let image_layer_writer = ImageLayerWriter::new(
+        let mut image_layer_writer = ImageLayerWriter::new(
            self.timeline.conf,
            self.timeline.timeline_id,
            self.timeline.tenant_shard_id,
@@ -1227,34 +1175,47 @@ impl TimelineAdaptor {
        .await?;

        fail_point!("image-layer-writer-fail-before-finish", |_| {
-            Err(CreateImageLayersError::Other(anyhow::anyhow!(
+            Err(PageReconstructError::Other(anyhow::anyhow!(
                "failpoint image-layer-writer-fail-before-finish"
            )))
        });
-
-        let keyspace = KeySpace {
-            ranges: self.get_keyspace(key_range, lsn, ctx).await?,
-        };
-        // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
-        let start = Key::MIN;
-        let ImageLayerCreationOutcome {
-            image,
-            next_start_key: _,
-        } = self
-            .timeline
-            .create_image_layer_for_rel_blocks(
-                &keyspace,
-                image_layer_writer,
-                lsn,
-                ctx,
-                key_range.clone(),
-                start,
-            )
-            .await?;
-
-        if let Some(image_layer) = image {
-            self.new_images.push(image_layer);
+        let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
+        for range in &keyspace_ranges {
+            let mut key = range.start;
+            while key < range.end {
+                let img = match self.timeline.get(key, lsn, ctx).await {
+                    Ok(img) => img,
+                    Err(err) => {
+                        // If we fail to reconstruct a VM or FSM page, we can zero the
+                        // page without losing any actual user data. That seems better
+                        // than failing repeatedly and getting stuck.
+                        //
+                        // We had a bug at one point, where we truncated the FSM and VM
+                        // in the pageserver, but the Postgres didn't know about that
+                        // and continued to generate incremental WAL records for pages
+                        // that didn't exist in the pageserver. Trying to replay those
+                        // WAL records failed to find the previous image of the page.
+                        // This special case allows us to recover from that situation.
+                        // See https://github.com/neondatabase/neon/issues/2601.
+                        //
+                        // Unfortunately we cannot do this for the main fork, or for
+                        // any metadata keys, as that would lead to actual data
+                        // loss.
+                        if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
+                            warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
+                            ZERO_PAGE.clone()
+                        } else {
+                            return Err(err);
+                        }
+                    }
+                };
+                image_layer_writer.put_image(key, img, ctx).await?;
+                key = key.next();
+            }
        }
+        let image_layer = image_layer_writer.finish(&self.timeline, ctx).await?;
+
+        self.new_images.push(image_layer);

        timer.stop_and_record();

--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -7,7 +7,7 @@ use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
 use tracing::{error, info, instrument, Instrument};
-use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
+use utils::{crashsafe, fs_ext, id::TimelineId};

 use crate::{
    config::PageServerConf,
@@ -26,21 +26,19 @@ use super::{Timeline, TimelineResources};
 /// during attach or pageserver restart.
 /// See comment in persist_index_part_with_deleted_flag.
 async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
-    match timeline
-        .remote_client
-        .persist_index_part_with_deleted_flag()
-        .await
-    {
-        // If we (now, or already) marked it successfully as deleted, we can proceed
-        Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
-        // Bail out otherwise
-        //
-        // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
-        // two tasks from performing the deletion at the same time. The first task
-        // that starts deletion should run it to completion.
-        Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
-        | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
-            return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
+    if let Some(remote_client) = timeline.remote_client.as_ref() {
+        match remote_client.persist_index_part_with_deleted_flag().await {
+            // If we (now, or already) marked it successfully as deleted, we can proceed
+            Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
+            // Bail out otherwise
+            //
+            // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
+            // two tasks from performing the deletion at the same time. The first task
+            // that starts deletion should run it to completion.
+            Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
+            | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
+                return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
+            }
        }
    }
    Ok(())
@@ -119,11 +117,11 @@ pub(super) async fn delete_local_timeline_directory(

 /// Removes remote layers and an index file after them.
 async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
-    timeline
-        .remote_client
-        .delete_all()
-        .await
-        .context("delete_all")
+    if let Some(remote_client) = &timeline.remote_client {
+        remote_client.delete_all().await.context("delete_all")?
+    };
+
+    Ok(())
 }

 // This function removs remaining traces of a timeline on disk.
@@ -262,7 +260,7 @@ impl DeleteTimelineFlow {
        tenant: Arc<Tenant>,
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
-        remote_client: RemoteTimelineClient,
+        remote_client: Option<RemoteTimelineClient>,
        deletion_queue_client: DeletionQueueClient,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
@@ -280,8 +278,6 @@ impl DeleteTimelineFlow {
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
                CreateTimelineCause::Delete,
-                // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace
-                None,
            )
            .context("create_timeline_struct")?;

--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -12,7 +12,7 @@ use crate::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
-use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn};
+use utils::{completion, generation::Generation, id::TimelineId, lsn::Lsn};

 #[derive(Debug, thiserror::Error)]
 pub(crate) enum Error {
@@ -41,27 +41,6 @@ pub(crate) enum Error {
    Unexpected(#[source] anyhow::Error),
 }

-impl From<Error> for ApiError {
-    fn from(value: Error) -> Self {
-        match value {
-            e @ Error::NoAncestor => ApiError::Conflict(e.to_string()),
-            // TODO: ApiError converts the anyhow using debug formatting ... just stop using ApiError?
-            e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)),
-            Error::ShuttingDown => ApiError::ShuttingDown,
-            Error::OtherTimelineDetachOngoing(_) => {
-                ApiError::ResourceUnavailable("other timeline detach is already ongoing".into())
-            }
-            // All of these contain shutdown errors, in fact, it's the most common
-            e @ Error::FlushAncestor(_)
-            | e @ Error::RewrittenDeltaDownloadFailed(_)
-            | e @ Error::CopyDeltaPrefix(_)
-            | e @ Error::UploadRewritten(_)
-            | e @ Error::CopyFailed(_)
-            | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()),
-        }
-    }
-}
-
 pub(crate) struct PreparedTimelineDetach {
    layers: Vec<Layer>,
 }
@@ -77,7 +56,7 @@ impl Default for Options {
    fn default() -> Self {
        Self {
            rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(),
-            copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(),
+            copy_concurrency: std::num::NonZeroUsize::new(10).unwrap(),
        }
    }
 }
@@ -91,16 +70,15 @@ pub(super) async fn prepare(
 ) -> Result<(completion::Completion, PreparedTimelineDetach), Error> {
    use Error::*;

+    if detached.remote_client.as_ref().is_none() {
+        unimplemented!("no new code for running without remote storage");
+    }
+
    let Some((ancestor, ancestor_lsn)) = detached
        .ancestor_timeline
        .as_ref()
        .map(|tl| (tl.clone(), detached.ancestor_lsn))
    else {
-        // TODO: check if we have already been detached; for this we need to read the stored data
-        // on remote client, for that we need a follow-up which makes uploads cheaper and maintains
-        // a projection of the commited data.
-        //
-        // the error is wrong per openapi
        return Err(NoAncestor);
    };

@@ -110,7 +88,7 @@ pub(super) async fn prepare(

    if ancestor.ancestor_timeline.is_some() {
        // non-technical requirement; we could flatten N ancestors just as easily but we chose
-        // not to, at least initially
+        // not to
        return Err(TooManyAncestors);
    }

@@ -337,6 +315,8 @@ async fn upload_rewritten_layer(
    // FIXME: better shuttingdown error
    target
        .remote_client
+        .as_ref()
+        .unwrap()
        .upload_layer_file(&copied, cancel)
        .await
        .map_err(UploadRewritten)?;
@@ -426,6 +406,8 @@ async fn remote_copy(
    // FIXME: better shuttingdown error
    adoptee
        .remote_client
+        .as_ref()
+        .unwrap()
        .copy_timeline_layer(adopted, &owned, cancel)
        .await
        .map(move |()| owned)
@@ -439,6 +421,11 @@ pub(super) async fn complete(
    prepared: PreparedTimelineDetach,
    _ctx: &RequestContext,
 ) -> Result<Vec<TimelineId>, anyhow::Error> {
+    let rtc = detached
+        .remote_client
+        .as_ref()
+        .expect("has to have a remote timeline client for timeline ancestor detach");
+
    let PreparedTimelineDetach { layers } = prepared;

    let ancestor = detached
@@ -455,13 +442,11 @@ pub(super) async fn complete(
    //
    // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart
    // which could give us a completely wrong layer combination.
-    detached
-        .remote_client
-        .schedule_adding_existing_layers_to_index_detach_and_wait(
-            &layers,
-            (ancestor.timeline_id, ancestor_lsn),
-        )
-        .await?;
+    rtc.schedule_adding_existing_layers_to_index_detach_and_wait(
+        &layers,
+        (ancestor.timeline_id, ancestor_lsn),
+    )
+    .await?;

    let mut tasks = tokio::task::JoinSet::new();

@@ -506,6 +491,8 @@ pub(super) async fn complete(
                async move {
                    let res = timeline
                        .remote_client
+                        .as_ref()
+                        .expect("reparented has to have remote client because detached has one")
                        .schedule_reparenting_and_wait(&new_parent)
                        .await;

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -23,7 +23,7 @@ use std::{
 use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, info, info_span, instrument, warn, Instrument};
+use tracing::{debug, error, info, info_span, instrument, warn, Instrument};

 use crate::{
    context::{DownloadBehavior, RequestContext},
@@ -211,6 +211,11 @@ impl Timeline {

        // So, we just need to deal with this.

+        if self.remote_client.is_none() {
+            error!("no remote storage configured, cannot evict layers");
+            return ControlFlow::Continue(());
+        }
+
        let mut js = tokio::task::JoinSet::new();
        {
            let guard = self.layers.read().await;
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -7,28 +7,30 @@ use crate::{
            index::{IndexPart, LayerFileMetadata},
        },
        storage_layer::LayerName,
+        Generation,
    },
+    METADATA_FILE_NAME,
 };
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
-use std::{
-    collections::{hash_map, HashMap},
-    str::FromStr,
-};
+use pageserver_api::shard::ShardIndex;
+use std::{collections::HashMap, str::FromStr};
 use utils::lsn::Lsn;

 /// Identified files in the timeline directory.
 pub(super) enum Discovered {
    /// The only one we care about
-    Layer(LayerName, LocalLayerFileMetadata),
+    Layer(LayerName, Utf8PathBuf, u64),
    /// Old ephmeral files from previous launches, should be removed
    Ephemeral(String),
    /// Old temporary timeline files, unsure what these really are, should be removed
    Temporary(String),
    /// Temporary on-demand download files, should be removed
    TemporaryDownload(String),
+    /// "metadata" file we persist locally and include in `index_part.json`
+    Metadata,
    /// Backup file from previously future layers
-    IgnoredBackup(Utf8PathBuf),
+    IgnoredBackup,
    /// Unrecognized, warn about these
    Unknown(String),
 }
@@ -44,15 +46,14 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
        let discovered = match LayerName::from_str(&file_name) {
            Ok(file_name) => {
                let file_size = direntry.metadata()?.len();
-                Discovered::Layer(
-                    file_name,
-                    LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size),
-                )
+                Discovered::Layer(file_name, direntry.path().to_owned(), file_size)
            }
            Err(_) => {
-                if file_name.ends_with(".old") {
+                if file_name == METADATA_FILE_NAME {
+                    Discovered::Metadata
+                } else if file_name.ends_with(".old") {
                    // ignore these
-                    Discovered::IgnoredBackup(direntry.path().to_owned())
+                    Discovered::IgnoredBackup
                } else if remote_timeline_client::is_temp_download_file(direntry.path()) {
                    Discovered::TemporaryDownload(file_name)
                } else if is_ephemeral_file(&file_name) {
@@ -75,32 +76,37 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
 /// this structure extends it with metadata describing the layer's presence in local storage.
 #[derive(Clone, Debug)]
 pub(super) struct LocalLayerFileMetadata {
-    pub(super) file_size: u64,
+    pub(super) metadata: LayerFileMetadata,
    pub(super) local_path: Utf8PathBuf,
 }

 impl LocalLayerFileMetadata {
-    pub fn new(local_path: Utf8PathBuf, file_size: u64) -> Self {
+    pub fn new(
+        local_path: Utf8PathBuf,
+        file_size: u64,
+        generation: Generation,
+        shard: ShardIndex,
+    ) -> Self {
        Self {
            local_path,
-            file_size,
+            metadata: LayerFileMetadata::new(file_size, generation, shard),
        }
    }
 }

-/// For a layer that is present in remote metadata, this type describes how to handle
-/// it during startup: it is either Resident (and we have some metadata about a local file),
-/// or it is Evicted (and we only have remote metadata).
+/// Decision on what to do with a layer file after considering its local and remote metadata.
 #[derive(Clone, Debug)]
 pub(super) enum Decision {
    /// The layer is not present locally.
    Evicted(LayerFileMetadata),
-    /// The layer is present locally, and metadata matches: we may hook up this layer to the
-    /// existing file in local storage.
-    Resident {
+    /// The layer is present locally, but local metadata does not match remote; we must
+    /// delete it and treat it as evicted.
+    UseRemote {
        local: LocalLayerFileMetadata,
        remote: LayerFileMetadata,
    },
+    /// The layer is present locally, and metadata matches.
+    UseLocal(LocalLayerFileMetadata),
 }

 /// A layer needs to be left out of the layer map.
@@ -116,81 +122,77 @@ pub(super) enum DismissedLayer {
    /// In order to make crash safe updates to layer map, we must dismiss layers which are only
    /// found locally or not yet included in the remote `index_part.json`.
    LocalOnly(LocalLayerFileMetadata),
-
-    /// The layer exists in remote storage but the local layer's metadata (e.g. file size)
-    /// does not match it
-    BadMetadata(LocalLayerFileMetadata),
 }

 /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
 pub(super) fn reconcile(
-    local_layers: Vec<(LayerName, LocalLayerFileMetadata)>,
+    discovered: Vec<(LayerName, Utf8PathBuf, u64)>,
    index_part: Option<&IndexPart>,
    disk_consistent_lsn: Lsn,
+    generation: Generation,
+    shard: ShardIndex,
 ) -> Vec<(LayerName, Result<Decision, DismissedLayer>)> {
-    let Some(index_part) = index_part else {
-        // If we have no remote metadata, no local layer files are considered valid to load
-        return local_layers
-            .into_iter()
-            .map(|(layer_name, local_metadata)| {
-                (layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))
-            })
-            .collect();
-    };
+    use Decision::*;

-    let mut result = Vec::new();
+    // name => (local_metadata, remote_metadata)
+    type Collected =
+        HashMap<LayerName, (Option<LocalLayerFileMetadata>, Option<LayerFileMetadata>)>;

-    let mut remote_layers = HashMap::new();
+    let mut discovered = discovered
+        .into_iter()
+        .map(|(layer_name, local_path, file_size)| {
+            (
+                layer_name,
+                // The generation and shard here will be corrected to match IndexPart in the merge below, unless
+                // it is not in IndexPart, in which case using our current generation makes sense
+                // because it will be uploaded in this generation.
+                (
+                    Some(LocalLayerFileMetadata::new(
+                        local_path, file_size, generation, shard,
+                    )),
+                    None,
+                ),
+            )
+        })
+        .collect::<Collected>();

-    // Construct Decisions for layers that are found locally, if they're in remote metadata.  Otherwise
-    // construct DismissedLayers to get rid of them.
-    for (layer_name, local_metadata) in local_layers {
-        let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else {
-            result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata))));
-            continue;
-        };
-
-        if remote_metadata.file_size != local_metadata.file_size {
-            result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata))));
-            continue;
-        }
-
-        remote_layers.insert(
-            layer_name,
-            Decision::Resident {
-                local: local_metadata,
-                remote: remote_metadata.clone(),
-            },
-        );
-    }
-
-    // Construct Decision for layers that were not found locally
+    // merge any index_part information, when available
    index_part
-        .layer_metadata
-        .iter()
+        .as_ref()
+        .map(|ip| ip.layer_metadata.iter())
+        .into_iter()
+        .flatten()
+        .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata)))
        .for_each(|(name, metadata)| {
-            if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) {
-                entry.insert(Decision::Evicted(metadata.clone()));
+            if let Some(existing) = discovered.get_mut(name) {
+                existing.1 = Some(metadata);
+            } else {
+                discovered.insert(name.to_owned(), (None, Some(metadata)));
            }
        });

-    // For layers that were found in authoritative remote metadata, apply a final check that they are within
-    // the disk_consistent_lsn.
-    result.extend(remote_layers.into_iter().map(|(name, decision)| {
-        if name.is_in_future(disk_consistent_lsn) {
-            match decision {
-                Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })),
-                Decision::Resident {
-                    local,
-                    remote: _remote,
-                } => (name, Err(DismissedLayer::Future { local: Some(local) })),
-            }
-        } else {
-            (name, Ok(decision))
-        }
-    }));
+    discovered
+        .into_iter()
+        .map(|(name, (local, remote))| {
+            let decision = if name.is_in_future(disk_consistent_lsn) {
+                Err(DismissedLayer::Future { local })
+            } else {
+                match (local, remote) {
+                    (Some(local), Some(remote)) if local.metadata != remote => {
+                        Ok(UseRemote { local, remote })
+                    }
+                    (Some(x), Some(_)) => Ok(UseLocal(x)),
+                    (None, Some(x)) => Ok(Evicted(x)),
+                    (Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
+                    (None, None) => {
+                        unreachable!("there must not be any non-local non-remote files")
+                    }
+                }
+            };

-    result
+            (name, decision)
+        })
+        .collect::<Vec<_>>()
 }

 pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
@@ -199,15 +201,25 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
    std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
 }

-pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> {
-    let local_size = local.file_size;
+pub(super) fn cleanup_local_file_for_remote(
+    local: &LocalLayerFileMetadata,
+    remote: &LayerFileMetadata,
+) -> anyhow::Result<()> {
+    let local_size = local.metadata.file_size();
+    let remote_size = remote.file_size();
    let path = &local.local_path;
-    let file_name = path.file_name().expect("must be file path");
-    tracing::warn!(
-        "removing local file {file_name:?} because it has unexpected length {local_size};"
-    );

-    std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}"))
+    let file_name = path.file_name().expect("must be file path");
+    tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
+    if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
+        assert!(
+            path.exists(),
+            "we would leave the local_layer without a file if this does not hold: {path}",
+        );
+        Err(err)
+    } else {
+        Ok(())
+    }
 }

 pub(super) fn cleanup_future_layer(
@@ -229,8 +241,8 @@ pub(super) fn cleanup_local_only_file(
 ) -> anyhow::Result<()> {
    let kind = name.kind();
    tracing::info!(
-        "found local-only {kind} layer {name} size {}",
-        local.file_size
+        "found local-only {kind} layer {name}, metadata {:?}",
+        local.metadata
    );
    std::fs::remove_file(&local.local_path)?;
    Ok(())
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -212,34 +212,13 @@ impl LayerManager {
        &mut self,
        rewrite_layers: &[(Layer, ResidentLayer)],
        drop_layers: &[Layer],
-        metrics: &TimelineMetrics,
+        _metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
-        for (old_layer, new_layer) in rewrite_layers {
-            debug_assert_eq!(
-                old_layer.layer_desc().key_range,
-                new_layer.layer_desc().key_range
-            );
-            debug_assert_eq!(
-                old_layer.layer_desc().lsn_range,
-                new_layer.layer_desc().lsn_range
-            );

-            // Safety: we may never rewrite the same file in-place.  Callers are responsible
-            // for ensuring that they only rewrite layers after something changes the path,
-            // such as an increment in the generation number.
-            assert_ne!(old_layer.local_path(), new_layer.local_path());
+        // TODO: implement rewrites (currently this code path is only used for drops)
+        assert!(rewrite_layers.is_empty());

-            Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr);
-
-            Self::insert_historic_layer(
-                new_layer.as_ref().clone(),
-                &mut updates,
-                &mut self.layer_fmgr,
-            );
-
-            metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
-        }
        for l in drop_layers {
            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
        }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -705,7 +705,6 @@ impl ConnectionManagerState {
                    commit_lsn: info.commit_lsn,
                    safekeeper_connstr: info.safekeeper_connstr,
                    availability_zone: info.availability_zone,
-                    standby_horizon: info.standby_horizon,
                }
            }
            MessageType::SafekeeperDiscoveryResponse => {
@@ -726,21 +725,6 @@ impl ConnectionManagerState {

        WALRECEIVER_BROKER_UPDATES.inc();

-        trace!(
-            "safekeeper info update: standby_horizon(cutoff)={}",
-            timeline_update.standby_horizon
-        );
-        if timeline_update.standby_horizon != 0 {
-            // ignore reports from safekeepers not connected to replicas
-            self.timeline
-                .standby_horizon
-                .store(Lsn(timeline_update.standby_horizon));
-            self.timeline
-                .metrics
-                .standby_horizon_gauge
-                .set(timeline_update.standby_horizon as i64);
-        }
-
        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
        let old_entry = self.wal_stream_candidates.insert(
            new_safekeeper_id,
@@ -1110,7 +1094,6 @@ mod tests {
                commit_lsn,
                safekeeper_connstr: safekeeper_connstr.to_owned(),
                availability_zone: None,
-                standby_horizon: 0,
            },
            latest_update,
        }
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -8,7 +8,6 @@ use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;

 use chrono::NaiveDateTime;
-use pageserver_api::models::AuxFilePolicy;
 use std::sync::Arc;
 use tracing::info;
 use utils::lsn::AtomicLsn;
@@ -61,9 +60,6 @@ pub(crate) struct UploadQueueInitialized {
    /// Part of the flattened "next" `index_part.json`.
    pub(crate) latest_lineage: Lineage,

-    /// The last aux file policy used on this timeline.
-    pub(crate) last_aux_file_policy: Option<AuxFilePolicy>,
-
    /// `disk_consistent_lsn` from the last metadata file that was successfully
    /// uploaded. `Lsn(0)` if nothing was uploaded yet.
    /// Unlike `latest_files` or `latest_metadata`, this value is never ahead.
@@ -193,7 +189,6 @@ impl UploadQueue {
            dangling_files: HashMap::new(),
            shutting_down: false,
            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
-            last_aux_file_policy: Default::default(),
        };

        *self = UploadQueue::Initialized(state);
@@ -213,7 +208,10 @@ impl UploadQueue {

        let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
        for (layer_name, layer_metadata) in &index_part.layer_metadata {
-            files.insert(layer_name.to_owned(), layer_metadata.clone());
+            files.insert(
+                layer_name.to_owned(),
+                LayerFileMetadata::from(layer_metadata),
+            );
        }

        info!(
@@ -241,7 +239,6 @@ impl UploadQueue {
            dangling_files: HashMap::new(),
            shutting_down: false,
            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
-            last_aux_file_policy: index_part.last_aux_file_policy(),
        };

        *self = UploadQueue::Initialized(state);
@@ -319,7 +316,9 @@ impl std::fmt::Display for UploadOp {
                write!(
                    f,
                    "UploadLayer({}, size={:?}, gen={:?})",
-                    layer, metadata.file_size, metadata.generation
+                    layer,
+                    metadata.file_size(),
+                    metadata.generation
                )
            }
            UploadOp::UploadMetadata(_, lsn) => {
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -153,7 +153,10 @@ impl PostgresRedoManager {
            process: self
                .redo_process
                .get()
-                .map(|p| WalRedoManagerProcessStatus { pid: p.id() }),
+                .map(|p| WalRedoManagerProcessStatus {
+                    pid: p.id(),
+                    kind: std::borrow::Cow::Borrowed(p.kind().into()),
+                }),
        }
    }
 }
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -1,10 +1,7 @@
-/// Layer of indirection previously used to support multiple implementations.
-/// Subject to removal: <https://github.com/neondatabase/neon/issues/7753>
 use std::time::Duration;

 use bytes::Bytes;
 use pageserver_api::{reltag::RelTag, shard::TenantShardId};
-use tracing::warn;
 use utils::lsn::Lsn;

 use crate::{config::PageServerConf, walrecord::NeonWalRecord};
@@ -15,6 +12,7 @@ mod protocol;

 mod process_impl {
    pub(super) mod process_async;
+    pub(super) mod process_std;
 }

 #[derive(
@@ -36,7 +34,10 @@ pub enum Kind {
    Async,
 }

-pub(crate) struct Process(process_impl::process_async::WalRedoProcess);
+pub(crate) enum Process {
+    Sync(process_impl::process_std::WalRedoProcess),
+    Async(process_impl::process_async::WalRedoProcess),
+}

 impl Process {
    #[inline(always)]
@@ -45,17 +46,18 @@ impl Process {
        tenant_shard_id: TenantShardId,
        pg_version: u32,
    ) -> anyhow::Result<Self> {
-        if conf.walredo_process_kind != Kind::Async {
-            warn!(
-                configured = %conf.walredo_process_kind,
-                "the walredo_process_kind setting has been turned into a no-op, using async implementation"
-            );
-        }
-        Ok(Self(process_impl::process_async::WalRedoProcess::launch(
-            conf,
-            tenant_shard_id,
-            pg_version,
-        )?))
+        Ok(match conf.walredo_process_kind {
+            Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
+                conf,
+                tenant_shard_id,
+                pg_version,
+            )?),
+            Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch(
+                conf,
+                tenant_shard_id,
+                pg_version,
+            )?),
+        })
    }

    #[inline(always)]
@@ -67,12 +69,29 @@ impl Process {
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
    ) -> anyhow::Result<Bytes> {
-        self.0
-            .apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
-            .await
+        match self {
+            Process::Sync(p) => {
+                p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+                    .await
+            }
+            Process::Async(p) => {
+                p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+                    .await
+            }
+        }
    }

    pub(crate) fn id(&self) -> u32 {
-        self.0.id()
+        match self {
+            Process::Sync(p) => p.id(),
+            Process::Async(p) => p.id(),
+        }
+    }
+
+    pub(crate) fn kind(&self) -> Kind {
+        match self {
+            Process::Sync(_) => Kind::Sync,
+            Process::Async(_) => Kind::Async,
+        }
    }
 }
--- a/pageserver/src/walredo/process/process_impl/process_std.rs
+++ b/pageserver/src/walredo/process/process_impl/process_std.rs
@@ -0,0 +1,405 @@
+use self::no_leak_child::NoLeakChild;
+use crate::{
+    config::PageServerConf,
+    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
+    walrecord::NeonWalRecord,
+    walredo::process::{no_leak_child, protocol},
+};
+use anyhow::Context;
+use bytes::Bytes;
+use nix::poll::{PollFd, PollFlags};
+use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use postgres_ffi::BLCKSZ;
+use std::os::fd::AsRawFd;
+#[cfg(feature = "testing")]
+use std::sync::atomic::AtomicUsize;
+use std::{
+    collections::VecDeque,
+    io::{Read, Write},
+    process::{ChildStdin, ChildStdout, Command, Stdio},
+    sync::{Mutex, MutexGuard},
+    time::Duration,
+};
+use tracing::{debug, error, instrument, Instrument};
+use utils::{lsn::Lsn, nonblock::set_nonblock};
+
+pub struct WalRedoProcess {
+    #[allow(dead_code)]
+    conf: &'static PageServerConf,
+    tenant_shard_id: TenantShardId,
+    // Some() on construction, only becomes None on Drop.
+    child: Option<NoLeakChild>,
+    stdout: Mutex<ProcessOutput>,
+    stdin: Mutex<ProcessInput>,
+    /// Counter to separate same sized walredo inputs failing at the same millisecond.
+    #[cfg(feature = "testing")]
+    dump_sequence: AtomicUsize,
+}
+
+struct ProcessInput {
+    stdin: ChildStdin,
+    n_requests: usize,
+}
+
+struct ProcessOutput {
+    stdout: ChildStdout,
+    pending_responses: VecDeque<Option<Bytes>>,
+    n_processed_responses: usize,
+}
+
+impl WalRedoProcess {
+    //
+    // Start postgres binary in special WAL redo mode.
+    //
+    #[instrument(skip_all,fields(pg_version=pg_version))]
+    pub(crate) fn launch(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        pg_version: u32,
+    ) -> anyhow::Result<Self> {
+        crate::span::debug_assert_current_span_has_tenant_id();
+
+        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
+        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
+
+        use no_leak_child::NoLeakChildCommandExt;
+        // Start postgres itself
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
+            // the first arg must be --wal-redo so the child process enters into walredo mode
+            .arg("--wal-redo")
+            // the child doesn't process this arg, but having it in the argv helps identify the
+            // walredo process for a particular tenant when debugging a pageserver
+            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
+            .stdin(Stdio::piped())
+            .stderr(Stdio::piped())
+            .stdout(Stdio::piped())
+            .env_clear()
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
+            // NB: The redo process is not trusted after we sent it the first
+            // walredo work. Before that, it is trusted. Specifically, we trust
+            // it to
+            // 1. close all file descriptors except stdin, stdout, stderr because
+            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
+            //    the files it opens, and
+            // 2. to use seccomp to sandbox itself before processing the first
+            //    walredo request.
+            .spawn_no_leak_child(tenant_shard_id)
+            .context("spawn process")?;
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait(WalRedoKillCause::Startup);
+        });
+
+        let stdin = child.stdin.take().unwrap();
+        let stdout = child.stdout.take().unwrap();
+        let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
+        macro_rules! set_nonblock_or_log_err {
+        ($file:ident) => {{
+            let res = set_nonblock($file.as_raw_fd());
+            if let Err(e) = &res {
+                error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
+            }
+            res
+        }};
+    }
+        set_nonblock_or_log_err!(stdin)?;
+        set_nonblock_or_log_err!(stdout)?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);
+
+        tokio::spawn(
+            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                }
+                debug!("wal-redo-postgres stderr_logger_task started");
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+                use tokio::io::AsyncBufReadExt;
+                let mut stderr_lines = tokio::io::BufReader::new(stderr);
+                let mut buf = Vec::new();
+                let res = loop {
+                    buf.clear();
+                    // TODO we don't trust the process to cap its stderr length.
+                    // Currently it can do unbounded Vec allocation.
+                    match stderr_lines.read_until(b'\n', &mut buf).await {
+                        Ok(0) => break Ok(()), // eof
+                        Ok(num_bytes) => {
+                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                            error!(%output, "received output");
+                        }
+                        Err(e) => {
+                            break Err(e);
+                        }
+                    }
+                };
+                match res {
+                    Ok(()) => (),
+                    Err(e) => {
+                        error!(error=?e, "failed to read from walredo stderr");
+                    }
+                }
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+        );
+
+        Ok(Self {
+            conf,
+            tenant_shard_id,
+            child: Some(child),
+            stdin: Mutex::new(ProcessInput {
+                stdin,
+                n_requests: 0,
+            }),
+            stdout: Mutex::new(ProcessOutput {
+                stdout,
+                pending_responses: VecDeque::new(),
+                n_processed_responses: 0,
+            }),
+            #[cfg(feature = "testing")]
+            dump_sequence: AtomicUsize::default(),
+        })
+    }
+
+    pub(crate) fn id(&self) -> u32 {
+        self.child
+            .as_ref()
+            .expect("must not call this during Drop")
+            .id()
+    }
+
+    // Apply given WAL records ('records') over an old page image. Returns
+    // new page image.
+    //
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    pub(crate) async fn apply_wal_records(
+        &self,
+        rel: RelTag,
+        blknum: u32,
+        base_img: &Option<Bytes>,
+        records: &[(Lsn, NeonWalRecord)],
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let tag = protocol::BufferTag { rel, blknum };
+        let input = self.stdin.lock().unwrap();
+
+        // Serialize all the messages to send the WAL redo process first.
+        //
+        // This could be problematic if there are millions of records to replay,
+        // but in practice the number of records is usually so small that it doesn't
+        // matter, and it's better to keep this code simple.
+        //
+        // Most requests start with a before-image with BLCKSZ bytes, followed by
+        // some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
+        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
+        if let Some(img) = base_img {
+            protocol::build_push_page_msg(tag, img, &mut writebuf);
+        }
+        for (lsn, rec) in records.iter() {
+            if let NeonWalRecord::Postgres {
+                will_init: _,
+                rec: postgres_rec,
+            } = rec
+            {
+                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
+            } else {
+                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+            }
+        }
+        protocol::build_get_page_msg(tag, &mut writebuf);
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
+
+        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
+
+        if res.is_err() {
+            // not all of these can be caused by this particular input, however these are so rare
+            // in tests so capture all.
+            self.record_and_log(&writebuf);
+        }
+
+        res
+    }
+
+    fn apply_wal_records0(
+        &self,
+        writebuf: &[u8],
+        input: MutexGuard<ProcessInput>,
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
+        let mut nwrite = 0usize;
+
+        while nwrite < writebuf.len() {
+            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
+            let n = loop {
+                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
+                    Err(nix::errno::Errno::EINTR) => continue,
+                    res => break res,
+                }
+            }?;
+
+            if n == 0 {
+                anyhow::bail!("WAL redo timed out");
+            }
+
+            // If 'stdin' is writeable, do write.
+            let in_revents = stdin_pollfds[0].revents().unwrap();
+            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
+                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
+            }
+            if in_revents.contains(PollFlags::POLLHUP) {
+                // We still have more data to write, but the process closed the pipe.
+                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
+            }
+        }
+        let request_no = proc.n_requests;
+        proc.n_requests += 1;
+        drop(proc);
+
+        // To improve walredo performance we separate sending requests from receiving
+        // responses. They are protected by different mutexes (input and output).
+        // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
+        // there is no guarantee that T1 will be granted the output mutex first.
+        // To address this we maintain the number of sent requests, the number of processed
+        // responses, and a ring buffer of pending responses. After sending its request
+        // (under the input mutex), a thread remembers its request number. Then it releases
+        // the input mutex, locks the output mutex, and reads responses into the ring buffer
+        // up to and including its own request number. Then it takes the corresponding
+        // element from the pending responses ring buffer and drops all empty elements
+        // from the front, advancing the processed responses number.
+
+        let mut output = self.stdout.lock().unwrap();
+        let n_processed_responses = output.n_processed_responses;
+        while n_processed_responses + output.pending_responses.len() <= request_no {
+            // We expect the WAL redo process to respond with an 8k page image. We read it
+            // into this buffer.
+            let mut resultbuf = vec![0; BLCKSZ.into()];
+            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
+            while nresult < BLCKSZ.into() {
+                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
+                // We do two things simultaneously: reading response from stdout
+                // and forward any logging information that the child writes to its stderr to the page server's log.
+                let n = loop {
+                    match nix::poll::poll(
+                        &mut stdout_pollfds[..],
+                        wal_redo_timeout.as_millis() as i32,
+                    ) {
+                        Err(nix::errno::Errno::EINTR) => continue,
+                        res => break res,
+                    }
+                }?;
+
+                if n == 0 {
+                    anyhow::bail!("WAL redo timed out");
+                }
+
+                // If we have some data in stdout, read it to the result buffer.
+                let out_revents = stdout_pollfds[0].revents().unwrap();
+                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
+                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
+                }
+                if out_revents.contains(PollFlags::POLLHUP) {
+                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
+                }
+            }
+            output
+                .pending_responses
+                .push_back(Some(Bytes::from(resultbuf)));
+        }
+        // Replace our request's response with None in `pending_responses`.
+        // Then make space in the ring buffer by clearing out any sequence of contiguous
+        // `None`'s from the front of `pending_responses`.
+        // NB: We can't just pop_front(): requesters may take the output mutex in a different
+        // order than they sent their requests, so the front may be someone else's response:
+        // T1: grab input mutex
+        // T1: send request_no 23
+        // T1: release input mutex
+        // T2: grab input mutex
+        // T2: send request_no 24
+        // T2: release input mutex
+        // T2: grab output mutex
+        // T2: n_processed_responses + output.pending_responses.len() <= request_no
+        //            23                                0                   24
+        // T2: enters poll loop that reads stdout
+        // T2: put response for 23 into pending_responses
+        // T2: put response for 24 into pending_responses
+        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
+        // T2: takes its response_24
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: does the while loop below
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: releases output mutex
+        // T1: grabs output mutex
+        // T1: n_processed_responses + output.pending_responses.len() > request_no
+        //            23                                2                   23
+        // T1: skips poll loop that reads stdout
+        // T1: takes its response_23
+        // pending_responses now looks like this: Front None None Back
+        // T1: does the while loop below
+        // pending_responses now looks like this: Front Back
+        // n_processed_responses now has value 25
+        let res = output.pending_responses[request_no - n_processed_responses]
+            .take()
+            .expect("we own this request_no, nobody else is supposed to take it");
+        while let Some(front) = output.pending_responses.front() {
+            if front.is_none() {
+                output.pending_responses.pop_front();
+                output.n_processed_responses += 1;
+            } else {
+                break;
+            }
+        }
+        Ok(res)
+    }
+
+    #[cfg(feature = "testing")]
+    fn record_and_log(&self, writebuf: &[u8]) {
+        use std::sync::atomic::Ordering;
+
+        let millis = std::time::SystemTime::now()
+            .duration_since(std::time::SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
+
+        // these files will be collected to an allure report
+        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
+
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+
+        let res = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .read(true)
+            .open(path)
+            .and_then(|mut f| f.write_all(writebuf));
+
+        // trip up allowed_errors
+        if let Err(e) = res {
+            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
+        } else {
+            tracing::error!(filename, "erroring walredo input saved");
+        }
+    }
+
+    #[cfg(not(feature = "testing"))]
+    fn record_and_log(&self, _: &[u8]) {}
+}
+
+impl Drop for WalRedoProcess {
+    fn drop(&mut self) {
+        self.child
+            .take()
+            .expect("we only do this once")
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+        // no way to wait for stderr_logger_task from Drop because that is async only
+    }
+}
--- a/patches/pgvector.patch
+++ b/patches/pgvector.patch
@@ -1,78 +0,0 @@
-From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
-From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
-Date: Fri, 2 Feb 2024 22:26:45 +0200
-Subject: [PATCH 1/1] Make v0.6.0 work with Neon
-
-Now that the WAL-logging happens as a separate step at the end of the
-build, we need a few neon-specific hints to make it work.
---
- src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
- 1 file changed, 36 insertions(+)
-
-diff --git a/src/hnswbuild.c b/src/hnswbuild.c
-index 680789b..ec54dea 100644
--- a/src/hnswbuild.c
-+++ b/src/hnswbuild.c
-@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
- 
- 	hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
- 
-+#ifdef NEON_SMGR
-+	smgr_start_unlogged_build(RelationGetSmgr(indexRel));
-+#endif
-+
- 	/* Perform inserts */
- 	HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
- 
-+#ifdef NEON_SMGR
-+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
-+#endif
-+
- 	/* Close relations within worker */
- 	index_close(indexRel, indexLockmode);
- 	table_close(heapRel, heapLockmode);
-@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
- 	SeedRandom(42);
- #endif
- 
-+#ifdef NEON_SMGR
-+	smgr_start_unlogged_build(RelationGetSmgr(index));
-+#endif
-+
- 	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
- 
- 	BuildGraph(buildstate, forkNum);
- 
-+#ifdef NEON_SMGR
-+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
-+#endif
-+
- 	if (RelationNeedsWAL(index))
-+	{
- 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
- 
-+#ifdef NEON_SMGR
-+		{
-+#if PG_VERSION_NUM >= 160000
-+			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
-+#else
-+			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
-+#endif
-+
-+			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
-+										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
-+			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
-+		}
-+#endif
-+	}
-+
-+#ifdef NEON_SMGR
-+	smgr_end_unlogged_build(RelationGetSmgr(index));
-+#endif
-+
- 	FreeBuildState(buildstate);
- }
- 
-- 
-2.39.2
-
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -49,8 +49,9 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;

-int         neon_protocol_version = 2;
+int         neon_protocol_version = 1;

+static int	n_reconnect_attempts = 0;
 static int	max_reconnect_attempts = 60;
 static int	stripe_size;

@@ -94,37 +95,18 @@ static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;

-typedef enum PSConnectionState {
-	PS_Disconnected,			/* no connection yet */
-	PS_Connecting_Startup,		/* connection starting up */
-	PS_Connecting_PageStream,	/* negotiating pagestream */ 
-	PS_Connected,				/* connected, pagestream established */
-} PSConnectionState;
-
 /* This backend's per-shard connections */
 typedef struct
 {
-	TimestampTz		last_connect_time; /* read-only debug value */
-	TimestampTz		last_reconnect_time;
-	uint32			delay_us;
-	int				n_reconnect_attempts;
+	PGconn	   *conn;

-	/*---
-	 * Pageserver connection state, i.e.
-	 *	disconnected: conn == NULL, wes == NULL;
-	 *	conn_startup: connection initiated, waiting for connection establishing
-	 *	conn_ps:      PageStream query sent, waiting for confirmation
-	 *	connected:    PageStream established
-	 */
-	PSConnectionState state;
-	PGconn		   *conn;
 	/*---
 	 * WaitEventSet containing:
-	 *	- WL_SOCKET_READABLE on 'conn'
-	 *	- WL_LATCH_SET on MyLatch, and
-	 *	- WL_EXIT_ON_PM_DEATH.
+	 * - WL_SOCKET_READABLE on 'conn'
+	 * - WL_LATCH_SET on MyLatch, and
+	 * - WL_EXIT_ON_PM_DEATH.
 	 */
-	WaitEventSet   *wes_read;
+	WaitEventSet *wes;
 } PageServer;

 static PageServer page_servers[MAX_SHARDS];
@@ -321,277 +303,119 @@ get_shard_number(BufferTag *tag)
 	return hash % n_shards;
 }

-static inline void
-CLEANUP_AND_DISCONNECT(PageServer *shard) 
-{
-	if (shard->wes_read)
-	{
-		FreeWaitEventSet(shard->wes_read);
-		shard->wes_read = NULL;
-	}
-	if (shard->conn)
-	{
-		PQfinish(shard->conn);
-		shard->conn = NULL;
-	}
-
-	shard->state = PS_Disconnected;
-}
-
-/*
- * Connect to a pageserver, or continue to try to connect if we're yet to
- * complete the connection (e.g. due to receiving an earlier cancellation
- * during connection start).
- * Returns true if successfully connected; false if the connection failed.
- * 
- * Throws errors in unrecoverable situations, or when this backend's query
- * is canceled.
- */
 static bool
 pageserver_connect(shardno_t shard_no, int elevel)
 {
-	PageServer *shard = &page_servers[shard_no];
+	char	   *query;
+	int			ret;
+	const char *keywords[3];
+	const char *values[3];
+	int			n;
+	PGconn	   *conn;
+	WaitEventSet *wes;
 	char		connstr[MAX_PAGESERVER_CONNSTRING_SIZE];

+	static TimestampTz last_connect_time = 0;
+	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
+	TimestampTz now;
+	uint64_t	us_since_last_connect;
+	bool	broke_from_loop = false;
+
+	Assert(page_servers[shard_no].conn == NULL);
+
 	/*
 	 * Get the connection string for this shard. If the shard map has been
 	 * updated since we last looked, this will also disconnect any existing
 	 * pageserver connections as a side effect.
-	 * Note that connstr is used both during connection start, and when we
-	 * log the successful connection.
 	 */
 	load_shard_map(shard_no, connstr, NULL);

-	switch (shard->state)
+	now = GetCurrentTimestamp();
+	us_since_last_connect = now - last_connect_time;
+	if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
 	{
-	case PS_Disconnected:
-	{
-		const char *keywords[3];
-		const char *values[3];
-		int			n_pgsql_params;
-		TimestampTz	now;
-		int64		us_since_last_attempt;
-
-		/* Make sure we start with a clean slate */
-		CLEANUP_AND_DISCONNECT(shard);
-
-		neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected");
-
-		now = GetCurrentTimestamp();
-		us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
-		shard->last_reconnect_time = now;
-
-		/*
-		 * If we did other tasks between reconnect attempts, then we won't
-		 * need to wait as long as a full delay.
-		 */
-		if (us_since_last_attempt < shard->delay_us)
-		{
-			pg_usleep(shard->delay_us - us_since_last_attempt);
-		}
-
-		/* update the delay metric */
-		shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC);
-
-		/*
-		 * Connect using the connection string we got from the
-		 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
-		 * variable was set, use that as the password.
-		 *
-		 * The connection options are parsed in the order they're given, so when
-		 * we set the password before the connection string, the connection string
-		 * can override the password from the env variable. Seems useful, although
-		 * we don't currently use that capability anywhere.
-		 */
-		keywords[0] = "dbname";
-		values[0] = connstr;
-		n_pgsql_params = 1;
-
-		if (neon_auth_token)
-		{
-			keywords[1] = "password";
-			values[1] = neon_auth_token;
-			n_pgsql_params++;
-		}
-
-		keywords[n_pgsql_params] = NULL;
-		values[n_pgsql_params] = NULL;
-
-		shard->conn = PQconnectStartParams(keywords, values, 1);
-		if (!shard->conn)
-		{
-			neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory");
-			return false;
-		}
-
-		shard->state = PS_Connecting_Startup;
-		/* fallthrough */
+		pg_usleep(delay_us);
+		delay_us *= 2;
 	}
-	case PS_Connecting_Startup:
+	else
 	{
-		char	   *pagestream_query;
-		int			ps_send_query_ret;
-		bool		connected = false;
-		int poll_result = PGRES_POLLING_WRITING;
-		neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup");
+		delay_us = MIN_RECONNECT_INTERVAL_USEC;
+	}

-		do
-		{
-			WaitEvent	event;
+	/*
+	 * Connect using the connection string we got from the
+	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
+	 * variable was set, use that as the password.
+	 *
+	 * The connection options are parsed in the order they're given, so when
+	 * we set the password before the connection string, the connection string
+	 * can override the password from the env variable. Seems useful, although
+	 * we don't currently use that capability anywhere.
+	 */
+	n = 0;
+	if (neon_auth_token)
+	{
+		keywords[n] = "password";
+		values[n] = neon_auth_token;
+		n++;
+	}
+	keywords[n] = "dbname";
+	values[n] = connstr;
+	n++;
+	keywords[n] = NULL;
+	values[n] = NULL;
+	n++;
+	conn = PQconnectdbParams(keywords, values, 1);
+	last_connect_time = GetCurrentTimestamp();

-			switch (poll_result)
-			{
-			default: /* unknown/unused states are handled as a failed connection */
-			case PGRES_POLLING_FAILED:
-				{
-					char	   *pqerr = PQerrorMessage(shard->conn);
-					char	   *msg = NULL;
-					neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED");
+	if (PQstatus(conn) == CONNECTION_BAD)
+	{
+		char	   *msg = pchomp(PQerrorMessage(conn));

-					if (pqerr)
-						msg = pchomp(pqerr);
+		PQfinish(conn);

-					CLEANUP_AND_DISCONNECT(shard);
-
-					if (msg)
-					{
-						neon_shard_log(shard_no, elevel,
-									   "could not connect to pageserver: %s",
-									   msg);
-						pfree(msg);
-					}
-					else
-						neon_shard_log(shard_no, elevel,
-									   "could not connect to pageserver");
-
-					return false;
-				}
-			case PGRES_POLLING_READING:
-				/* Sleep until there's something to do */
-				while (true)
-				{
-					int rc = WaitLatchOrSocket(MyLatch,
-											   WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE,
-											   PQsocket(shard->conn),
-											   0,
-											   PG_WAIT_EXTENSION);
-					elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc);
-					if (rc & WL_LATCH_SET)
-					{
-						ResetLatch(MyLatch);
-						/* query cancellation, backend shutdown */
-						CHECK_FOR_INTERRUPTS();
-					}
-					if (rc & WL_SOCKET_READABLE)
-						break;
-				}
-				/* PQconnectPoll() handles the socket polling state updates */
-
-				break;
-			case PGRES_POLLING_WRITING:
-				/* Sleep until there's something to do */
-				while (true)
-				{
-					int rc = WaitLatchOrSocket(MyLatch,
-											   WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE,
-											   PQsocket(shard->conn),
-											   0,
-											   PG_WAIT_EXTENSION);
-					elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc);
-					if (rc & WL_LATCH_SET)
-					{
-						ResetLatch(MyLatch);
-						/* query cancellation, backend shutdown */
-						CHECK_FOR_INTERRUPTS();
-					}
-					if (rc & WL_SOCKET_WRITEABLE)
-						break;
-				}
-				/* PQconnectPoll() handles the socket polling state updates */
-
-				break;
-			case PGRES_POLLING_OK:
-				neon_shard_log(shard_no, DEBUG5, "POLLING_OK");
-				connected = true;
-				break;
-			}
-			poll_result = PQconnectPoll(shard->conn);
-			elog(DEBUG5, "PQconnectPoll=>%d", poll_result);
-		}
-		while (!connected);
-
-		/* No more polling needed; connection succeeded */
-		shard->last_connect_time = GetCurrentTimestamp();
-
-		shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3);
-		AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET,
-						  MyLatch, NULL);
-		AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-						  NULL, NULL);
-		AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL);
-
-
-		switch (neon_protocol_version)
-		{
+		ereport(elevel,
+				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+				 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
+				 errdetail_internal("%s", msg)));
+		pfree(msg);
+		return false;
+	}
+	switch (neon_protocol_version)
+	{
 		case 2:
-			pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
+			query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
 			break;
 		case 1:
-			pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
+			query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
 			break;
 		default:
 			elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
-		}
-
-		if (PQstatus(shard->conn) == CONNECTION_BAD)
-		{
-			char	   *msg = pchomp(PQerrorMessage(shard->conn));
-
-			CLEANUP_AND_DISCONNECT(shard);
-
-			ereport(elevel,
-					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-						errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
-						errdetail_internal("%s", msg)));
-			pfree(msg);
-			return false;
-		}
-
-		ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query);
-		pfree(pagestream_query);
-		if (ps_send_query_ret != 1)
-		{
-			CLEANUP_AND_DISCONNECT(shard);
-
-			neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
-			return false;
-		}
-
-		shard->state = PS_Connecting_PageStream;
-		/* fallthrough */
 	}
-	case PS_Connecting_PageStream:
+	ret = PQsendQuery(conn, query);
+	pfree(query);
+	if (ret != 1)
 	{
-		neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream");
+		PQfinish(conn);
+		neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
+		return false;
+	}

-		if (PQstatus(shard->conn) == CONNECTION_BAD)
-		{
-			char	   *msg = pchomp(PQerrorMessage(shard->conn));
-			CLEANUP_AND_DISCONNECT(shard);
-			ereport(elevel,
-					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-						errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
-						errdetail_internal("%s", msg)));
-			pfree(msg);
-			return false;
-		}
+	wes = CreateWaitEventSet(TopMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
+					  MyLatch, NULL);
+	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+					  NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);

-		while (PQisBusy(shard->conn))
+	PG_TRY();
+	{
+		while (PQisBusy(conn))
 		{
 			WaitEvent	event;

 			/* Sleep until there's something to do */
-			(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
+			(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 			ResetLatch(MyLatch);

 			CHECK_FOR_INTERRUPTS();
@@ -599,37 +423,40 @@ pageserver_connect(shardno_t shard_no, int elevel)
 			/* Data available in socket? */
 			if (event.events & WL_SOCKET_READABLE)
 			{
-				if (!PQconsumeInput(shard->conn))
+				if (!PQconsumeInput(conn))
 				{
-					char	   *msg = pchomp(PQerrorMessage(shard->conn));
+					char	   *msg = pchomp(PQerrorMessage(conn));
+
+					PQfinish(conn);
+					FreeWaitEventSet(wes);

-					CLEANUP_AND_DISCONNECT(shard);
 					neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
 								   msg);
-					pfree(msg);
-					return false;
+					/* Returning from inside PG_TRY is bad, so we break/return later */
+					broke_from_loop = true;
+					break;
 				}
 			}
 		}
-
-		shard->state = PS_Connected;
-		/* fallthrough */
 	}
-	case PS_Connected:
-		/*
-		 * We successfully connected. Future connections to this PageServer
-		 * will do fast retries again, with exponential backoff.
-		 */
-		shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;
-
-		neon_shard_log(shard_no, DEBUG5, "Connection state: Connected");
-		neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
-		return true;
-	default:
-		neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state);
+	PG_CATCH();
+	{
+		PQfinish(conn);
+		FreeWaitEventSet(wes);
+		PG_RE_THROW();
 	}
-	/* This shouldn't be hit */
-	Assert(false);
+	PG_END_TRY();
+
+	if (broke_from_loop)
+	{
+		return false;
+	}
+
+	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version);
+	page_servers[shard_no].conn = conn;
+	page_servers[shard_no].wes = wes;
+
+	return true;
 }

 /*
@@ -649,7 +476,7 @@ retry:
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -675,8 +502,7 @@ retry:

 /*
 * Reset prefetch and drop connection to the shard.
- * It also drops connection to all other shards involved in prefetch, through
- * prefetch_on_ps_disconnect().
+ * It also drops connection to all other shards involved in prefetch.
 */
 static void
 pageserver_disconnect(shardno_t shard_no)
@@ -686,6 +512,9 @@ pageserver_disconnect(shardno_t shard_no)
 	 * whole prefetch queue, even for other pageservers. It should not
 	 * cause big problems, because connection loss is supposed to be a
 	 * rare event.
+	 *
+	 * Prefetch state should be reset even if page_servers[shard_no].conn == NULL,
+	 * because prefetch request may be registered before connection is established.
 	 */
 	prefetch_on_ps_disconnect();

@@ -698,36 +527,37 @@ pageserver_disconnect(shardno_t shard_no)
 static void
 pageserver_disconnect_shard(shardno_t shard_no)
 {
-	PageServer *shard = &page_servers[shard_no];
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
 	 * what state the connection is in. For example, if we sent the request
 	 * but didn't receive a response yet, we might receive the response some
 	 * time later after we have already sent a new unrelated request. Close
 	 * the connection to avoid getting confused.
-	 * Similarly, even when we're in PS_DISCONNECTED, we may have junk to
-	 * clean up: It is possible that we encountered an error allocating any
-	 * of the wait event sets or the psql connection, or failed when we tried
-	 * to attach wait events to the WaitEventSets.
 	 */
-	CLEANUP_AND_DISCONNECT(shard);
-
-	shard->state = PS_Disconnected;
+	if (page_servers[shard_no].conn)
+	{
+		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
+		PQfinish(page_servers[shard_no].conn);
+		page_servers[shard_no].conn = NULL;
+	}
+	if (page_servers[shard_no].wes != NULL)
+	{
+		FreeWaitEventSet(page_servers[shard_no].wes);
+		page_servers[shard_no].wes = NULL;
+	}
 }

 static bool
 pageserver_send(shardno_t shard_no, NeonRequest *request)
 {
 	StringInfoData req_buff;
-	PageServer *shard = &page_servers[shard_no];
-	PGconn	   *pageserver_conn;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

 	/* If the connection was lost for some reason, reconnect */
-	if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD)
+	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
 		neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
 		pageserver_disconnect(shard_no);
-		pageserver_conn = NULL;
 	}

 	req_buff = nm_pack_request(request);
@@ -741,19 +571,17 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
 	 * connection in case of failure.
 	 */
-	if (shard->state != PS_Connected)
+	if (!page_servers[shard_no].conn)
 	{
-		while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
 			HandleMainLoopInterrupts();
-			shard->n_reconnect_attempts += 1;
+			n_reconnect_attempts += 1;
 		}
-		shard->n_reconnect_attempts = 0;
-	} else {
-		Assert(shard->conn != NULL);
+		n_reconnect_attempts = 0;
 	}

-	pageserver_conn = shard->conn;
+	pageserver_conn = page_servers[shard_no].conn;

 	/*
 	 * Send request.
@@ -762,17 +590,13 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * should use async mode and check for interrupts while waiting. In
 	 * practice, our requests are small enough to always fit in the output and
 	 * TCP buffer.
-	 *
-	 * Note that this also will fail when the connection is in the
-	 * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
-	 * point, but on the grand scheme of things it's only a small issue.
 	 */
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

 		pageserver_disconnect(shard_no);
-		neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -787,7 +611,6 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
-
 	return true;
 }

@@ -796,68 +619,58 @@ pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
-	PageServer *shard = &page_servers[shard_no];
-	PGconn	   *pageserver_conn = shard->conn;
-	/* read response */
-	int			rc;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

-	if (shard->state != PS_Connected)
-	{
-		neon_shard_log(shard_no, LOG,
-					   "pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x",
-					   shard->state);
+	if (!pageserver_conn)
 		return NULL;
-	}

-	Assert(pageserver_conn);
-
-	rc = call_PQgetCopyData(shard_no, &resp_buff.data);
-	if (rc >= 0)
+	PG_TRY();
 	{
-		/* call_PQgetCopyData handles rc == 0 */
-		Assert(rc > 0);
+		/* read response */
+		int			rc;

-		PG_TRY();
+		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
+		if (rc >= 0)
 		{
 			resp_buff.len = rc;
 			resp_buff.cursor = 0;
 			resp = nm_unpack_response(&resp_buff);
 			PQfreemem(resp_buff.data);
+
+			if (message_level_is_interesting(PageStoreTrace))
+			{
+				char	   *msg = nm_to_string((NeonMessage *) resp);
+
+				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
+				pfree(msg);
+			}
 		}
-		PG_CATCH();
+		else if (rc == -1)
 		{
-			neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response");
+			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
 			pageserver_disconnect(shard_no);
-			PG_RE_THROW();
+			resp = NULL;
 		}
-		PG_END_TRY();
-
-		if (message_level_is_interesting(PageStoreTrace))
+		else if (rc == -2)
 		{
-			char	   *msg = nm_to_string((NeonMessage *) resp);
+			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-			neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
-			pfree(msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
+		}
+		else
+		{
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
 		}
 	}
-	else if (rc == -1)
+	PG_CATCH();
 	{
-		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn)));
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
 		pageserver_disconnect(shard_no);
-		resp = NULL;
-	}
-	else if (rc == -2)
-	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-
-		pageserver_disconnect(shard_no);
-		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg);
-	}
-	else
-	{
-		pageserver_disconnect(shard_no);
-		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
+		PG_RE_THROW();
 	}
+	PG_END_TRY();

 	return (NeonResponse *) resp;
 }
@@ -868,7 +681,7 @@ pageserver_flush(shardno_t shard_no)
 {
 	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

-	if (page_servers[shard_no].state != PS_Connected)
+	if (!pageserver_conn)
 	{
 		neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
 	}
@@ -884,7 +697,6 @@ pageserver_flush(shardno_t shard_no)
 			return false;
 		}
 	}
-
 	return true;
 }

@@ -1048,7 +860,7 @@ pg_init_libpagestore(void)
 							"Version of compute<->page server protocol",
 							NULL,
 							&neon_protocol_version,
-							2, /* use protocol version 2 */
+							1, /* default to old protocol for now */
 							1, /* min */
 							2, /* max */
 							PGC_SU_BACKEND,
@@ -1079,7 +891,5 @@ pg_init_libpagestore(void)
 		dbsize_hook = neon_dbsize;
 	}

-	memset(page_servers, 0, sizeof(page_servers));
-
 	lfc_init();
 }
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -237,50 +237,18 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum,
 extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum);

-/*
- * LSN values associated with each request to the pageserver
- */
-typedef struct
-{
-	/*
-	 * 'request_lsn' is the main value that determines which page version to
-	 * fetch.
-	 */
-	XLogRecPtr request_lsn;
-
-	/*
-	 * A hint to the pageserver that the requested page hasn't been modified
-	 * between this LSN and 'request_lsn'. That allows the pageserver to
-	 * return the page faster, without waiting for 'request_lsn' to arrive in
-	 * the pageserver, as long as 'not_modified_since' has arrived.
-	 */
-	XLogRecPtr not_modified_since;
-
-	/*
-	 * 'effective_request_lsn' is not included in the request that's sent to
-	 * the pageserver, but is used to keep track of the latest LSN of when the
-	 * request was made. In a standby server, this is always the same as the
-	 * 'request_lsn', but in the primary we use UINT64_MAX as the
-	 * 'request_lsn' to request the latest page version, so we need this
-	 * separate field to remember that latest LSN was when the request was
-	 * made. It's needed to manage prefetch request, to verify if the response
-	 * to a prefetched request is still valid.
-	 */
-	XLogRecPtr effective_request_lsn;
-} neon_request_lsns;
-
 #if PG_MAJORVERSION_NUM < 16
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  char *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 neon_request_lsns request_lsns, char *buffer);
+										 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, char *buffer, bool skipFsync);
 #else
 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					  void *buffer);
 extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
-										 neon_request_lsns request_lsns, void *buffer);
+										 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
 extern void neon_write(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 #endif
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -45,7 +45,6 @@
 */
 #include "postgres.h"

-#include "access/parallel.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xlogdefs.h"
@@ -94,10 +93,6 @@ static char *hexdump_page(char *page);

 const int	SmgrTrace = DEBUG5;

-#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
-	neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
-				   ##__VA_ARGS__)
-
 page_server_api *page_server;

 /* unlogged relation build states */
@@ -173,7 +168,8 @@ typedef enum PrefetchStatus
 typedef struct PrefetchRequest
 {
 	BufferTag	buftag;			/* must be first entry in the struct */
-	neon_request_lsns request_lsns;
+	XLogRecPtr	request_lsn;
+	XLogRecPtr	not_modified_since;
 	NeonResponse *response;		/* may be null */
 	PrefetchStatus status;
 	shardno_t   shard_no;
@@ -275,15 +271,16 @@ static PrefetchState *MyPState;

 static bool compact_prefetch_buffers(void);
 static void consume_prefetch_responses(void);
-static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns);
+static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
 static bool prefetch_read(PrefetchRequest *slot);
-static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
+static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
 static bool prefetch_wait_for(uint64 ring_index);
 static void prefetch_cleanup_trailing_unused(void);
 static inline void prefetch_set_unused(uint64 ring_index);

-static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno);
-static bool neon_prefetch_response_usable(neon_request_lsns request_lsns,
+static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
+								 XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since);
+static bool neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
 										  PrefetchRequest *slot);

 static bool
@@ -341,7 +338,8 @@ compact_prefetch_buffers(void)
 		target_slot->shard_no = source_slot->shard_no;
 		target_slot->status = source_slot->status;
 		target_slot->response = source_slot->response;
-		target_slot->request_lsns = source_slot->request_lsns;
+		target_slot->request_lsn = source_slot->request_lsn;
+		target_slot->not_modified_since = source_slot->not_modified_since;
 		target_slot->my_ring_index = empty_ring_index;

 		prfh_delete(MyPState->prf_hash, source_slot);
@@ -360,9 +358,8 @@ compact_prefetch_buffers(void)
 		};
 		source_slot->response = NULL;
 		source_slot->my_ring_index = 0;
-		source_slot->request_lsns = (neon_request_lsns) {
-			InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr
-		};
+		source_slot->request_lsn = InvalidXLogRecPtr;
+		source_slot->not_modified_since = InvalidXLogRecPtr;

 		/* update bookkeeping */
 		n_moved++;
@@ -530,8 +527,6 @@ prefetch_flush_requests(void)
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
- * NOTE: callers should make sure they can handle query cancellations in this
- * function's call path.
 */
 static bool
 prefetch_wait_for(uint64 ring_index)
@@ -567,8 +562,6 @@ prefetch_wait_for(uint64 ring_index)
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
- *
- * NOTE: this does IO, and can get canceled out-of-line.
 */
 static bool
 prefetch_read(PrefetchRequest *slot)
@@ -580,14 +573,6 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_receive);

-	if (slot->status != PRFS_REQUESTED ||
-		slot->response != NULL ||
-		slot->my_ring_index != MyPState->ring_receive)
-		neon_shard_log(slot->shard_no, ERROR,
-					   "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
-					   slot->status, slot->response,
-					   (long)slot->my_ring_index, (long)MyPState->ring_receive);
-
 	old = MemoryContextSwitchTo(MyPState->errctx);
 	response = (NeonResponse *) page_server->receive(slot->shard_no);
 	MemoryContextSwitchTo(old);
@@ -605,11 +590,6 @@ prefetch_read(PrefetchRequest *slot)
 	}
 	else
 	{
-		neon_shard_log(slot->shard_no, WARNING,
-					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
-					   (long)slot->my_ring_index,
-					   RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
-					   slot->buftag.forkNum, slot->buftag.blockNum);
 		return false;
 	}
 }
@@ -624,7 +604,6 @@ void
 prefetch_on_ps_disconnect(void)
 {
 	MyPState->ring_flush = MyPState->ring_unused;
-
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
 		PrefetchRequest *slot;
@@ -647,7 +626,6 @@ prefetch_on_ps_disconnect(void)
 		slot->status = PRFS_TAG_REMAINS;
 		MyPState->n_requests_inflight -= 1;
 		MyPState->ring_receive += 1;
-
 		prefetch_set_unused(ring_index);
 	}
 }
@@ -711,11 +689,9 @@ prefetch_set_unused(uint64 ring_index)
 * prefetch_wait_for().
 */
 static void
-prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
+prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since)
 {
 	bool		found;
-	uint64		mySlotNo = slot->my_ring_index;
-
 	NeonGetPageRequest request = {
 		.req.tag = T_NeonGetPageRequest,
 		/* lsn and not_modified_since are filled in below */
@@ -724,25 +700,28 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 		.blkno = slot->buftag.blockNum,
 	};

-	Assert(mySlotNo == MyPState->ring_unused);
+	Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));

-	if (force_request_lsns)
-		slot->request_lsns = *force_request_lsns;
+	if (force_request_lsn)
+	{
+		request.req.lsn = *force_request_lsn;
+		request.req.not_modified_since = *force_not_modified_since;
+	}
 	else
-		slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
-												   slot->buftag.forkNum,
-												   slot->buftag.blockNum);
-	request.req.lsn = slot->request_lsns.request_lsn;
-	request.req.not_modified_since = slot->request_lsns.not_modified_since;
+	{
+		neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag),
+							 slot->buftag.forkNum,
+							 slot->buftag.blockNum,
+							 &request.req.lsn,
+							 &request.req.not_modified_since);
+	}
+	slot->request_lsn = request.req.lsn;
+	slot->not_modified_since = request.req.not_modified_since;

 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);

-	while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
-	{
-		Assert(mySlotNo == MyPState->ring_unused);
-		/* loop */
-	}
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request));

 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
@@ -753,6 +732,7 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns

 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
+
 	prfh_insert(MyPState->prf_hash, slot, &found);
 	Assert(!found);
 }
@@ -762,22 +742,25 @@ prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns
 *
 * Register that we may want the contents of BufferTag in the near future.
 *
- * If force_request_lsns is not NULL, those values are sent to the
- * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
- * to calculate the LSNs to send.
+ * If force_request_lsn and force_not_modified_since are not NULL, those
+ * values are sent to the pageserver. If they are NULL, we utilize the
+ * lastWrittenLsn -infrastructure to fill them in.
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
 */

 static uint64
-prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns)
+prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn,
+						 XLogRecPtr *force_not_modified_since)
 {
 	uint64		ring_index;
 	PrefetchRequest req;
 	PrefetchRequest *slot;
 	PrfHashEntry *entry;

+	Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
+
 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
 Retry:
@@ -798,9 +781,10 @@ Retry:
 		 * If the caller specified a request LSN to use, only accept prefetch
 		 * responses that satisfy that request.
 		 */
-		if (force_request_lsns)
+		if (force_request_lsn)
 		{
-			if (!neon_prefetch_response_usable(*force_request_lsns, slot))
+			if (!neon_prefetch_response_usable(*force_request_lsn,
+											   *force_not_modified_since, slot))
 			{
 				/* Wait for the old request to finish and discard it */
 				if (!prefetch_wait_for(ring_index))
@@ -902,7 +886,7 @@ Retry:
 	slot->shard_no = get_shard_number(&tag);
 	slot->my_ring_index = ring_index;

-	prefetch_do_request(slot, force_request_lsns);
+	prefetch_do_request(slot, force_request_lsn, force_not_modified_since);
 	Assert(slot->status == PRFS_REQUESTED);
 	Assert(MyPState->ring_last <= ring_index &&
 		   ring_index < MyPState->ring_unused);
@@ -924,10 +908,6 @@ Retry:
 	return ring_index;
 }

-/*
- * Note: this function can get canceled and use a long jump to the next catch
- * context. Take care.
- */
 static NeonResponse *
 page_server_request(void const *req)
 {
@@ -959,38 +939,19 @@ page_server_request(void const *req)
 	 * Current sharding model assumes that all metadata is present only at shard 0.
 	 * We still need to call get_shard_no() to check if shard map is up-to-date.
 	 */
-	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest ||
-		((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
 	{
 		shard_no = 0;
 	}

 	do
 	{
-		PG_TRY();
-		{
-			while (!page_server->send(shard_no, (NeonRequest *) req)
-				   || !page_server->flush(shard_no))
-			{
-				/* do nothing */
-			}
-			consume_prefetch_responses();
-			resp = page_server->receive(shard_no);
-		}
-		PG_CATCH();
-		{
-			/*
-			 * Cancellation in this code needs to be handled better at some
-			 * point, but this currently seems fine for now.
-			 */
-			page_server->disconnect(shard_no);
-			PG_RE_THROW();
-		}
-		PG_END_TRY();
-
+		while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
+		consume_prefetch_responses();
+		resp = page_server->receive(shard_no);
 	} while (resp == NULL);
-
 	return resp;
+
 }


@@ -1402,10 +1363,6 @@ PageIsEmptyHeapPage(char *buffer)
 	return memcmp(buffer, empty_page.data, BLCKSZ) == 0;
 }

-/*
- * A page is being evicted from the shared buffer cache. Update the
- * last-written LSN of the page, and WAL-log it if needed.
- */
 static void
 #if PG_MAJORVERSION_NUM < 16
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
@@ -1414,7 +1371,12 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 #endif
 {
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
-	bool		log_page;
+
+	if (ShutdownRequestPending)
+		return;
+	/* Don't log any pages if we're not allowed to do so. */
+	if (!XLogInsertAllowed())
+		return;

 	/*
 	 * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM
@@ -1423,21 +1385,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 	 * correctness, the non-logged updates are not critical. But we want to
 	 * have a reasonably up-to-date VM and FSM in the page server.
 	 */
-	log_page = false;
-	if (force)
-	{
-		Assert(XLogInsertAllowed());
-		log_page = true;
-	}
-	else if (XLogInsertAllowed() &&
-			 !ShutdownRequestPending &&
-			 (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM))
-	{
-		log_page = true;
-	}
-
-	if (log_page)
+	if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
 	{
+		/* FSM is never WAL-logged and we don't care. */
 		XLogRecPtr	recptr;

 		recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum,
@@ -1450,8 +1400,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
 	}
-
-	if (lsn == InvalidXLogRecPtr)
+	else if (lsn == InvalidXLogRecPtr)
 	{
 		/*
 		 * When PostgreSQL extends a relation, it calls smgrextend() with an
@@ -1487,31 +1436,19 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
 		}
-		else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
+		else
 		{
-			/*
-			 * Its a bad sign if there is a page with zero LSN in the buffer
-			 * cache in a standby, too. However, PANICing seems like a cure
-			 * worse than the disease, as the damage has likely already been
-			 * done in the primary. So in a standby, make this an assertion,
-			 * and in a release build just LOG the error and soldier on. We
-			 * update the last-written LSN of the page with a conservative
-			 * value in that case, which is the last replayed LSN.
-			 */
-			ereport(RecoveryInProgress() ? LOG : PANIC,
+			ereport(PANIC,
 					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
 							blocknum,
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
-			Assert(false);
-
-			lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
 		}
 	}
 	else
 	{
 		ereport(SmgrTrace,
-				(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
+				(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
 						blocknum,
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
 						forknum, LSN_FORMAT_ARGS(lsn))));
@@ -1592,11 +1529,11 @@ nm_adjust_lsn(XLogRecPtr lsn)
 /*
 * Return LSN for requesting pages and number of blocks from page server
 */
-static neon_request_lsns
-neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
+static void
+neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
+					 XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since)
 {
 	XLogRecPtr	last_written_lsn;
-	neon_request_lsns result;

 	last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
 	last_written_lsn = nm_adjust_lsn(last_written_lsn);
@@ -1604,98 +1541,13 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)

 	if (RecoveryInProgress())
 	{
-		/*---
-		 * In broad strokes, a replica always requests the page at the current
-		 * replay LSN. But looking closer, what exactly is the replay LSN? Is
-		 * it the last replayed record, or the record being replayed? And does
-		 * the startup process performing the replay need to do something
-		 * differently than backends running queries? Let's take a closer look
-		 * at the different scenarios:
-		 *
-		 * 1. Startup process reads a page, last_written_lsn is old.
-		 *
-		 * Read the old version of the page. We will apply the WAL record on
-		 * it to bring it up-to-date.
-		 *
-		 * We could read the new version, with the changes from this WAL
-		 * record already applied, to offload the work of replaying the record
-		 * to the pageserver. The pageserver might not have received the WAL
-		 * record yet, though, so a read of the old page version and applying
-		 * the record ourselves is likely faster. Also, the redo function
-		 * might be surprised if the changes have already applied. That's
-		 * normal during crash recovery, but not in hot standby.
-		 *
-		 * 2. Startup process reads a page, last_written_lsn == record we're
-		 *    replaying.
-		 *
-		 * Can this happen? There are a few theoretical cases when it might:
-		 *
-		 * A) The redo function reads the same page twice. We had already read
-		 *    and applied the changes once, and now we're reading it for the
-		 *    second time.  That would be a rather silly thing for a redo
-		 *    function to do, and I'm not aware of any that would do it.
-		 *
-		 * B) The redo function modifies multiple pages, and it already
-		 *    applied the changes to one of the pages, released the lock on
-		 *    it, and is now reading a second page.  Furthermore, the first
-		 *    page was already evicted from the buffer cache, and also from
-		 *    the last-written LSN cache, so that the per-relation or global
-		 *    last-written LSN was already updated. All the WAL redo functions
-		 *    hold the locks on pages that they modify, until all the changes
-		 *    have been modified (?), which would make that impossible.
-		 *    However, we skip the locking, if the page isn't currently in the
-		 *    page cache (see neon_redo_read_buffer_filter below).
-		 *
-		 * Even if the one of the above cases were possible in theory, they
-		 * would also require the pages being modified by the redo function to
-		 * be immediately evicted from the page cache.
-		 *
-		 * So this probably does not happen in practice. But if it does, we
-		 * request the new version, including the changes from the record
-		 * being replayed. That seems like the correct behavior in any case.
-		 *
-		 * 3. Backend process reads a page with old last-written LSN
-		 *
-		 * Nothing special here. Read the old version.
-		 *
-		 * 4. Backend process reads a page with last_written_lsn == record being replayed
-		 *
-		 * This can happen, if the redo function has started to run, and saw
-		 * that the page isn't present in the page cache (see
-		 * neon_redo_read_buffer_filter below).  Normally, in a normal
-		 * Postgres server, the redo function would hold a lock on the page,
-		 * so we would get blocked waiting the redo function to release the
-		 * lock. To emulate that, wait for the WAL replay of the record to
-		 * finish.
-		 */
-		/* Request the page at the end of the last fully replayed LSN. */
-		XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
+		/* Request the page at the last replayed LSN. */
+		*request_lsn = GetXLogReplayRecPtr(NULL);
+		*not_modified_since = last_written_lsn;
+		Assert(last_written_lsn <= *request_lsn);

-		if (last_written_lsn > replay_lsn)
-		{
-			/* GetCurrentReplayRecPtr was introduced in v15 */
-#if PG_VERSION_NUM >= 150000
-			Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL));
-#endif
-
-			/*
-			 * Cases 2 and 4. If this is a backend (case 4), the
-			 * neon_read_at_lsn() call later will wait for the WAL record to be
-			 * fully replayed.
-			 */
-			result.request_lsn = last_written_lsn;
-		}
-		else
-		{
-			/* cases 1 and 3 */
-			result.request_lsn = replay_lsn;
-		}
-		result.not_modified_since = last_written_lsn;
-		result.effective_request_lsn = result.request_lsn;
-		Assert(last_written_lsn <= result.request_lsn);
-
-		neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X",
-				 LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since));
+		neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X",
+				 LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since));
 	}
 	else
 	{
@@ -1707,7 +1559,7 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 		 * must still in the buffer cache, so our request cannot concern
 		 * those.
 		 */
-		neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X",
+		neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
 				 LSN_FORMAT_ARGS(last_written_lsn));

 		/*
@@ -1733,33 +1585,16 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 		}

 		/*
-		 * Request the very latest version of the page. In principle we
-		 * want to read the page at the current insert LSN, and we could
-		 * use that value in the request. However, there's a corner case
-		 * with pageserver's garbage collection. If the GC horizon is
-		 * set to a very small value, it's possible that by the time
-		 * that the pageserver processes our request, the GC horizon has
-		 * already moved past the LSN we calculate here. Standby servers
-		 * always have that problem as the can always lag behind the
-		 * primary, but for the primary we can avoid it by always
-		 * requesting the latest page, by setting request LSN to
-		 * UINT64_MAX.
-		 *
-		 * Remember the current LSN, however, so that we can later
-		 * correctly determine if the response to the request is still
-		 * valid. The most up-to-date LSN we could use for that purpose
-		 * would be the current insert LSN, but to avoid the overhead of
-		 * looking it up, use 'flushlsn' instead. This relies on the
-		 * assumption that if the page was modified since the last WAL
-		 * flush, it should still be in the buffer cache, and we
-		 * wouldn't be requesting it.
+		 * Request the latest version of the page. The most up-to-date request
+		 * LSN we could use would be the current insert LSN, but to avoid the
+		 * overhead of looking it up, use 'flushlsn' instead. This relies on
+		 * the assumption that if the page was modified since the last WAL
+		 * flush, it should still be in the buffer cache, and we wouldn't be
+		 * requesting it.
 		 */
-		result.request_lsn = UINT64_MAX;
-		result.not_modified_since = last_written_lsn;
-		result.effective_request_lsn = flushlsn;
+		*request_lsn = flushlsn;
+		*not_modified_since = last_written_lsn;
 	}
-
-	return result;
 }

 /*
@@ -1769,16 +1604,12 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
 * satisfy a page read now.
 */
 static bool
-neon_prefetch_response_usable(neon_request_lsns request_lsns,
+neon_prefetch_response_usable(XLogRecPtr request_lsn, XLogRecPtr not_modified_since,
 							  PrefetchRequest *slot)
 {
 	/* sanity check the LSN's on the old and the new request */
-	Assert(request_lsns.request_lsn >= request_lsns.not_modified_since);
-	Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since);
-	Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn);
-	Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
-	Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
-	Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn);
+	Assert(request_lsn >= not_modified_since);
+	Assert(slot->request_lsn >= slot->not_modified_since);
 	Assert(slot->status != PRFS_UNUSED);

 	/*
@@ -1796,40 +1627,26 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns,
 	 * calculate LSNs "out of order" with each other, but the prefetch queue
 	 * is backend-private at the moment.)
 	 */
-	if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn ||
-		request_lsns.not_modified_since < slot->request_lsns.not_modified_since)
+	if (request_lsn < slot->request_lsn || not_modified_since < slot->not_modified_since)
 	{
 		ereport(LOG,
 				(errcode(ERRCODE_IO_ERROR),
 				 errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
 				 errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
-						   LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
-						   LSN_FORMAT_ARGS(request_lsns.not_modified_since),
-						   LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn),
-						   LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
+						   LSN_FORMAT_ARGS(request_lsn), LSN_FORMAT_ARGS(not_modified_since),
+						   LSN_FORMAT_ARGS(slot->request_lsn), LSN_FORMAT_ARGS(slot->not_modified_since))));
 		return false;
 	}

 	/*---
-	 * Each request to the pageserver has three LSN values associated with it:
-	 * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'.
-	 * `not_modified_since` and `request_lsn` are sent to the pageserver, but
-	 * in the primary node, we always use UINT64_MAX as the `request_lsn`, so
-	 * we remember `effective_request_lsn` separately. In a primary,
-	 * `effective_request_lsn` is the last flush WAL position when the request
-	 * was sent to the pageserver. That's logically the LSN that we are
-	 * requesting the page at, but we send UINT64_MAX to the pageserver so
-	 * that if the GC horizon advances past that position, we still get a
-	 * valid response instead of an error.
-	 *
-	 * To determine whether a response to a GetPage request issued earlier is
-	 * still valid to satisfy a new page read, we look at the
-	 * (not_modified_since, effective_request_lsn] range of the request. It is
-	 * effectively a claim that the page has not been modified between those
-	 * LSNs.  If the range of the old request in the queue overlaps with the
-	 * new request, we know that the page hasn't been modified in the union of
-	 * the ranges. We can use the response to old request to satisfy the new
-	 * request in that case. For example:
+	 * Each request to the pageserver carries two LSN values:
+	 * `not_modified_since` and `request_lsn`. The (not_modified_since,
+	 * request_lsn] range of each request is effectively a claim that the page
+	 * has not been modified between those LSNs.  If the range of the old
+	 * request in the queue overlaps with the new request, we know that the
+	 * page hasn't been modified in the union of the ranges. We can use the
+	 * response to old request to satisfy the new request in that case. For
+	 * example:
 	 *
 	 *              100      500
 	 * Old request:  +--------+
@@ -1858,9 +1675,9 @@ neon_prefetch_response_usable(neon_request_lsns request_lsns,
 	 */

 	/* this follows from the checks above */
-	Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(request_lsn >= slot->not_modified_since);

-	return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn;
+	return not_modified_since <= slot->request_lsn;
 }

 /*
@@ -1872,7 +1689,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 	bool		exists;
 	NeonResponse *resp;
 	BlockNumber n_blocks;
-	neon_request_lsns request_lsns;
+	XLogRecPtr	request_lsn;
+	XLogRecPtr	not_modified_since;

 	switch (reln->smgr_relpersistence)
 	{
@@ -1927,15 +1745,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 		return false;
 	}

-	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO);
+	neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO,
+						 &request_lsn, &not_modified_since);
 	{
 		NeonExistsRequest request = {
 			.req.tag = T_NeonExistsRequest,
-			.req.lsn = request_lsns.request_lsn,
-			.req.not_modified_since = request_lsns.not_modified_since,
+			.req.lsn = request_lsn,
+			.req.not_modified_since = not_modified_since,
 			.rinfo = InfoFromSMgrRel(reln),
-			.forknum = forkNum
-		};
+		.forknum = forkNum};

 		resp = page_server_request(&request);
 	}
@@ -1952,15 +1770,13 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 					 errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forkNum,
-							LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;

 		default:
-			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-										"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
-										T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_exists", resp->tag);
 	}
 	pfree(resp);
 	return exists;
@@ -2319,7 +2135,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)

 	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));

-	ring_index = prefetch_register_buffer(tag, NULL);
+	ring_index = prefetch_register_buffer(tag, NULL, NULL);

 	Assert(ring_index < MyPState->ring_unused &&
 		   MyPState->ring_last <= ring_index);
@@ -2372,10 +2188,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 void
 #if PG_MAJORVERSION_NUM < 16
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 neon_request_lsns request_lsns, char *buffer)
+				 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer)
 #else
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-				 neon_request_lsns request_lsns, void *buffer)
+				 XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer)
 #endif
 {
 	NeonResponse *resp;
@@ -2407,18 +2223,18 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	 * value of the LwLsn cache when the entry is not found.
 	 */
 	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
-		XLogWaitForReplayOf(request_lsns.request_lsn);
+		XLogWaitForReplayOf(request_lsn);

 	/*
 	 * Try to find prefetched page in the list of received pages.
 	 */
-Retry:
+  Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);

 	if (entry != NULL)
 	{
 		slot = entry->slot;
-		if (neon_prefetch_response_usable(request_lsns, slot))
+		if (neon_prefetch_response_usable(request_lsn, not_modified_since, slot))
 		{
 			ring_index = slot->my_ring_index;
 			pgBufferUsage.prefetch.hits += 1;
@@ -2452,7 +2268,8 @@ Retry:
 		{
 			pgBufferUsage.prefetch.misses += 1;

-			ring_index = prefetch_register_buffer(buftag, &request_lsns);
+			ring_index = prefetch_register_buffer(buftag, &request_lsn,
+												  &not_modified_since);
 			slot = GetPrfSlot(ring_index);
 		}
 		else
@@ -2493,14 +2310,12 @@ Retry:
 							slot->shard_no, blkno,
 							RelFileInfoFmt(rinfo),
 							forkNum,
-							LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;
 		default:
-			NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
-										"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
-										T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_at_lsn", resp->tag);
 	}

 	/* buffer was used, clean up for later reuse */
@@ -2518,7 +2333,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer
 neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
 #endif
 {
-	neon_request_lsns request_lsns;
+	XLogRecPtr	request_lsn;
+	XLogRecPtr	not_modified_since;

 	switch (reln->smgr_relpersistence)
 	{
@@ -2543,8 +2359,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 		return;
 	}

-	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno);
-	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
+	neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno,
+						 &request_lsn, &not_modified_since);
+	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -2713,7 +2530,8 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 {
 	NeonResponse *resp;
 	BlockNumber n_blocks;
-	neon_request_lsns request_lsns;
+	XLogRecPtr	request_lsn;
+	XLogRecPtr	not_modified_since;

 	switch (reln->smgr_relpersistence)
 	{
@@ -2740,12 +2558,13 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 		return n_blocks;
 	}

-	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO);
+	neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO,
+						 &request_lsn, &not_modified_since);
 	{
 		NeonNblocksRequest request = {
 			.req.tag = T_NeonNblocksRequest,
-			.req.lsn = request_lsns.request_lsn,
-			.req.not_modified_since = request_lsns.not_modified_since,
+			.req.lsn = request_lsn,
+			.req.not_modified_since = not_modified_since,
 			.rinfo = InfoFromSMgrRel(reln),
 			.forknum = forknum,
 		};
@@ -2765,23 +2584,21 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 					 errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum,
-							LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;

 		default:
-			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-										"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
-										T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_nblocks", resp->tag);
 	}
 	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);

 	neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
-			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-			 forknum,
-			 LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
-			 n_blocks);
+		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+		 forknum,
+		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+		 n_blocks);

 	pfree(resp);
 	return n_blocks;
@@ -2795,15 +2612,17 @@ neon_dbsize(Oid dbNode)
 {
 	NeonResponse *resp;
 	int64		db_size;
-	neon_request_lsns request_lsns;
+	XLogRecPtr	request_lsn,
+				not_modified_since;
 	NRelFileInfo dummy_node = {0};

-	request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO);
+	neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO,
+						 &request_lsn, &not_modified_since);
 	{
 		NeonDbSizeRequest request = {
 			.req.tag = T_NeonDbSizeRequest,
-			.req.lsn = request_lsns.request_lsn,
-			.req.not_modified_since = request_lsns.not_modified_since,
+			.req.lsn = request_lsn,
+			.req.not_modified_since = not_modified_since,
 			.dbNode = dbNode,
 		};

@@ -2820,19 +2639,20 @@ neon_dbsize(Oid dbNode)
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
 					 errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X",
-							dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
+							dbNode,
+							(uint32) (request_lsn >> 32), (uint32) request_lsn),
 					 errdetail("page server returned error: %s",
 							   ((NeonErrorResponse *) resp)->message)));
 			break;

 		default:
-			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-										"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
-										T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_dbsize", resp->tag);
 	}

 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
-			 dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
+		 dbNode,
+		 (uint32) (request_lsn >> 32), (uint32) request_lsn,
+		 db_size);

 	pfree(resp);
 	return db_size;
@@ -2992,14 +2812,10 @@ neon_start_unlogged_build(SMgrRelation reln)
 	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;

 	/*
-	 * Create the local file. In a parallel build, the leader is expected to
-	 * call this first and do it.
-	 *
 	 * FIXME: should we pass isRedo true to create the tablespace dir if it
 	 * doesn't exist? Is it needed?
 	 */
-	if (!IsParallelWorker())
-		mdcreate(reln, MAIN_FORKNUM, false);
+	mdcreate(reln, MAIN_FORKNUM, false);
 }

 /*
@@ -3023,17 +2839,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
 	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

-	/*
-	 * In a parallel build, (only) the leader process performs the 2nd
-	 * phase.
-	 */
-	if (IsParallelWorker())
-	{
-		unlogged_build_rel = NULL;
-		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
-	}
-	else
-		unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
+	unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
 }

 /*
@@ -3091,10 +2897,6 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	XLogRecPtr request_lsn,
 		not_modified_since;

-	/*
-	 * Compute a request LSN to use, similar to neon_get_request_lsns() but the
-	 * logic is a bit simpler.
-	 */
 	if (RecoveryInProgress())
 	{
 		request_lsn = GetXLogReplayRecPtr(NULL);
@@ -3106,10 +2908,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 			 */
 			request_lsn = GetRedoStartLsn();
 		}
-		request_lsn = nm_adjust_lsn(request_lsn);
 	}
 	else
-		request_lsn = UINT64_MAX;
+		request_lsn = GetXLogInsertRecPtr();
+	request_lsn = nm_adjust_lsn(request_lsn);

 	/*
 	 * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
@@ -3167,9 +2969,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 			break;

 		default:
-			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-										"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
-										T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
+			neon_log(ERROR, "unexpected response from page server with tag 0x%02x in neon_read_slru_segment", resp->tag);
 	}
 	pfree(resp);

@@ -3387,7 +3187,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	BufferTag	tag;
 	uint32		hash;
 	LWLock	   *partitionLock;
-	int			buf_id;
+	Buffer		buffer;
 	bool		no_redo_needed;

 	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
@@ -3425,20 +3225,20 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	else
 	{
 		/* Try to find the relevant buffer */
-		buf_id = BufTableLookup(&tag, hash);
+		buffer = BufTableLookup(&tag, hash);

-		no_redo_needed = buf_id < 0;
+		no_redo_needed = buffer < 0;
 	}
+	/* In both cases set lwLsn past this WAL record */
+	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);

 	/*
 	 * we don't have the buffer in memory, update lwLsn past this record, also
 	 * evict page from file cache
 	 */
 	if (no_redo_needed)
-	{
-		SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
 		lfc_evict(rinfo, forknum, blkno);
-	}
+

 	LWLockRelease(partitionLock);

--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1852,30 +1852,34 @@ static void
 CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
 {
 	hs->ts = 0;
-	hs->xmin = InvalidFullTransactionId;
-	hs->catalog_xmin = InvalidFullTransactionId;
+	hs->xmin.value = ~0;		/* sentinel: largest unsigned value, so any normal xid precedes it */
+	hs->catalog_xmin.value = ~0;	/* sentinel: largest unsigned value, so any normal xid precedes it */
 
 	for (int i = 0; i < wp->n_safekeepers; i++)
 	{
-
-		if (wp->safekeeper[i].state == SS_ACTIVE)
+		if (wp->safekeeper[i].appendResponse.hs.ts != 0)	/* presumably ts == 0 means no feedback yet - TODO confirm */
 		{
 			HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs;
 
 			if (FullTransactionIdIsNormal(skhs->xmin)
-				&& (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin)))
+				&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
 			{
 				hs->xmin = skhs->xmin;
 				hs->ts = skhs->ts;
 			}
 			if (FullTransactionIdIsNormal(skhs->catalog_xmin)
-				&& (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)))
+				&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin))	/* compare against the catalog_xmin minimum, not xmin */
 			{
 				hs->catalog_xmin = skhs->catalog_xmin;
 				hs->ts = skhs->ts;
 			}
 		}
 	}
+
+	if (hs->xmin.value == ~0)	/* sentinel untouched: no safekeeper reported a normal xmin */
+		hs->xmin = InvalidFullTransactionId;
+	if (hs->catalog_xmin.value == ~0)	/* sentinel untouched: no safekeeper reported a normal catalog_xmin */
+		hs->catalog_xmin = InvalidFullTransactionId;
 }

 /*
@@ -1942,28 +1946,14 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
 	}

 	CombineHotStanbyFeedbacks(&hsFeedback, wp);
-	if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
+	if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
 	{
-		FullTransactionId xmin = hsFeedback.xmin;
-		FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
-		FullTransactionId next_xid = ReadNextFullTransactionId();
-		/*
-		 * Page server is updating nextXid in checkpoint each 1024 transactions,
-		 * so feedback xmin can be actually larger then nextXid and
-		 * function TransactionIdInRecentPast return false in this case,
-		 * preventing update of slot's xmin.
-		 */
-		if (FullTransactionIdPrecedes(next_xid, xmin))
-			xmin = next_xid;
-		if (FullTransactionIdPrecedes(next_xid, catalog_xmin))
-			catalog_xmin = next_xid;
 		agg_hs_feedback = hsFeedback;
-		elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin));
 		ProcessStandbyHSFeedback(hsFeedback.ts,
-								 XidFromFullTransactionId(xmin),
-								 EpochFromFullTransactionId(xmin),
-								 XidFromFullTransactionId(catalog_xmin),
-								 EpochFromFullTransactionId(catalog_xmin));
+								 XidFromFullTransactionId(hsFeedback.xmin),
+								 EpochFromFullTransactionId(hsFeedback.xmin),
+								 XidFromFullTransactionId(hsFeedback.catalog_xmin),
+								 EpochFromFullTransactionId(hsFeedback.catalog_xmin));
 	}

 	CheckGracefulShutdown(wp);
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush);
 */
 #if PG_MAJORVERSION_NUM < 16
 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-									   neon_request_lsns request_lsns, char *buffer);
+									   XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
 #else
 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-									   neon_request_lsns request_lsns, void *buffer);
+									   XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
 #endif

 static neon_read_at_lsn_type neon_read_at_lsn_ptr;
@@ -298,7 +298,9 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	text	   *relname;
 	text	   *forkname;
 	uint32		blkno;
-	neon_request_lsns	request_lsns;
+
+	XLogRecPtr	request_lsn;
+	XLogRecPtr	not_modified_since;

 	if (PG_NARGS() != 5)
 		elog(ERROR, "unexpected number of arguments in SQL function signature");
@@ -310,15 +312,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	forkname = PG_GETARG_TEXT_PP(1);
 	blkno = PG_GETARG_UINT32(2);

-	request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
-	request_lsns.not_modified_since = PG_ARGISNULL(4) ? request_lsns.request_lsn : PG_GETARG_LSN(4);
-	/*
-	 * For the time being, use the same LSN for request and
-	 * effective request LSN. If any test needed to use UINT64_MAX
-	 * as the request LSN, we'd need to add effective_request_lsn
-	 * as a new argument.
-	 */
-	request_lsns.effective_request_lsn = request_lsns.request_lsn;
+	request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3);
+	not_modified_since = PG_ARGISNULL(4) ? request_lsn : PG_GETARG_LSN(4);

 	if (!superuser())
 		ereport(ERROR,
@@ -372,8 +367,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
 	SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 	raw_page_data = VARDATA(raw_page);

-	neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsns,
-					 raw_page_data);
+	neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsn, not_modified_since, raw_page_data);

 	relation_close(rel, AccessShareLock);

@@ -419,25 +413,19 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)

 		ForkNumber	forknum = PG_GETARG_UINT32(3);
 		uint32		blkno = PG_GETARG_UINT32(4);
-		neon_request_lsns	request_lsns;
+		XLogRecPtr	request_lsn;
+		XLogRecPtr	not_modified_since;

 		/* Initialize buffer to copy to */
 		bytea	   *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);

-		request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
-		request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6);
-		/*
-		 * For the time being, use the same LSN for request
-		 * and effective request LSN. If any test needed to
-		 * use UINT64_MAX as the request LSN, we'd need to add
-		 * effective_request_lsn as a new argument.
-		 */
-		request_lsns.effective_request_lsn = request_lsns.request_lsn;
+		request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5);
+		not_modified_since = PG_ARGISNULL(6) ? request_lsn : PG_GETARG_LSN(6);

 		SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
 		raw_page_data = VARDATA(raw_page);

-		neon_read_at_lsn(rinfo, forknum, blkno, request_lsns, raw_page_data);
+		neon_read_at_lsn(rinfo, forknum, blkno, request_lsn, not_modified_since, raw_page_data);
 		PG_RETURN_BYTEA_P(raw_page);
 	}
 }
--- a/poetry.lock
+++ b/poetry.lock
@@ -2405,7 +2405,6 @@ files = [
    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
-    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2530,13 +2529,13 @@ files = [

 [[package]]
 name = "requests"
-version = "2.32.0"
+version = "2.31.0"
 description = "Python HTTP for Humans."
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.7"
 files = [
-    {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"},
-    {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"},
+    {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
+    {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
 ]

 [package.dependencies]
@@ -2960,16 +2959,6 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -3207,4 +3196,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
+content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -9,7 +9,6 @@ default = []
 testing = []

 [dependencies]
-ahash.workspace = true
 anyhow.workspace = true
 async-compression.workspace = true
 async-trait.workspace = true
@@ -25,10 +24,8 @@ camino.workspace = true
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
-crossbeam-deque.workspace = true
 dashmap.workspace = true
 env_logger.workspace = true
-framed-websockets.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hashbrown.workspace = true
@@ -38,6 +35,7 @@ hmac.workspace = true
 hostname.workspace = true
 http.workspace = true
 humantime.workspace = true
+hyper-tungstenite.workspace = true
 hyper.workspace = true
 hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
 hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
@@ -54,6 +52,7 @@ opentelemetry.workspace = true
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
+pbkdf2 = { workspace = true, features = ["simple", "std"] }
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
 pq_proto.workspace = true
@@ -77,6 +76,7 @@ smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
 subtle.workspace = true
+sync_wrapper.workspace = true
 task-local-extensions.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
@@ -106,8 +106,6 @@ workspace_hack.workspace = true
 [dev-dependencies]
 camino-tempfile.workspace = true
 fallible-iterator.workspace = true
-tokio-tungstenite.workspace = true
-pbkdf2 = { workspace = true, features = ["simple", "std"] }
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
--- a/Show More
+++ b/Show More