untrigger benchmarks

test: wait for attachment service readiness
Update README, include cleanup details (#6816)
2026-02-16 00:50:36 +00:00 · 2024-02-27 09:49:39 +00:00 · 2024-02-26 19:49:46 +00:00 · 2024-02-26 18:53:48 +01:00 · 2024-02-26 18:17:22 +01:00 · 2024-02-26 17:28:00 +01:00
133 changed files with 4830 additions and 1957 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -39,7 +39,7 @@ runs:
        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
        if [ "${PR_NUMBER}" != "null" ]; then
          BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
          # Shortcut for special branches
          BRANCH_OR_PR=${GITHUB_REF_NAME}
        else
@@ -76,8 +76,8 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.24.0
-        ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90
+        ALLURE_VERSION: 2.27.0
+        ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777

    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
    - name: Acquire lock
--- a/.github/actions/allure-report-store/action.yml
+++ b/.github/actions/allure-report-store/action.yml
@@ -19,7 +19,7 @@ runs:
        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
        if [ "${PR_NUMBER}" != "null" ]; then
          BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
          # Shortcut for special branches
          BRANCH_OR_PR=${GITHUB_REF_NAME}
        else
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -16,8 +16,14 @@ concurrency:
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

 jobs:
-  actionlint:
+  check-permissions:
    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
+    uses: ./.github/workflows/check-permissions.yml
+    with:
+      github-event-name: ${{ github.event_name }}
+
+  actionlint:
+    needs: [ check-permissions ]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -5,6 +5,7 @@ on:
    branches:
      - main
      - release
+      - release-proxy
  pull_request:

 defaults:
@@ -27,24 +28,9 @@ env:
 jobs:
  check-permissions:
    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
-    runs-on: ubuntu-latest
-    steps:
-    - name: Disallow PRs from forks
-      if: |
-        github.event_name == 'pull_request' &&
-        github.event.pull_request.head.repo.full_name != github.repository
-
-      run: |
-        if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
-          MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
-        else
-          MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
-        fi
-
-        echo >&2 "We don't run CI for PRs from forks"
-        echo >&2 "${MESSAGE}"
-
-        exit 1
+    uses: ./.github/workflows/check-permissions.yml
+    with:
+      github-event-name: ${{ github.event_name }}

  cancel-previous-e2e-tests:
    needs: [ check-permissions ]
@@ -82,6 +68,8 @@ jobs:
            echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
@@ -472,6 +460,7 @@ jobs:
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
+          PAGESERVER_GET_VECTORED_IMPL: vectored

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
@@ -696,7 +685,7 @@ jobs:
            })

  trigger-e2e-tests:
-    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' }}
+    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
    needs: [ check-permissions, promote-images, tag ]
    uses: ./.github/workflows/trigger-e2e-tests.yml
    secrets: inherit
@@ -704,158 +693,173 @@ jobs:
  neon-image:
    needs: [ check-permissions, build-buildtools-image, tag ]
    runs-on: [ self-hosted, gen3, large ]
-    container: gcr.io/kaniko-project/executor:v1.9.2-debug
-    defaults:
-      run:
-        shell: sh -eu {0}

    steps:
      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 0

-      - name: Configure ECR and Docker Hub login
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
        run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+      - uses: docker/setup-buildx-action@v3

-          cat <<-EOF > /kaniko/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              },
-              "credHelpers": {
-                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
-              }
-            }
-          EOF
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - name: Kaniko build neon
-        run:
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
-                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-                           --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
-                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
+      - uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile
+          cache-from: type=registry,ref=neondatabase/neon:cache
+          cache-to: type=registry,ref=neondatabase/neon:cache,mode=max
+          tags: |
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
+            neondatabase/neon:${{needs.tag.outputs.build-tag}}
+
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom

  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ check-permissions, build-buildtools-image, tag ]
-    container: gcr.io/kaniko-project/executor:v1.9.2-debug
-    defaults:
-      run:
-        shell: sh -eu {0}

    steps:
      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
-
-      - name: Configure ECR and Docker Hub login
-        run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
-
-          cat <<-EOF > /kaniko/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              },
-              "credHelpers": {
-                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
-              }
-            }
-          EOF
-
-      - name: Kaniko build compute tools
-        run:
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
-                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-                           --dockerfile Dockerfile.compute-tools
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
-                           --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
-
-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
-
-  compute-node-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
-    runs-on: [ self-hosted, gen3, large ]
-    container:
-      image: gcr.io/kaniko-project/executor:v1.9.2-debug
-      # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution.""
-      # Should be prevented by https://github.com/neondatabase/neon/issues/4281
-      options: --add-host=download.osgeo.org:140.211.15.30
-    strategy:
-      fail-fast: false
-      matrix:
-        version: [ v14, v15, v16 ]
-    defaults:
-      run:
-        shell: sh -eu {0}
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1 # v3 won't work with kaniko
+        uses: actions/checkout@v4
        with:
          submodules: true
          fetch-depth: 0

-      - name: Configure ECR and Docker Hub login
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
        run: |
-          DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64)
-          echo "::add-mask::${DOCKERHUB_AUTH}"
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+      - uses: docker/setup-buildx-action@v3

-          cat <<-EOF > /kaniko/.docker/config.json
-            {
-              "auths": {
-                "https://index.docker.io/v1/": {
-                  "auth": "${DOCKERHUB_AUTH}"
-                }
-              },
-              "credHelpers": {
-                "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login"
-              }
-            }
-          EOF
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - name: Kaniko build compute node with extensions
-        run:
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
-                           --context .
-                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg PG_VERSION=${{ matrix.version }}
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
-                           --dockerfile Dockerfile.compute-node
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-                           --cleanup
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
-      - name: Cleanup ECR folder
-        run: rm -rf ~/.ecr
+      - uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            BUILD_TAG=${{needs.tag.outputs.build-tag}}
+            TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
+            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-tools
+          cache-from: type=registry,ref=neondatabase/compute-tools:cache
+          cache-to: type=registry,ref=neondatabase/compute-tools:cache,mode=max
+          tags: |
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
+            neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
+
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
+
+  compute-node-image:
+    needs: [ check-permissions, build-buildtools-image, tag ]
+    runs-on: [ self-hosted, gen3, large ]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        version: [ v14, v15, v16 ]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+      - uses: docker/setup-buildx-action@v3
+        with:
+          # Disable parallelism for docker buildkit.
+          # As we already build everything with `make -j$(nproc)`, running it with an additional level of parallelism blows up the Runner.
+          config-inline: |
+            [worker.oci]
+              max-parallelism = 1
+
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - uses: docker/login-action@v3
+        with:
+          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+          password: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+      - uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            PG_VERSION=${{ matrix.version }}
+            BUILD_TAG=${{needs.tag.outputs.build-tag}}
+            TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
+            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-node
+          cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache
+          cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache,mode=max
+          tags: |
+            369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+            neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+
+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom

  vm-compute-node-image:
    needs: [ check-permissions, tag, compute-node-image ]
@@ -966,9 +970,7 @@ jobs:
          crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16

      - name: Add latest tag to images
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-           github.event_name != 'workflow_dispatch'
+        if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
        run: |
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -980,9 +982,7 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-           github.event_name != 'workflow_dispatch'
+        if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
        run: |
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
@@ -1006,9 +1006,7 @@ jobs:
          crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}}

      - name: Push latest tags to Docker Hub
-        if: |
-          (github.ref_name == 'main' || github.ref_name == 'release') &&
-          github.event_name != 'workflow_dispatch'
+        if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'
        run: |
          crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
@@ -1098,7 +1096,7 @@ jobs:

  deploy:
    needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
-    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
+    if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy'

    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1133,14 +1131,28 @@ jobs:
            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
+              -f deployPgSniRouter=false \
+              -f deployProxy=false \
+              -f deployStorage=true \
+              -f deployStorageBroker=true \
+              -f branch=main \
+              -f dockerTag=${{needs.tag.outputs.build-tag}}
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
+              -f deployPgSniRouter=true \
+              -f deployProxy=true \
+              -f deployStorage=false \
+              -f deployStorageBroker=false \
+              -f branch=main \
+              -f dockerTag=${{needs.tag.outputs.build-tag}}
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            exit 1
          fi

      - name: Create git tag
-        if: github.ref_name == 'release'
+        if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
        uses: actions/github-script@v7
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
@@ -1153,6 +1165,7 @@ jobs:
              sha: context.sha,
            })

+      # TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
      - name: Create GitHub release
        if: github.ref_name == 'release'
        uses: actions/github-script@v7
--- a/.github/workflows/check-permissions.yml
+++ b/.github/workflows/check-permissions.yml
@@ -0,0 +1,36 @@
+name: Check Permissions
+
+on:
+  workflow_call:
+    inputs:
+      github-event-name:
+        required: true
+        type: string
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+jobs:
+  check-permissions:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Disallow CI runs on PRs from forks
+      if: |
+        inputs.github-event-name == 'pull_request' &&
+        github.event.pull_request.head.repo.full_name != github.repository
+      run: |
+        if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then
+          MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork"
+        else
+          MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run"
+        fi
+
+        # TODO: use actions/github-script to post this message as a PR comment
+        echo >&2 "We don't run CI for PRs from forks"
+        echo >&2 "${MESSAGE}"
+
+        exit 1
--- a/.github/workflows/cleanup-caches-by-a-branch.yml
+++ b/.github/workflows/cleanup-caches-by-a-branch.yml
@@ -0,0 +1,32 @@
+# A workflow from
+# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries
+
+name: cleanup caches by a branch
+on:
+  pull_request:
+    types:
+      - closed
+
+jobs:
+  cleanup:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cleanup  # deletes the Actions caches scoped to the closed PR's merge ref
+        run: |
+          gh extension install actions/gh-actions-cache
+
+          echo "Fetching list of cache key"
+          cacheKeysForPR=$(gh actions-cache list -R "$REPO" -B "$BRANCH" -L 100 | cut -f 1 )
+
+          ## Setting this to not fail the workflow while deleting cache keys.
+          set +e
+          echo "Deleting caches..."
+          for cacheKey in $cacheKeysForPR
+          do
+              gh actions-cache delete "$cacheKey" -R "$REPO" -B "$BRANCH" --confirm
+          done
+          echo "Done"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
+          BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -20,7 +20,14 @@ env:
  COPT: '-Werror'

 jobs:
+  check-permissions:
+    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
+    uses: ./.github/workflows/check-permissions.yml
+    with:
+      github-event-name: ${{ github.event_name }}
+
  check-macos-build:
+    needs: [ check-permissions ]
    if: |
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
@@ -116,8 +123,8 @@ jobs:
        run: ./run_clippy.sh

  check-linux-arm-build:
+    needs: [ check-permissions ]
    timeout-minutes: 90
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
    runs-on: [ self-hosted, dev, arm64 ]

    env:
@@ -237,8 +244,8 @@ jobs:
          cargo nextest run --package remote_storage --test test_real_azure

  check-codestyle-rust-arm:
+    needs: [ check-permissions ]
    timeout-minutes: 90
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
    runs-on: [ self-hosted, dev, arm64 ]

    container:
@@ -309,6 +316,7 @@ jobs:
        run: cargo deny check

  gather-rust-build-stats:
+    needs: [ check-permissions ]
    if: |
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -2,12 +2,31 @@ name: Create Release Branch

 on:
  schedule:
-    - cron: '0 6 * * 1'
+    # It should be kept in sync with if-condition in jobs
+    - cron: '0 6 * * MON' # Storage release
+    - cron: '0 6 * * THU' # Proxy release
  workflow_dispatch:
+    inputs:
+      create-storage-release-branch:
+        type: boolean
+        description: 'Create Storage release PR'
+        required: false
+      create-proxy-release-branch:
+        type: boolean
+        description: 'Create Proxy release PR'
+        required: false
+
+# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
+permissions: {}
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}

 jobs:
-  create_release_branch:
-    runs-on: [ ubuntu-latest ]
+  create-storage-release-branch:
+    if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
+    runs-on: ubuntu-latest

    permissions:
      contents: write # for `git push`
@@ -18,27 +37,67 @@ jobs:
      with:
        ref: main

-    - name: Get current date
-      id: date
-      run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+    - name: Set environment variables
+      run: |
+        echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+        echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV

    - name: Create release branch
-      run: git checkout -b releases/${{ steps.date.outputs.date }}
+      run: git checkout -b $RELEASE_BRANCH

    - name: Push new branch
-      run: git push origin releases/${{ steps.date.outputs.date }}
+      run: git push origin $RELEASE_BRANCH

    - name: Create pull request into release
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
        cat << EOF > body.md
-          ## Release ${{ steps.date.outputs.date }}
+          ## Release ${RELEASE_DATE}

-          **Please merge this PR using 'Create a merge commit'!**
+          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF

-        gh pr create --title "Release ${{ steps.date.outputs.date }}" \
+        gh pr create --title "Release ${RELEASE_DATE}" \
                     --body-file "body.md" \
-                     --head "releases/${{ steps.date.outputs.date }}" \
+                     --head "${RELEASE_BRANCH}" \
                     --base "release"
+
+  create-proxy-release-branch:
+    if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: write # for `git push`
+
+    steps:
+    - name: Check out code
+      uses: actions/checkout@v4
+      with:
+        ref: main
+
+    - name: Set environment variables
+      run: |
+        echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+        echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV
+
+    - name: Create release branch
+      run: git checkout -b $RELEASE_BRANCH
+
+    - name: Push new branch
+      run: git push origin $RELEASE_BRANCH
+
+    - name: Create pull request into release  # the PR is opened against the release-proxy base branch
+      env:
+        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+      run: |
+        cat << EOF > body.md
+          ## Proxy release ${RELEASE_DATE}
+
+          **Please merge this Pull Request using 'Create a merge commit' button**
+        EOF
+
+        gh pr create --title "Proxy release ${RELEASE_DATE}" \
+                     --body-file "body.md" \
+                     --head "${RELEASE_BRANCH}" \
+                     --base "release-proxy"
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -51,6 +51,8 @@ jobs:
            echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            echo "tag=release-proxy-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,10 +1,10 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/control_plane/ @neondatabase/compute @neondatabase/storage
-/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
+/control_plane/attachment_service @neondatabase/storage
+/libs/pageserver_api/ @neondatabase/storage
 /libs/postgres_ffi/ @neondatabase/compute
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
-/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
+/libs/vm_monitor/ @neondatabase/autoscaling
 /pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
 /proxy/ @neondatabase/proxy
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -284,6 +284,7 @@ dependencies = [
 "diesel_migrations",
 "futures",
 "git-version",
+ "humantime",
 "hyper",
 "metrics",
 "once_cell",
@@ -3551,7 +3552,9 @@ dependencies = [
 "const_format",
 "enum-map",
 "hex",
+ "humantime",
 "humantime-serde",
+ "itertools",
 "postgres_ffi",
 "rand 0.8.5",
 "serde",
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -786,6 +786,22 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_iv
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control

+#########################################################################################
+#
+# Layer "pg_partman"
+# compile pg_partman extension
+#
+#########################################################################################
+FROM build-deps AS pg-partman-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \
+    echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \
+    mkdir pg_partman-src && cd pg_partman-src && tar xvzf ../pg_partman.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control

 #########################################################################################
 #
@@ -829,6 +845,7 @@ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.

 ## Quick start
-Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
+Try the [Neon Free Tier](https://neon.tech) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.

 Alternatively, compile and run the project [locally](#running-local-installation).

@@ -230,6 +230,10 @@ postgres=# select * from t;
 > cargo neon stop
 ```

+#### Handling build failures
+
+If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.
+
 ## Running tests

 Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
@@ -259,6 +263,12 @@ You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or th
 > It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
 > See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).

+## Cleanup
+
+For cleaning up the source tree from build artifacts, run `make clean` in the source directory.
+
+For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned!
+
 ## Documentation

 [docs](/docs) Contains a top-level overview of all available markdown documentation.
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -45,7 +45,6 @@ use std::{thread, time::Duration};
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
-use nix::sys::signal::{kill, Signal};
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info};
@@ -53,7 +52,9 @@ use url::Url;

 use compute_api::responses::ComputeStatus;

-use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
+use compute_tools::compute::{
+    forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
+};
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
@@ -394,6 +395,15 @@ fn main() -> Result<()> {
        info!("synced safekeepers at lsn {lsn}");
    }

+    let mut state = compute.state.lock().unwrap();
+    if state.status == ComputeStatus::TerminationPending {
+        state.status = ComputeStatus::Terminated;
+        compute.state_changed.notify_all();
+        // we were asked to terminate gracefully, don't exit to avoid restart
+        delay_exit = true
+    }
+    drop(state);
+
    if let Err(err) = compute.check_for_core_dumps() {
        error!("error while checking for core dumps: {err:?}");
    }
@@ -523,16 +533,7 @@ fn cli() -> clap::Command {
 /// wait for termination which would be easy then.
 fn handle_exit_signal(sig: i32) {
    info!("received {sig} termination signal");
-    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
-    if ss_pid != 0 {
-        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
-        kill(ss_pid, Signal::SIGTERM).ok();
-    }
-    let pg_pid = PG_PID.load(Ordering::SeqCst);
-    if pg_pid != 0 {
-        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        kill(pg_pid, Signal::SIGTERM).ok();
-    }
+    forward_termination_signal();
    exit(1);
 }

--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -28,6 +28,8 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

+use nix::sys::signal::{kill, Signal};
+
 use remote_storage::{DownloadError, RemotePath};

 use crate::checker::create_availability_check_data;
@@ -1322,3 +1324,17 @@ LIMIT 100",
        Ok(remote_ext_metrics)
    }
 }
+
+pub fn forward_termination_signal() {
+    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
+    if ss_pid != 0 {
+        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
+        kill(ss_pid, Signal::SIGTERM).ok();
+    }
+    let pg_pid = PG_PID.load(Ordering::SeqCst);
+    if pg_pid != 0 {
+        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
+        // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
+        kill(pg_pid, Signal::SIGQUIT).ok();
+    }
+}
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -82,6 +82,12 @@ pub fn write_postgres_conf(
        ComputeMode::Replica => {
            // hot_standby is 'on' by default, but let's be explicit
            writeln!(file, "hot_standby=on")?;
+
+            // Inform the replica about the primary state
+            // Default is 'false'
+            if let Some(primary_is_running) = spec.primary_is_running {
+                writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
+            }
        }
    }

--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -5,6 +5,7 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;

+use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
@@ -123,6 +124,17 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

+        (&Method::POST, "/terminate") => {
+            info!("serving /terminate POST request");
+            match handle_terminate_request(compute).await {
+                Ok(()) => Response::new(Body::empty()),
+                Err((msg, code)) => {
+                    error!("error handling /terminate request: {msg}");
+                    render_json_error(&msg, code)
+                }
+            }
+        }
+
        // download extension files from remote extension storage on demand
        (&Method::POST, route) if route.starts_with("/extension_server/") => {
            info!("serving {:?} POST request", route);
@@ -297,6 +309,49 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
        .unwrap()
 }

+async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
+    {
+        let mut state = compute.state.lock().unwrap();
+        if state.status == ComputeStatus::Terminated {
+            return Ok(());
+        }
+        if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
+            let msg = format!(
+                "invalid compute status for termination request: {:?}",
+                state.status.clone()
+            );
+            return Err((msg, StatusCode::PRECONDITION_FAILED));
+        }
+        state.status = ComputeStatus::TerminationPending;
+        compute.state_changed.notify_all();
+        drop(state);
+    }
+    forward_termination_signal();
+    info!("sent signal and notified waiters");
+
+    // Spawn a blocking thread to wait for compute to become Terminated.
+    // This is needed so that we do not block the main pool of workers and
+    // remain able to serve other requests while this particular request
+    // is waiting for compute to finish terminating.
+    let c = compute.clone();
+    task::spawn_blocking(move || {
+        let mut state = c.state.lock().unwrap();
+        while state.status != ComputeStatus::Terminated {
+            state = c.state_changed.wait(state).unwrap();
+            info!(
+                "waiting for compute to become Terminated, current status: {:?}",
+                state.status
+            );
+        }
+
+        Ok(())
+    })
+    .await
+    .unwrap()?;
+    info!("terminated Postgres");
+    Ok(())
+}
+
 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
 async fn serve(port: u16, state: Arc<ComputeNode>) {
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -168,6 +168,29 @@ paths:
              schema:
                $ref: "#/components/schemas/GenericError"

+  /terminate:
+    post:
+      tags:
+      - Terminate
+      summary: Terminate Postgres and wait for it to exit
+      description: ""
+      operationId: terminate
+      responses:
+        200:
+          description: Result
+        412:
+          description: "wrong state"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: "Unexpected error"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
 components:
  securitySchemes:
    JWT:
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -655,6 +655,9 @@ pub fn handle_grants(
        // remove this code if possible. The worst thing that could happen is that
        // user won't be able to use public schema in NEW databases created in the
        // very OLD project.
+        //
+        // Also, alter default permissions so that relations created by extensions can be
+        // used by neon_superuser without permission issues.
        let grant_query = "DO $$\n\
                BEGIN\n\
                    IF EXISTS(\n\
@@ -673,6 +676,8 @@ pub fn handle_grants(
                            GRANT CREATE ON SCHEMA public TO web_access;\n\
                        END IF;\n\
                    END IF;\n\
+                    ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\
+                    ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\
                END\n\
            $$;"
        .to_string();
@@ -777,6 +782,12 @@ BEGIN
 END
 $$;"#,
        "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
+        // Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else.
+        "",
+        "",
+        "",
+        "",
+        // Add new migrations below.
    ];

    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
@@ -803,8 +814,13 @@ $$;"#,
    client.simple_query(query)?;

    while current_migration < migrations.len() {
-        info!("Running migration:\n{}\n", migrations[current_migration]);
-        client.simple_query(migrations[current_migration])?;
+        let migration = &migrations[current_migration];
+        if migration.is_empty() {
+            info!("Skip migration id={}", current_migration);
+        } else {
+            info!("Running migration:\n{}\n", migration);
+            client.simple_query(migration)?;
+        }
        current_migration += 1;
    }
    let setval = format!(
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -18,6 +18,7 @@ clap.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hyper.workspace = true
+humantime.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -4,7 +4,7 @@ use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use pageserver_api::models::{
    TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
-    TimelineCreateRequest,
+    TenantTimeTravelRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
@@ -12,7 +12,7 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};
 use utils::auth::SwappableJwtAuth;
 use utils::http::endpoint::{auth_middleware, request_span};
-use utils::http::request::parse_request_param;
+use utils::http::request::{must_get_query_param, parse_request_param};
 use utils::id::{TenantId, TimelineId};

 use utils::{
@@ -66,14 +66,7 @@ fn get_state(request: &Request<Body>) -> &HttpState {
 async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
    let state = get_state(&req);
-    json_response(
-        StatusCode::OK,
-        state
-            .service
-            .re_attach(reattach_req)
-            .await
-            .map_err(ApiError::InternalServerError)?,
-    )
+    json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?)
 }

 /// Pageserver calls into this before doing deletions, to confirm that it still
@@ -114,7 +107,10 @@ async fn handle_tenant_create(
    mut req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
-    json_response(StatusCode::OK, service.tenant_create(create_req).await?)
+    json_response(
+        StatusCode::CREATED,
+        service.tenant_create(create_req).await?,
+    )
 }

 // For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
@@ -177,6 +173,39 @@ async fn handle_tenant_location_config(
    )
 }

+async fn handle_tenant_time_travel_remote_storage(
+    service: Arc<Service>,
+    mut req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let time_travel_req = json_request::<TenantTimeTravelRequest>(&mut req).await?;
+
+    let timestamp_raw = must_get_query_param(&req, "travel_to")?;
+    let _timestamp = humantime::parse_rfc3339(&timestamp_raw).map_err(|_e| {
+        ApiError::BadRequest(anyhow::anyhow!(
+            "Invalid time for travel_to: {timestamp_raw:?}"
+        ))
+    })?;
+
+    let done_if_after_raw = must_get_query_param(&req, "done_if_after")?;
+    let _done_if_after = humantime::parse_rfc3339(&done_if_after_raw).map_err(|_e| {
+        ApiError::BadRequest(anyhow::anyhow!(
+            "Invalid time for done_if_after: {done_if_after_raw:?}"
+        ))
+    })?;
+
+    service
+        .tenant_time_travel_remote_storage(
+            &time_travel_req,
+            tenant_id,
+            timestamp_raw,
+            done_if_after_raw,
+        )
+        .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn handle_tenant_delete(
    service: Arc<Service>,
    req: Request<Body>,
@@ -196,7 +225,7 @@ async fn handle_tenant_timeline_create(
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let create_req = json_request::<TimelineCreateRequest>(&mut req).await?;
    json_response(
-        StatusCode::OK,
+        StatusCode::CREATED,
        service
            .tenant_timeline_create(tenant_id, create_req)
            .await?,
@@ -296,7 +325,10 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
    }
    let state = get_state(&req);

-    json_response(StatusCode::OK, state.service.node_configure(config_req)?)
+    json_response(
+        StatusCode::OK,
+        state.service.node_configure(config_req).await?,
+    )
 }

 async fn handle_tenant_shard_split(
@@ -474,6 +506,9 @@ pub fn make_router(
        .put("/v1/tenant/:tenant_id/location_config", |r| {
            tenant_service_handler(r, handle_tenant_location_config)
        })
+        .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
+            tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
+        })
        // Timeline operations
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            tenant_service_handler(r, handle_tenant_timeline_delete)
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -10,7 +10,7 @@ use crate::persistence::NodePersistence;
 ///
 /// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the
 /// implementation of serialization on this type is only for debug dumps.
-#[derive(Clone, Serialize, Eq, PartialEq)]
+#[derive(Clone, Serialize)]
 pub(crate) struct Node {
    pub(crate) id: NodeId,

--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -6,7 +6,7 @@ use std::time::Duration;
 use self::split_state::SplitState;
 use camino::Utf8Path;
 use camino::Utf8PathBuf;
-use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+use control_plane::attachment_service::NodeSchedulingPolicy;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
@@ -130,24 +130,10 @@ impl Persistence {
    }

    /// At startup, populate the list of nodes which our shards may be placed on
-    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<Node>> {
-        let nodes: Vec<Node> = self
+    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
+        let nodes: Vec<NodePersistence> = self
            .with_conn(move |conn| -> DatabaseResult<_> {
-                Ok(crate::schema::nodes::table
-                    .load::<NodePersistence>(conn)?
-                    .into_iter()
-                    .map(|n| Node {
-                        id: NodeId(n.node_id as u64),
-                        // At startup we consider a node offline until proven otherwise.
-                        availability: NodeAvailability::Offline,
-                        scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
-                            .expect("Bad scheduling policy in DB"),
-                        listen_http_addr: n.listen_http_addr,
-                        listen_http_port: n.listen_http_port as u16,
-                        listen_pg_addr: n.listen_pg_addr,
-                        listen_pg_port: n.listen_pg_port as u16,
-                    })
-                    .collect::<Vec<Node>>())
+                Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
            })
            .await?;

@@ -156,6 +142,31 @@ impl Persistence {
        Ok(nodes)
    }

+    /// Persist a new scheduling policy for the node identified by `input_node_id`.
+    ///
+    /// Returns `DatabaseError::Logical` unless exactly one row of the `nodes` table
+    /// was updated.
+    pub(crate) async fn update_node(
+        &self,
+        input_node_id: NodeId,
+        input_scheduling: NodeSchedulingPolicy,
+    ) -> DatabaseResult<()> {
+        // The glob import brings the `nodes` table and its columns (`node_id`,
+        // `scheduling_policy`) into scope, which is why the arguments carry an `input_` prefix.
+        use crate::schema::nodes::dsl::*;
+        let updated = self
+            .with_conn(move |conn| {
+                let updated = diesel::update(nodes)
+                    .filter(node_id.eq(input_node_id.0 as i64))
+                    .set((scheduling_policy.eq(String::from(input_scheduling)),))
+                    .execute(conn)?;
+                Ok(updated)
+            })
+            .await?;
+
+        if updated != 1 {
+            // Report the requested ID: a bare `node_id` here would refer to the diesel column
+            // imported above rather than to the node the caller asked for.
+            Err(DatabaseError::Logical(format!(
+                "Node {input_node_id:?} not found for update",
+            )))
+        } else {
+            Ok(())
+        }
+    }
+
    /// At startup, load the high level state for shards, such as their config + policy.  This will
    /// be enriched at runtime with state discovered on pageservers.
    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
@@ -506,7 +517,7 @@ pub(crate) struct TenantShardPersistence {
 }

 /// Parts of [`crate::node::Node`] that are stored durably
-#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable)]
+#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
 #[diesel(table_name = crate::schema::nodes)]
 pub(crate) struct NodePersistence {
    pub(crate) node_id: i64,
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -438,7 +438,7 @@ impl Reconciler {
            match self.observed.locations.get(&node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                    // Nothing to do
-                    tracing::info!("Observed configuration already correct.")
+                    tracing::info!(%node_id, "Observed configuration already correct.")
                }
                _ => {
                    // In all cases other than a matching observed configuration, we will
@@ -449,7 +449,7 @@ impl Reconciler {
                        .increment_generation(self.tenant_shard_id, node_id)
                        .await?;
                    wanted_conf.generation = self.generation.into();
-                    tracing::info!("Observed configuration requires update.");
+                    tracing::info!(%node_id, "Observed configuration requires update.");
                    self.location_config(node_id, wanted_conf, None).await?;
                    self.compute_notify().await?;
                }
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -175,10 +175,34 @@ impl Scheduler {
        }
    }

-    pub(crate) fn schedule_shard(
-        &mut self,
-        hard_exclude: &[NodeId],
-    ) -> Result<NodeId, ScheduleError> {
+    /// Where we have several nodes to choose from, for example when picking a secondary location
+    /// to promote to an attached location, this method may be used to pick the best choice based
+    /// on the scheduler's knowledge of utilization and availability.
+    ///
+    /// If the input is empty, or none of the nodes are eligible for scheduling, return None: the
+    /// caller can pick a node some other way.
+    pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option<NodeId> {
+        if nodes.is_empty() {
+            return None;
+        }
+
+        let node = nodes
+            .iter()
+            .map(|node_id| {
+                let may_schedule = self
+                    .nodes
+                    .get(node_id)
+                    .map(|n| n.may_schedule)
+                    .unwrap_or(false);
+                (*node_id, may_schedule)
+            })
+            .max_by_key(|(_n, may_schedule)| *may_schedule);
+
+        // If even the preferred node has may_schedule==false, return None
+        node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
+    }
+
+    pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
        if self.nodes.is_empty() {
            return Err(ScheduleError::NoPageservers);
        }
@@ -227,44 +251,45 @@ impl Scheduler {
    }
 }

+#[cfg(test)]
+pub(crate) mod test_utils {
+
+    use crate::node::Node;
+    use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+    use std::collections::HashMap;
+    use utils::id::NodeId;
+    /// Test helper: synthesize the requested number of nodes, all in active state.
+    ///
+    /// Node IDs start at one.
+    pub(crate) fn make_test_nodes(n: u64) -> HashMap<NodeId, Node> {
+        (1..n + 1)
+            .map(|i| {
+                (
+                    NodeId(i),
+                    Node {
+                        id: NodeId(i),
+                        availability: NodeAvailability::Active,
+                        scheduling: NodeSchedulingPolicy::Active,
+                        listen_http_addr: format!("httphost-{i}"),
+                        listen_http_port: 80 + i as u16,
+                        listen_pg_addr: format!("pghost-{i}"),
+                        listen_pg_port: 5432 + i as u16,
+                    },
+                )
+            })
+            .collect()
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
-    use std::collections::HashMap;
-
-    use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
    use utils::id::NodeId;

-    use crate::{node::Node, tenant_state::IntentState};
-
+    use crate::tenant_state::IntentState;
    #[test]
    fn scheduler_basic() -> anyhow::Result<()> {
-        let mut nodes = HashMap::new();
-        nodes.insert(
-            NodeId(1),
-            Node {
-                id: NodeId(1),
-                availability: NodeAvailability::Active,
-                scheduling: NodeSchedulingPolicy::Active,
-                listen_http_addr: String::new(),
-                listen_http_port: 0,
-                listen_pg_addr: String::new(),
-                listen_pg_port: 0,
-            },
-        );
-
-        nodes.insert(
-            NodeId(2),
-            Node {
-                id: NodeId(2),
-                availability: NodeAvailability::Active,
-                scheduling: NodeSchedulingPolicy::Active,
-                listen_http_addr: String::new(),
-                listen_http_port: 0,
-                listen_pg_addr: String::new(),
-                listen_pg_port: 0,
-            },
-        );
+        let nodes = test_utils::make_test_nodes(2);

        let mut scheduler = Scheduler::new(nodes.values());
        let mut t1_intent = IntentState::new();
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -1,4 +1,5 @@
 use std::{
+    borrow::Cow,
    cmp::Ordering,
    collections::{BTreeMap, HashMap, HashSet},
    str::FromStr,
@@ -14,18 +15,18 @@ use control_plane::attachment_service::{
    TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
 use diesel::result::DatabaseErrorKind;
-use futures::StreamExt;
+use futures::{stream::FuturesUnordered, StreamExt};
 use hyper::StatusCode;
 use pageserver_api::{
    control_api::{
        ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
        ValidateResponse, ValidateResponseTenant,
    },
-    models,
    models::{
-        LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
-        TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
-        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
+        self, LocationConfig, LocationConfigListResponse, LocationConfigMode, ShardParameters,
+        TenantConfig, TenantCreateRequest, TenantLocationConfigRequest,
+        TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest,
+        TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo,
    },
    shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
 };
@@ -55,6 +56,11 @@ use crate::{
    PlacementPolicy, Sequence,
 };

+// For operations that should be quick, like attaching a new tenant
+const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
+
+// For operations that might be slow, like migrating a tenant with
+// some data in it.
 const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);

 /// How long [`Service::startup_reconcile`] is allowed to take before it should give
@@ -167,84 +173,53 @@ impl Service {
    /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date
    /// view of the world, and determine which pageservers are responsive.
    #[instrument(skip_all)]
-    async fn startup_reconcile(&self) {
+    async fn startup_reconcile(self: &Arc<Service>) {
        // For all tenant shards, a vector of observed states on nodes (where None means
        // indeterminate, same as in [`ObservedStateLocation`])
        let mut observed = HashMap::new();

        let mut nodes_online = HashSet::new();

-        // TODO: issue these requests concurrently
-        {
-            let nodes = {
-                let locked = self.inner.read().unwrap();
-                locked.nodes.clone()
-            };
-            for node in nodes.values() {
-                let http_client = reqwest::ClientBuilder::new()
-                    .timeout(Duration::from_secs(5))
-                    .build()
-                    .expect("Failed to construct HTTP client");
-                let client = mgmt_api::Client::from_client(
-                    http_client,
-                    node.base_url(),
-                    self.config.jwt_token.as_deref(),
-                );
+        // Startup reconciliation does I/O to other services: whether they
+        // are responsive or not, we should aim to finish within our deadline, because:
+        // - If we don't, a k8s readiness hook watching /ready will kill us.
+        // - While we're waiting for startup reconciliation, we are not fully
+        //   available for end user operations like creating/deleting tenants and timelines.
+        //
+        // We set multiple deadlines to break up the time available between the phases of work: this is
+        // arbitrary, but avoids a situation where the first phase could burn our entire timeout period.
+        let start_at = Instant::now();
+        let node_scan_deadline = start_at
+            .checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
+            .expect("Reconcile timeout is a modest constant");

-                fn is_fatal(e: &mgmt_api::Error) -> bool {
-                    use mgmt_api::Error::*;
-                    match e {
-                        ReceiveBody(_) | ReceiveErrorBody(_) => false,
-                        ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
-                        | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
-                        | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
-                        ApiError(_, _) => true,
-                    }
-                }
+        let compute_notify_deadline = start_at
+            .checked_add((STARTUP_RECONCILE_TIMEOUT / 4) * 3)
+            .expect("Reconcile timeout is a modest constant");

-                let list_response = backoff::retry(
-                    || client.list_location_config(),
-                    is_fatal,
-                    1,
-                    5,
-                    "Location config listing",
-                    &self.cancel,
-                )
-                .await;
-                let Some(list_response) = list_response else {
-                    tracing::info!("Shutdown during startup_reconcile");
-                    return;
-                };
+        // Accumulate a list of any tenant locations that ought to be detached
+        let mut cleanup = Vec::new();

-                tracing::info!("Scanning shards on node {}...", node.id);
-                match list_response {
-                    Err(e) => {
-                        tracing::warn!("Could not contact pageserver {} ({e})", node.id);
-                        // TODO: be more tolerant, do some retries, in case
-                        // pageserver is being restarted at the same time as we are
-                    }
-                    Ok(listing) => {
-                        tracing::info!(
-                            "Received {} shard statuses from pageserver {}, setting it to Active",
-                            listing.tenant_shards.len(),
-                            node.id
-                        );
-                        nodes_online.insert(node.id);
+        let node_listings = self.scan_node_locations(node_scan_deadline).await;
+        for (node_id, list_response) in node_listings {
+            let tenant_shards = list_response.tenant_shards;
+            tracing::info!(
+                "Received {} shard statuses from pageserver {}, setting it to Active",
+                tenant_shards.len(),
+                node_id
+            );
+            nodes_online.insert(node_id);

-                        for (tenant_shard_id, conf_opt) in listing.tenant_shards {
-                            observed.insert(tenant_shard_id, (node.id, conf_opt));
-                        }
-                    }
-                }
+            for (tenant_shard_id, conf_opt) in tenant_shards {
+                observed.insert(tenant_shard_id, (node_id, conf_opt));
            }
        }

-        let mut cleanup = Vec::new();
-
+        // List of tenants for which we will attempt to notify compute of their location at startup
        let mut compute_notifications = Vec::new();

        // Populate intent and observed states for all tenants, based on reported state on pageservers
-        let (shard_count, nodes) = {
+        let shard_count = {
            let mut locked = self.inner.write().unwrap();
            let (nodes, tenants, scheduler) = locked.parts_mut();

@@ -288,18 +263,171 @@ impl Service {
                }
            }

-            (tenants.len(), nodes.clone())
+            tenants.len()
        };

        // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
        // generation_pageserver in the database.

-        // Clean up any tenants that were found on pageservers but are not known to us.
+        // Emit compute hook notifications for all tenants which are already stably attached.  Other tenants
+        // will emit compute hook notifications when they reconcile.
+        //
+        // Ordering: we must complete these notification attempts before doing any other reconciliation for the
+        // tenants named here, because otherwise our calls to notify() might race with more recent values
+        // generated by reconciliation.
+        let notify_failures = self
+            .compute_notify_many(compute_notifications, compute_notify_deadline)
+            .await;
+
+        // Compute notify is fallible.  If it fails here, do not delay overall startup: set the
+        // flag on these shards that they have a pending notification.
+        // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later.
+        {
+            let mut locked = self.inner.write().unwrap();
+            for tenant_shard_id in notify_failures.into_iter() {
+                if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) {
+                    shard.pending_compute_notification = true;
+                }
+            }
+        }
+
+        // Finally, now that the service is up and running, launch reconcile operations for any tenants
+        // which require it: under normal circumstances this should only include tenants that were in some
+        // transient state before we restarted, or any tenants whose compute hooks failed above.
+        let reconcile_tasks = self.reconcile_all();
+        // We will not wait for these reconciliation tasks to run here: we're now done with startup and
+        // normal operations may proceed.
+
+        // Clean up any tenants that were found on pageservers but are not known to us.  Do this in the
+        // background because it does not need to complete in order to proceed with other work.
+        if !cleanup.is_empty() {
+            tracing::info!("Cleaning up {} locations in the background", cleanup.len());
+            tokio::task::spawn({
+                let cleanup_self = self.clone();
+                async move { cleanup_self.cleanup_locations(cleanup).await }
+            });
+        }
+
+        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+    }
+
+    /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline.
+    ///
+    /// The result includes only nodes which responded successfully within the deadline: a node whose listing returned an error is logged and omitted, and a `None` result (logged as shutdown) ends the scan early with whatever was collected so far.
+    async fn scan_node_locations(
+        &self,
+        deadline: Instant,
+    ) -> HashMap<NodeId, LocationConfigListResponse> {
+        let nodes = { // clone the node map in a short scope: the read guard is dropped at the closing brace
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };
+
+        let mut node_results = HashMap::new();
+
+        let mut node_list_futs = FuturesUnordered::new(); // one listing future per node, polled together by the loop below
+
+        for node in nodes.values() {
+            node_list_futs.push({
+                async move {
+                    let http_client = reqwest::ClientBuilder::new()
+                        .timeout(Duration::from_secs(5)) // 5 second timeout configured on this node's HTTP client
+                        .build()
+                        .expect("Failed to construct HTTP client");
+                    let client = mgmt_api::Client::from_client(
+                        http_client,
+                        node.base_url(),
+                        self.config.jwt_token.as_deref(),
+                    );
+
+                    fn is_fatal(e: &mgmt_api::Error) -> bool { // false for body-receive errors and 503/504/408 responses, true for any other ApiError
+                        use mgmt_api::Error::*;
+                        match e {
+                            ReceiveBody(_) | ReceiveErrorBody(_) => false,
+                            ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
+                            | ApiError(StatusCode::GATEWAY_TIMEOUT, _)
+                            | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
+                            ApiError(_, _) => true,
+                        }
+                    }
+
+                    tracing::info!("Scanning shards on node {}...", node.id);
+                    let description = format!("List locations on {}", node.id);
+                    let response = backoff::retry(
+                        || client.list_location_config(),
+                        is_fatal,
+                        1, // NOTE(review): this and the next argument are numeric retry parameters; confirm their meaning against backoff::retry
+                        5,
+                        &description,
+                        &self.cancel,
+                    )
+                    .await;
+
+                    (node.id, response) // pair the result with the node id so the collector knows which node answered
+                }
+            });
+        }
+
+        loop {
+            let (node_id, result) = tokio::select! {
+                next = node_list_futs.next() => {
+                    match next {
+                        Some(result) => result,
+                        None =>{
+                            // We got results for all our nodes: the set of futures is exhausted
+                            break;
+                        }

+                    }
+                },
+                _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
+                    // Give up waiting for anyone who hasn't responded: we will yield the results that we have (the sleep is rebuilt each iteration from the time remaining until `deadline`)
+                    tracing::info!("Reached deadline while waiting for nodes to respond to location listing requests");
+                    break;
+                }
+            };
+
+            let Some(list_response) = result else { // NOTE(review): None presumably means backoff::retry stopped because self.cancel fired; confirm
+                tracing::info!("Shutdown during startup_reconcile");
+                break;
+            };
+
+            match list_response {
+                Err(e) => {
+                    tracing::warn!("Could not scan node {} ({e})", node_id); // the node is left out of the returned map
+                }
+                Ok(listing) => {
+                    node_results.insert(node_id, listing);
+                }
+            }
+        }
+
+        node_results
+    }
+
+    /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers.
+    ///
+    /// This is safe to run in the background, because if we don't have this TenantShardId in our map of
+    /// tenants, then it is probably something incompletely deleted before: we will not fight with any
+    /// other task trying to attach it.
+    #[instrument(skip_all)]
+    async fn cleanup_locations(&self, cleanup: Vec<(TenantShardId, NodeId)>) {
+        let nodes = self.inner.read().unwrap().nodes.clone();
+
        for (tenant_shard_id, node_id) in cleanup {
            // A node reported a tenant_shard_id which is unknown to us: detach it.
-            let node = nodes
-                .get(&node_id)
-                .expect("Always exists: only known nodes are scanned");
+            let Some(node) = nodes.get(&node_id) else {
+                // This is legitimate; we run in the background and [`Self::startup_reconcile`] might have identified
+                // a location to clean up on a node that has since been removed.
+                tracing::info!(
+                    "Not cleaning up location {node_id}/{tenant_shard_id}: node not found"
+                );
+                continue;
+            };
+
+            if self.cancel.is_cancelled() {
+                break;
+            }

            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
            match client
@@ -332,58 +460,71 @@ impl Service {
                }
            }
        }
+    }

-        // Emit compute hook notifications for all tenants which are already stably attached.  Other tenants
-        // will emit compute hook notifications when they reconcile.
-        //
-        // Ordering: we must complete these notification attempts before doing any other reconciliation for the
-        // tenants named here, because otherwise our calls to notify() might race with more recent values
-        // generated by reconciliation.
-
-        // Compute notify is fallible.  If it fails here, do not delay overall startup: set the
-        // flag on these shards that they have a pending notification.
+    /// Used during [`Self::startup_reconcile`]: issue many concurrent compute notifications.
+    ///
+    /// Returns a set of any shards for which notifications were not acked within the deadline.
+    async fn compute_notify_many(
+        &self,
+        notifications: Vec<(TenantShardId, NodeId)>,
+        deadline: Instant,
+    ) -> HashSet<TenantShardId> {
        let compute_hook = self.inner.read().unwrap().compute_hook.clone();

+        let attempt_shards = notifications.iter().map(|i| i.0).collect::<HashSet<_>>();
+        let mut success_shards = HashSet::new();
+
        // Construct an async stream of futures to invoke the compute notify function: we do this
        // in order to subsequently use .buffered() on the stream to execute with bounded parallelism.
-        let stream = futures::stream::iter(compute_notifications.into_iter())
+        let mut stream = futures::stream::iter(notifications.into_iter())
            .map(|(tenant_shard_id, node_id)| {
                let compute_hook = compute_hook.clone();
                let cancel = self.cancel.clone();
                async move {
                    if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
                        tracing::error!(
-                            tenant_shard_id=%tenant_shard_id,
-                            node_id=%node_id,
+                            %tenant_shard_id,
+                            %node_id,
                            "Failed to notify compute on startup for shard: {e}"
                        );
-                        Some(tenant_shard_id)
-                    } else {
                        None
+                    } else {
+                        Some(tenant_shard_id)
                    }
                }
            })
            .buffered(compute_hook::API_CONCURRENCY);
-        let notify_results = stream.collect::<Vec<_>>().await;

-        // Update tenant state for any that failed to do their initial compute notify, so that they'll retry later.
-        {
-            let mut locked = self.inner.write().unwrap();
-            for tenant_shard_id in notify_results.into_iter().flatten() {
-                if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) {
-                    shard.pending_compute_notification = true;
+        loop {
+            tokio::select! {
+                next = stream.next() => {
+                    match next {
+                        Some(Some(success_shard)) => {
+                            // A notification succeeded
+                            success_shards.insert(success_shard);
+                            },
+                        Some(None) => {
+                            // A notification that failed
+                        },
+                        None => {
+                            tracing::info!("Successfully sent all compute notifications");
+                            break;
+                        }
+                    }
+                },
+                _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
+                    // Give up sending any that didn't succeed yet
+                    tracing::info!("Reached deadline while sending compute notifications");
+                    break;
                }
-            }
+            };
        }

-        // Finally, now that the service is up and running, launch reconcile operations for any tenants
-        // which require it: under normal circumstances this should only include tenants that were in some
-        // transient state before we restarted, or any tenants whose compute hooks failed above.
-        let reconcile_tasks = self.reconcile_all();
-        // We will not wait for these reconciliation tasks to run here: we're now done with startup and
-        // normal operations may proceed.
-
-        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
+        attempt_shards
+            .difference(&success_shards)
+            .cloned()
+            .collect()
    }

    /// Long running background task that periodically wakes up and looks for shards that need
@@ -481,7 +622,22 @@ impl Service {
        let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();

        tracing::info!("Loading nodes from database...");
-        let nodes = persistence.list_nodes().await?;
+        let nodes = persistence
+            .list_nodes()
+            .await?
+            .into_iter()
+            .map(|n| Node {
+                id: NodeId(n.node_id as u64),
+                // At startup we consider a node offline until proven otherwise.
+                availability: NodeAvailability::Offline,
+                scheduling: NodeSchedulingPolicy::from_str(&n.scheduling_policy)
+                    .expect("Bad scheduling policy in DB"),
+                listen_http_addr: n.listen_http_addr,
+                listen_http_port: n.listen_http_port as u16,
+                listen_pg_addr: n.listen_pg_addr,
+                listen_pg_port: n.listen_pg_port as u16,
+            })
+            .collect::<Vec<_>>();
        let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.id, n)).collect();
        tracing::info!("Loaded {} nodes from database.", nodes.len());

@@ -773,7 +929,16 @@ impl Service {
    pub(crate) async fn re_attach(
        &self,
        reattach_req: ReAttachRequest,
-    ) -> anyhow::Result<ReAttachResponse> {
+    ) -> Result<ReAttachResponse, ApiError> {
+        // Take a re-attach as indication that the node is available: this is a precursor to proper
+        // heartbeating in https://github.com/neondatabase/neon/issues/6844
+        self.node_configure(NodeConfigureRequest {
+            node_id: reattach_req.node_id,
+            availability: Some(NodeAvailability::Active),
+            scheduling: None,
+        })
+        .await?;
+
        // Ordering: we must persist generation number updates before making them visible in the in-memory state
        let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?;

@@ -864,6 +1029,16 @@ impl Service {
        &self,
        create_req: TenantCreateRequest,
    ) -> Result<TenantCreateResponse, ApiError> {
+        let (response, waiters) = self.do_tenant_create(create_req).await?;
+
+        self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
+        Ok(response)
+    }
+
+    pub(crate) async fn do_tenant_create(
+        &self,
+        create_req: TenantCreateRequest,
+    ) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
        // This service expects to handle sharding itself: it is an error to try and directly create
        // a particular shard here.
        let tenant_id = if !create_req.new_tenant_id.is_unsharded() {
@@ -1013,11 +1188,12 @@ impl Service {
            (waiters, response_shards)
        };

-        self.await_waiters(waiters).await?;
-
-        Ok(TenantCreateResponse {
-            shards: response_shards,
-        })
+        Ok((
+            TenantCreateResponse {
+                shards: response_shards,
+            },
+            waiters,
+        ))
    }

    /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
@@ -1025,8 +1201,9 @@ impl Service {
    async fn await_waiters(
        &self,
        waiters: Vec<ReconcilerWaiter>,
+        timeout: Duration,
    ) -> Result<(), ReconcileWaitError> {
-        let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap();
+        let deadline = Instant::now().checked_add(timeout).unwrap();
        for waiter in waiters {
            let timeout = deadline.duration_since(Instant::now());
            waiter.wait_timeout(timeout).await?;
@@ -1164,12 +1341,8 @@ impl Service {
            }
        };

-        // TODO: if we timeout/fail on reconcile, we should still succeed this request,
-        // because otherwise a broken compute hook causes a feedback loop where
-        // location_config returns 500 and gets retried forever.
-
-        if let Some(create_req) = maybe_create {
-            let create_resp = self.tenant_create(create_req).await?;
+        let waiters = if let Some(create_req) = maybe_create {
+            let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
            result.shards = create_resp
                .shards
                .into_iter()
@@ -1178,20 +1351,115 @@ impl Service {
                    shard_id: s.shard_id,
                })
                .collect();
+            waiters
        } else {
-            // This was an update, wait for reconciliation
-            if let Err(e) = self.await_waiters(waiters).await {
-                // Do not treat a reconcile error as fatal: we have already applied any requested
-                // Intent changes, and the reconcile can fail for external reasons like unavailable
-                // compute notification API.  In these cases, it is important that we do not
-                // cause the cloud control plane to retry forever on this API.
-                tracing::warn!(
-                    "Failed to reconcile after /location_config: {e}, returning success anyway"
-                );
+            waiters
+        };
+
+        if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
+            // Do not treat a reconcile error as fatal: we have already applied any requested
+            // Intent changes, and the reconcile can fail for external reasons like unavailable
+            // compute notification API.  In these cases, it is important that we do not
+            // cause the cloud control plane to retry forever on this API.
+            tracing::warn!(
+                "Failed to reconcile after /location_config: {e}, returning success anyway"
+            );
+        }
+
+        // Logging the full result is useful because it lets us cross-check what the cloud control
+        // plane's tenant_shards table should contain.
+        tracing::info!("Complete, returning {result:?}");
+
+        Ok(result)
+    }
+
+    pub(crate) async fn tenant_time_travel_remote_storage(
+        &self,
+        time_travel_req: &TenantTimeTravelRequest,
+        tenant_id: TenantId,
+        timestamp: Cow<'_, str>,
+        done_if_after: Cow<'_, str>,
+    ) -> Result<(), ApiError> {
+        let node = { // precondition checks and node selection happen under one read lock; only a clone of the node leaves this scope
+            let locked = self.inner.read().unwrap();
+            // Just a sanity check to prevent misuse: the API expects that the tenant is fully
+            // detached everywhere, and nothing writes to S3 storage. Here, we verify that,
+            // but only at the start of the process, so it's really just to prevent operator
+            // mistakes.
+            for (shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
+                if shard.intent.get_attached().is_some() || !shard.intent.get_secondary().is_empty() // intent ("what we want") still has an attached or secondary location
+                {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                        "We want tenant to be attached in shard with tenant_shard_id={shard_id}"
+                    )));
+                }
+                let maybe_attached = shard // first observed location with a known config whose mode is not Detached
+                    .observed
+                    .locations
+                    .iter()
+                    .filter_map(|(node_id, observed_location)| {
+                        observed_location
+                            .conf
+                            .as_ref() // locations whose conf is None are skipped by the filter_map
+                            .map(|loc| (node_id, observed_location, loc.mode))
+                    })
+                    .find(|(_, _, mode)| *mode != LocationConfigMode::Detached);
+                if let Some((node_id, _observed_location, mode)) = maybe_attached {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}")));
+                }
+            }
+            let scheduler = &locked.scheduler;
+            // Right now we only perform the operation on a single node without parallelization
+            // TODO fan out the operation to multiple nodes for better performance
+            let node_id = scheduler.schedule_shard(&[])?; // NOTE(review): the empty slice presumably means "no nodes to avoid"; confirm against Scheduler::schedule_shard
+            let node = locked
+                .nodes
+                .get(&node_id)
+                .expect("Pageservers may not be deleted while lock is active");
+            node.clone()
+        };
+
+        // The shard count is encoded in the remote storage's URL, so we need to handle all historically used shard counts
+        let mut counts = time_travel_req
+            .shard_counts
+            .iter()
+            .copied()
+            .collect::<HashSet<_>>() // round-trip through a HashSet to drop duplicate shard counts
+            .into_iter()
+            .collect::<Vec<_>>();
+        counts.sort_unstable(); // HashSet iteration order is unspecified, so sort to process counts in a stable order
+
+        for count in counts {
+            let shard_ids = (0..count.count()) // one TenantShardId per shard number under this shard count
+                .map(|i| TenantShardId {
+                    tenant_id,
+                    shard_number: ShardNumber(i),
+                    shard_count: count,
+                })
+                .collect::<Vec<_>>();
+            for tenant_shard_id in shard_ids {
+                let client =
+                    mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+
+                tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);
+
+                client
+                        .tenant_time_travel_remote_storage(
+                            tenant_shard_id,
+                            &timestamp,
+                            &done_if_after,
+                        )
+                        .await
+                        .map_err(|e| {
+                            ApiError::InternalServerError(anyhow::anyhow!(
+                                "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}",
+                                node.id
+                            ))
+                        })?; // requests are issued one at a time; the first failure returns early and skips the remaining shards
             }
         }

-        Ok(result)
+        Ok(())
     }

    pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
@@ -1287,8 +1555,6 @@ impl Service {
        tenant_id: TenantId,
        mut create_req: TimelineCreateRequest,
    ) -> Result<TimelineInfo, ApiError> {
-        let mut timeline_info = None;
-
        tracing::info!(
            "Creating timeline {}/{}",
            tenant_id,
@@ -1299,7 +1565,7 @@ impl Service {

        // TODO: refuse to do this if shard splitting is in progress
        // (https://github.com/neondatabase/neon/issues/6676)
-        let targets = {
+        let mut targets = {
            let locked = self.inner.read().unwrap();
            let mut targets = Vec::new();

@@ -1323,21 +1589,24 @@ impl Service {
            return Err(ApiError::NotFound(
                anyhow::anyhow!("Tenant not found").into(),
            ));
-        }
-
-        for (tenant_shard_id, node) in targets {
-            // TODO: issue shard timeline creates in parallel, once the 0th is done.
-
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
+        };
+        let shard_zero = targets.remove(0);

+        async fn create_one(
+            tenant_shard_id: TenantShardId,
+            node: Node,
+            jwt: Option<String>,
+            create_req: TimelineCreateRequest,
+        ) -> Result<TimelineInfo, ApiError> {
            tracing::info!(
                "Creating timeline on shard {}/{}, attached to node {}",
                tenant_shard_id,
                create_req.new_timeline_id,
                node.id
            );
+            let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());

-            let shard_timeline_info = client
+            client
                .timeline_create(tenant_shard_id, &create_req)
                .await
                .map_err(|e| match e {
@@ -1350,23 +1619,66 @@ impl Service {
                        ApiError::InternalServerError(anyhow::anyhow!(msg))
                    }
                    _ => ApiError::Conflict(format!("Failed to create timeline: {e}")),
-                })?;
-
-            if timeline_info.is_none() {
-                // If the caller specified an ancestor but no ancestor LSN, we are responsible for
-                // propagating the LSN chosen by the first shard to the other shards: it is important
-                // that all shards end up with the same ancestor_start_lsn.
-                if create_req.ancestor_timeline_id.is_some()
-                    && create_req.ancestor_start_lsn.is_none()
-                {
-                    create_req.ancestor_start_lsn = shard_timeline_info.ancestor_lsn;
-                }
-
-                // We will return the TimelineInfo from the first shard
-                timeline_info = Some(shard_timeline_info);
-            }
+                })
        }
-        Ok(timeline_info.expect("targets cannot be empty"))
+
+        // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then
+        // use whatever LSN that shard picked when creating on subsequent shards.  We arbitrarily use shard zero as the shard
+        // that will get the first creation request, and propagate the LSN to all the >0 shards.
+        let timeline_info = create_one(
+            shard_zero.0,
+            shard_zero.1,
+            self.config.jwt_token.clone(),
+            create_req.clone(),
+        )
+        .await?;
+
+        // Propagate the LSN that shard zero picked, if caller didn't provide one
+        if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() {
+            create_req.ancestor_start_lsn = timeline_info.ancestor_lsn;
+        }
+
+        // Create timeline on remaining shards with number >0
+        if !targets.is_empty() {
+            // If we had multiple shards, issue requests for the remainder now.
+            let jwt = self.config.jwt_token.clone();
+            self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
+                let create_req = create_req.clone();
+                Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req))
+            })
+            .await?;
+        }
+
+        Ok(timeline_info)
+    }
+
+    /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation.
+    ///
+    /// On success, the returned vector has one element per entry in `locations`, in completion order (not input order).
+    async fn tenant_for_shards<F, R>(
+        &self,
+        locations: Vec<(TenantShardId, Node)>,
+        mut req_fn: F,
+    ) -> Result<Vec<R>, ApiError>
+    where
+        F: FnMut(
+            TenantShardId,
+            Node,
+        )
+            -> std::pin::Pin<Box<dyn futures::Future<Output = Result<R, ApiError>> + Send>>,
+    {
+        let mut futs = FuturesUnordered::new();
+        let mut results = Vec::with_capacity(locations.len());
+
+        for (tenant_shard_id, node) in locations {
+            futs.push(req_fn(tenant_shard_id, node));
+        }
+
+        while let Some(r) = futs.next().await {
+            results.push(r?);
+        }
+
+        Ok(results)
    }

    pub(crate) async fn tenant_timeline_delete(
@@ -1380,7 +1692,7 @@ impl Service {

        // TODO: refuse to do this if shard splitting is in progress
        // (https://github.com/neondatabase/neon/issues/6676)
-        let targets = {
+        let mut targets = {
            let locked = self.inner.read().unwrap();
            let mut targets = Vec::new();

@@ -1405,12 +1717,14 @@ impl Service {
                anyhow::anyhow!("Tenant not found").into(),
            ));
        }
+        let shard_zero = targets.remove(0);

-        // TODO: call into shards concurrently
-        let mut any_pending = false;
-        for (tenant_shard_id, node) in targets {
-            let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
-
+        async fn delete_one(
+            tenant_shard_id: TenantShardId,
+            timeline_id: TimelineId,
+            node: Node,
+            jwt: Option<String>,
+        ) -> Result<StatusCode, ApiError> {
            tracing::info!(
                "Deleting timeline on shard {}/{}, attached to node {}",
                tenant_shard_id,
@@ -1418,7 +1732,8 @@ impl Service {
                node.id
            );

-            let status = client
+            let client = mgmt_api::Client::new(node.base_url(), jwt.as_deref());
+            client
                .timeline_delete(tenant_shard_id, timeline_id)
                .await
                .map_err(|e| {
@@ -1426,18 +1741,36 @@ impl Service {
                    "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}",
                    node.id
                ))
-                })?;
-
-            if status == StatusCode::ACCEPTED {
-                any_pending = true;
-            }
+                })
        }

-        if any_pending {
-            Ok(StatusCode::ACCEPTED)
-        } else {
-            Ok(StatusCode::NOT_FOUND)
+        let statuses = self
+            .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
+                Box::pin(delete_one(
+                    tenant_shard_id,
+                    timeline_id,
+                    node,
+                    self.config.jwt_token.clone(),
+                ))
+            })
+            .await?;
+
+        // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero
+        if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) {
+            return Ok(StatusCode::ACCEPTED);
        }
+
+        // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed
+        // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done.
+        let shard_zero_status = delete_one(
+            shard_zero.0,
+            timeline_id,
+            shard_zero.1,
+            self.config.jwt_token.clone(),
+        )
+        .await?;
+
+        Ok(shard_zero_status)
    }

    /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
@@ -2009,7 +2342,11 @@ impl Service {
                .context("Scheduler checks")
                .map_err(ApiError::InternalServerError)?;

-            let expect_nodes = locked.nodes.values().cloned().collect::<Vec<_>>();
+            let expect_nodes = locked
+                .nodes
+                .values()
+                .map(|n| n.to_persistent())
+                .collect::<Vec<_>>();

            let expect_shards = locked
                .tenants
@@ -2021,8 +2358,8 @@ impl Service {
        };

        let mut nodes = self.persistence.list_nodes().await?;
-        expect_nodes.sort_by_key(|n| n.id);
-        nodes.sort_by_key(|n| n.id);
+        expect_nodes.sort_by_key(|n| n.node_id);
+        nodes.sort_by_key(|n| n.node_id);

        if nodes != expect_nodes {
            tracing::error!("Consistency check failed on nodes.");
@@ -2036,6 +2373,9 @@ impl Service {
                serde_json::to_string(&nodes)
                    .map_err(|e| ApiError::InternalServerError(e.into()))?
            );
+            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Node consistency failure"
+            )));
        }

        let mut shards = self.persistence.list_tenant_shards().await?;
@@ -2046,14 +2386,17 @@ impl Service {
            tracing::error!("Consistency check failed on shards.");
            tracing::error!(
                "Shards in memory: {}",
-                serde_json::to_string(&expect_nodes)
+                serde_json::to_string(&expect_shards)
                    .map_err(|e| ApiError::InternalServerError(e.into()))?
            );
            tracing::error!(
                "Shards in database: {}",
-                serde_json::to_string(&nodes)
+                serde_json::to_string(&shards)
                    .map_err(|e| ApiError::InternalServerError(e.into()))?
            );
+            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "Shard consistency failure"
+            )));
        }

        Ok(())
@@ -2179,7 +2522,18 @@ impl Service {
        Ok(())
    }

-    pub(crate) fn node_configure(&self, config_req: NodeConfigureRequest) -> Result<(), ApiError> {
+    pub(crate) async fn node_configure(
+        &self,
+        config_req: NodeConfigureRequest,
+    ) -> Result<(), ApiError> {
+        if let Some(scheduling) = config_req.scheduling {
+            // Scheduling is a persistent part of Node: we must write updates to the database before
+            // applying them in memory
+            self.persistence
+                .update_node(config_req.node_id, scheduling)
+                .await?;
+        }
+
        let mut locked = self.inner.write().unwrap();
        let result_tx = locked.result_tx.clone();
        let compute_hook = locked.compute_hook.clone();
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -143,6 +143,23 @@ impl IntentState {
        }
    }

+    /// Like set_attached, but the node is from [`Self::secondary`].  This swaps the node from
+    /// secondary to attached while maintaining the scheduler's reference counts.
+    pub(crate) fn promote_attached(
+        &mut self,
+        _scheduler: &mut Scheduler,
+        promote_secondary: NodeId,
+    ) {
+        // If we call this with a node that isn't in secondary, it would cause incorrect
+        // scheduler reference counting, since we assume the node is already referenced as a secondary.
+        debug_assert!(self.secondary.contains(&promote_secondary));
+
+        // TODO: when scheduler starts tracking attached + secondary counts separately, we will
+        // need to call into it here.
+        self.secondary.retain(|n| n != &promote_secondary);
+        self.attached = Some(promote_secondary);
+    }
+
    pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) {
        debug_assert!(!self.secondary.contains(&new_secondary));
        scheduler.node_inc_ref(new_secondary);
@@ -197,6 +214,8 @@ impl IntentState {
    /// Returns true if a change was made
    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
        if self.attached == Some(node_id) {
+            // TODO: when scheduler starts tracking attached + secondary counts separately, we will
+            // need to call into it here.
            self.attached = None;
            self.secondary.push(node_id);
            true
@@ -370,6 +389,9 @@ impl TenantState {
        // All remaining observed locations generate secondary intents.  This includes None
        // observations, as these may well have some local content on disk that is usable (this
        // is an edge case that might occur if we restarted during a migration or other change)
+        //
+        // We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`]
+        // will take care of promoting one of these secondaries to be attached.
        self.observed.locations.keys().for_each(|node_id| {
            if Some(*node_id) != self.intent.attached {
                self.intent.secondary.push(*node_id);
@@ -377,6 +399,33 @@ impl TenantState {
        });
    }

+    /// Part of [`Self::schedule`] that is used to choose exactly one node to act as the
+    /// attached pageserver for a shard.
+    ///
+    /// Returns whether we modified it, and the NodeId selected.
+    fn schedule_attached(
+        &mut self,
+        scheduler: &mut Scheduler,
+    ) -> Result<(bool, NodeId), ScheduleError> {
+        // No work to do if we already have an attached node
+        if let Some(node_id) = self.intent.attached {
+            return Ok((false, node_id));
+        }
+
+        if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) {
+            // Promote a secondary
+            tracing::debug!("Promoted secondary {} to attached", promote_secondary);
+            self.intent.promote_attached(scheduler, promote_secondary);
+            Ok((true, promote_secondary))
+        } else {
+            // Pick a fresh node: either we had no secondaries or none were schedulable
+            let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
+            tracing::debug!("Selected {} as attached", node_id);
+            self.intent.set_attached(scheduler, Some(node_id));
+            Ok((true, node_id))
+        }
+    }
+
    pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
        // TODO: before scheduling new nodes, check if any existing content in
        // self.intent refers to pageservers that are offline, and pick other
@@ -387,19 +436,15 @@ impl TenantState {

        // Build the set of pageservers already in use by this tenant, to avoid scheduling
        // more work on the same pageservers we're already using.
-        let mut used_pageservers = self.intent.all_pageservers();
        let mut modified = false;

        use PlacementPolicy::*;
        match self.policy {
            Single => {
                // Should have exactly one attached, and zero secondaries
-                if self.intent.attached.is_none() {
-                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
-                    self.intent.set_attached(scheduler, Some(node_id));
-                    used_pageservers.push(node_id);
-                    modified = true;
-                }
+                let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
+                modified |= modified_attached;
+
                if !self.intent.secondary.is_empty() {
                    self.intent.clear_secondary(scheduler);
                    modified = true;
@@ -407,13 +452,10 @@ impl TenantState {
            }
            Double(secondary_count) => {
                // Should have exactly one attached, and N secondaries
-                if self.intent.attached.is_none() {
-                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
-                    self.intent.set_attached(scheduler, Some(node_id));
-                    used_pageservers.push(node_id);
-                    modified = true;
-                }
+                let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
+                modified |= modified_attached;

+                let mut used_pageservers = self.intent.all_pageservers(); // attached + secondaries
                while self.intent.secondary.len() < secondary_count {
                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
                    self.intent.push_secondary(scheduler, node_id);
@@ -495,6 +537,13 @@ impl TenantState {
            }
        }

+        for node_id in self.observed.locations.keys() {
+            if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) {
+                // We have observed state that isn't part of our intent: need to clean it up.
+                return true;
+            }
+        }
+
        // Even if there is no pageserver work to be done, if we have a pending notification to computes,
        // wake up a reconciler to send it.
        if self.pending_compute_notification {
@@ -688,10 +737,95 @@ impl TenantState {
            shard_count: self.tenant_shard_id.shard_count.literal() as i32,
            shard_stripe_size: self.shard.stripe_size.0 as i32,
            generation: self.generation.into().unwrap_or(0) as i32,
-            generation_pageserver: i64::MAX,
+            generation_pageserver: self
+                .intent
+                .get_attached()
+                .map(|n| n.0 as i64)
+                .unwrap_or(i64::MAX),
+
            placement_policy: serde_json::to_string(&self.policy).unwrap(),
            config: serde_json::to_string(&self.config).unwrap(),
            splitting: SplitState::default(),
        }
    }
 }
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use utils::id::TenantId;
+
+    use crate::scheduler::test_utils::make_test_nodes;
+
+    use super::*;
+
+    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
+        let tenant_id = TenantId::generate();
+        let shard_number = ShardNumber(0);
+        let shard_count = ShardCount::new(1);
+
+        let tenant_shard_id = TenantShardId {
+            tenant_id,
+            shard_number,
+            shard_count,
+        };
+        TenantState::new(
+            tenant_shard_id,
+            ShardIdentity::new(
+                shard_number,
+                shard_count,
+                pageserver_api::shard::ShardStripeSize(32768),
+            )
+            .unwrap(),
+            policy,
+        )
+    }
+
+    /// Test the scheduling behaviors used when a tenant configured for HA is subject
+    /// to nodes being marked offline.
+    #[test]
+    fn tenant_ha_scheduling() -> anyhow::Result<()> {
+        // Start with three nodes.  Our tenant will only use two.  The third one is
+        // expected to remain unused.
+        let mut nodes = make_test_nodes(3);
+
+        let mut scheduler = Scheduler::new(nodes.values());
+
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
+        tenant_state
+            .schedule(&mut scheduler)
+            .expect("we have enough nodes, scheduling should work");
+
+        // Expect to initially be scheduled onto two different nodes
+        assert_eq!(tenant_state.intent.secondary.len(), 1);
+        assert!(tenant_state.intent.attached.is_some());
+
+        let attached_node_id = tenant_state.intent.attached.unwrap();
+        let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
+        assert_ne!(attached_node_id, secondary_node_id);
+
+        // Notifying the attached node is offline should demote it to a secondary
+        let changed = tenant_state.intent.notify_offline(attached_node_id);
+        assert!(changed);
+
+        // Update the scheduler state to indicate the node is offline
+        nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
+        scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
+
+        // Re-scheduling the shard should promote the still-available secondary node to attached
+        tenant_state
+            .schedule(&mut scheduler)
+            .expect("active nodes are available");
+        assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
+
+        // The original attached node should have been retained as a secondary
+        assert_eq!(
+            *tenant_state.intent.secondary.iter().last().unwrap(),
+            attached_node_id
+        );
+
+        tenant_state.intent.clear(&mut scheduler);
+
+        Ok(())
+    }
+}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -652,6 +652,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
            let name = import_match
                .get_one::<String>("node-name")
                .ok_or_else(|| anyhow!("No node name provided"))?;
+            let update_catalog = import_match
+                .get_one::<bool>("update-catalog")
+                .cloned()
+                .unwrap_or_default();

            // Parse base inputs
            let base_tarfile = import_match
@@ -694,6 +698,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                None,
                pg_version,
                ComputeMode::Primary,
+                !update_catalog,
            )?;
            println!("Done");
        }
@@ -831,6 +836,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                .get_one::<String>("endpoint_id")
                .map(String::to_string)
                .unwrap_or_else(|| format!("ep-{branch_name}"));
+            let update_catalog = sub_args
+                .get_one::<bool>("update-catalog")
+                .cloned()
+                .unwrap_or_default();

            let lsn = sub_args
                .get_one::<String>("lsn")
@@ -880,6 +889,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                http_port,
                pg_version,
                mode,
+                !update_catalog,
            )?;
        }
        "start" => {
@@ -918,6 +928,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                .get(endpoint_id.as_str())
                .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;

+            let create_test_user = sub_args
+                .get_one::<bool>("create-test-user")
+                .cloned()
+                .unwrap_or_default();
+
            cplane.check_conflicting_endpoints(
                endpoint.mode,
                endpoint.tenant_id,
@@ -972,6 +987,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                    pageservers,
                    remote_ext_config,
                    stripe_size.0 as usize,
+                    create_test_user,
                )
                .await?;
        }
@@ -1457,6 +1473,18 @@ fn cli() -> Command {
        .required(false)
        .default_value("1");

+    let update_catalog = Arg::new("update-catalog")
+        .value_parser(value_parser!(bool))
+        .long("update-catalog")
+        .help("If set, will set up the catalog for neon_superuser")
+        .required(false);
+
+    let create_test_user = Arg::new("create-test-user")
+        .value_parser(value_parser!(bool))
+        .long("create-test-user")
+        .help("If set, will create test user `test` and `neondb` database. Requires `update-catalog = true`")
+        .required(false);
+
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1517,6 +1545,7 @@ fn cli() -> Command {
                .arg(Arg::new("end-lsn").long("end-lsn")
                    .help("Lsn the basebackup ends at"))
                .arg(pg_version_arg.clone())
+                .arg(update_catalog.clone())
            )
        ).subcommand(
            Command::new("tenant")
@@ -1630,6 +1659,7 @@ fn cli() -> Command {
                            .required(false))
                    .arg(pg_version_arg.clone())
                    .arg(hot_standby_arg.clone())
+                    .arg(update_catalog)
                )
                .subcommand(Command::new("start")
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1637,6 +1667,7 @@ fn cli() -> Command {
                    .arg(endpoint_pageserver_id_arg.clone())
                    .arg(safekeepers_arg)
                    .arg(remote_ext_config_args)
+                    .arg(create_test_user)
                )
                .subcommand(Command::new("reconfigure")
                            .about("Reconfigure the endpoint")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -41,11 +41,15 @@ use std::net::SocketAddr;
 use std::net::TcpStream;
 use std::path::PathBuf;
 use std::process::Command;
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
+use compute_api::spec::Database;
+use compute_api::spec::PgIdent;
 use compute_api::spec::RemoteExtSpec;
+use compute_api::spec::Role;
 use nix::sys::signal::kill;
 use nix::sys::signal::Signal;
 use serde::{Deserialize, Serialize};
@@ -122,6 +126,7 @@ impl ComputeControlPlane {
        http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
+        skip_pg_catalog_updates: bool,
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
@@ -140,7 +145,7 @@ impl ComputeControlPlane {
            // before and after start are the same. So, skip catalog updates,
            // with this we basically test a case of waking up an idle compute, where
            // we also skip catalog updates in the cloud.
-            skip_pg_catalog_updates: true,
+            skip_pg_catalog_updates,
            features: vec![],
        });

@@ -155,7 +160,7 @@ impl ComputeControlPlane {
                http_port,
                pg_port,
                pg_version,
-                skip_pg_catalog_updates: true,
+                skip_pg_catalog_updates,
                features: vec![],
            })?,
        )?;
@@ -500,6 +505,7 @@ impl Endpoint {
        pageservers: Vec<(Host, u16)>,
        remote_ext_config: Option<&String>,
        shard_stripe_size: usize,
+        create_test_user: bool,
    ) -> Result<()> {
        if self.status() == EndpointStatus::Running {
            anyhow::bail!("The endpoint is already running");
@@ -551,8 +557,26 @@ impl Endpoint {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
                state: None,
-                roles: vec![],
-                databases: vec![],
+                roles: if create_test_user {
+                    vec![Role {
+                        name: PgIdent::from_str("test").unwrap(),
+                        encrypted_password: None,
+                        options: None,
+                    }]
+                } else {
+                    Vec::new()
+                },
+                databases: if create_test_user {
+                    vec![Database {
+                        name: PgIdent::from_str("neondb").unwrap(),
+                        owner: PgIdent::from_str("test").unwrap(),
+                        options: None,
+                        restrict_conn: false,
+                        invalid: false,
+                    }]
+                } else {
+                    Vec::new()
+                },
                settings: None,
                postgresql_conf: Some(postgresql_conf),
            },
@@ -566,6 +590,7 @@ impl Endpoint {
            remote_extensions,
            pgbouncer_settings: None,
            shard_stripe_size: Some(shard_stripe_size),
+            primary_is_running: None,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -577,11 +602,16 @@ impl Endpoint {
            .open(self.endpoint_path().join("compute.log"))?;

        // Launch compute_ctl
-        println!("Starting postgres node at '{}'", self.connstr());
+        let conn_str = self.connstr("cloud_admin", "postgres");
+        println!("Starting postgres node at '{}'", conn_str);
+        if create_test_user {
+            let conn_str = self.connstr("test", "neondb"); // role created in the spec
+            println!("Also at '{}'", conn_str);
+        }
        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
        cmd.args(["--http-port", &self.http_address.port().to_string()])
            .args(["--pgdata", self.pgdata().to_str().unwrap()])
-            .args(["--connstr", &self.connstr()])
+            .args(["--connstr", &conn_str])
            .args([
                "--spec-path",
                self.endpoint_path().join("spec.json").to_str().unwrap(),
@@ -652,7 +682,9 @@ impl Endpoint {
                        }
                        ComputeStatus::Empty
                        | ComputeStatus::ConfigurationPending
-                        | ComputeStatus::Configuration => {
+                        | ComputeStatus::Configuration
+                        | ComputeStatus::TerminationPending
+                        | ComputeStatus::Terminated => {
                            bail!("unexpected compute status: {:?}", state.status)
                        }
                    }
@@ -783,13 +815,13 @@ impl Endpoint {
        Ok(())
    }

-    pub fn connstr(&self) -> String {
+    pub fn connstr(&self, user: &str, db_name: &str) -> String {
        format!(
            "postgresql://{}@{}:{}/{}",
-            "cloud_admin",
+            user,
            self.pg_address.ip(),
            self.pg_address.port(),
-            "postgres"
+            db_name
        )
    }
 }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -210,6 +210,25 @@ impl PageServerNode {
        update_config: bool,
        register: bool,
    ) -> anyhow::Result<()> {
+        // Register the node with the storage controller before starting pageserver: pageserver must be registered to
+        // successfully call /re-attach and finish starting up.
+        if register {
+            let attachment_service = AttachmentService::from_env(&self.env);
+            let (pg_host, pg_port) =
+                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
+            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
+                .expect("Unable to parse listen_http_addr");
+            attachment_service
+                .node_register(NodeRegisterRequest {
+                    node_id: self.conf.id,
+                    listen_pg_addr: pg_host.to_string(),
+                    listen_pg_port: pg_port.unwrap_or(5432),
+                    listen_http_addr: http_host.to_string(),
+                    listen_http_port: http_port.unwrap_or(80),
+                })
+                .await?;
+        }
+
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
@@ -248,23 +267,6 @@ impl PageServerNode {
        )
        .await?;

-        if register {
-            let attachment_service = AttachmentService::from_env(&self.env);
-            let (pg_host, pg_port) =
-                parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
-            let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
-                .expect("Unable to parse listen_http_addr");
-            attachment_service
-                .node_register(NodeRegisterRequest {
-                    node_id: self.conf.id,
-                    listen_pg_addr: pg_host.to_string(),
-                    listen_pg_port: pg_port.unwrap_or(5432),
-                    listen_http_addr: http_host.to_string(),
-                    listen_http_port: http_port.unwrap_or(80),
-                })
-                .await?;
-        }
-
        Ok(())
    }

@@ -389,11 +391,6 @@ impl PageServerNode {
            evictions_low_residence_duration_metric_threshold: settings
                .remove("evictions_low_residence_duration_metric_threshold")
                .map(|x| x.to_string()),
-            gc_feedback: settings
-                .remove("gc_feedback")
-                .map(|x| x.parse::<bool>())
-                .transpose()
-                .context("Failed to parse 'gc_feedback' as bool")?,
            heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
            lazy_slru_download: settings
                .remove("lazy_slru_download")
@@ -499,11 +496,6 @@ impl PageServerNode {
                evictions_low_residence_duration_metric_threshold: settings
                    .remove("evictions_low_residence_duration_metric_threshold")
                    .map(|x| x.to_string()),
-                gc_feedback: settings
-                    .remove("gc_feedback")
-                    .map(|x| x.parse::<bool>())
-                    .transpose()
-                    .context("Failed to parse 'gc_feedback' as bool")?,
                heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
                lazy_slru_download: settings
                    .remove("lazy_slru_download")
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -52,6 +52,10 @@ pub enum ComputeStatus {
    // compute will exit soon or is waiting for
    // control-plane to terminate it.
    Failed,
+    // Termination requested
+    TerminationPending,
+    // Terminated Postgres
+    Terminated,
 }

 fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -79,6 +79,12 @@ pub struct ComputeSpec {
    // Stripe size for pageserver sharding, in pages
    #[serde(default)]
    pub shard_stripe_size: Option<usize>,
+
+    // When we are starting a new replica in hot standby mode,
+    // we need to know if the primary is running.
+    // This is used to determine if replica should wait for
+    // RUNNING_XACTS from primary or not.
+    pub primary_is_running: Option<bool>,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -201,6 +201,11 @@ impl<P: Atomic> GenericCounterPairVec<P> {
    pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
        self.get_metric_with_label_values(vals).unwrap()
    }
+
+    pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
+        res[0] = self.inc.remove_label_values(vals);
+        res[1] = self.dec.remove_label_values(vals);
+    }
 }

 impl<P: Atomic> GenericCounterPair<P> {
@@ -247,6 +252,15 @@ impl<P: Atomic> GenericCounterPair<P> {
    }
 }

+impl<P: Atomic> Clone for GenericCounterPair<P> {
+    fn clone(&self) -> Self {
+        Self {
+            inc: self.inc.clone(),
+            dec: self.dec.clone(),
+        }
+    }
+}
+
 /// Guard returned by [`GenericCounterPair::guard`]
 pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);

--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -18,9 +18,11 @@ enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 hex.workspace = true
+humantime.workspace = true
 thiserror.workspace = true
 humantime-serde.workspace = true
 chrono.workspace = true
+itertools.workspace = true

 workspace_hack.workspace = true

--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -2,6 +2,7 @@ use postgres_ffi::BLCKSZ;
 use std::ops::Range;

 use crate::key::Key;
+use itertools::Itertools;

 ///
 /// Represents a set of Keys, in a compact form.
@@ -63,9 +64,36 @@ impl KeySpace {
        KeyPartitioning { parts }
    }

-    /// Update the keyspace such that it doesn't contain any range
-    /// that is overlapping with `other`. This can involve splitting or
-    /// removing of existing ranges.
+    /// Merge another keyspace into the current one.
+    /// Note: the keyspaces must not overlap (enforced via assertions)
+    pub fn merge(&mut self, other: &KeySpace) {
+        let all_ranges = self
+            .ranges
+            .iter()
+            .merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start);
+
+        let mut accum = KeySpaceAccum::new();
+        let mut prev: Option<&Range<Key>> = None;
+        for range in all_ranges {
+            if let Some(prev) = prev {
+                let overlap =
+                    std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end);
+                assert!(
+                    !overlap,
+                    "Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}",
+                    prev, range
+                );
+            }
+
+            accum.add_range(range.clone());
+            prev = Some(range);
+        }
+
+        self.ranges = accum.to_keyspace().ranges;
+    }
+
+    /// Remove all keys in `other` from `self`.
+    /// This can involve splitting or removing of existing ranges.
    pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
        let (self_start, self_end) = match (self.start(), self.end()) {
            (Some(start), Some(end)) => (start, end),
@@ -220,16 +248,7 @@ impl KeySpaceAccum {
    }

    pub fn consume_keyspace(&mut self) -> KeySpace {
-        if let Some(accum) = self.accum.take() {
-            self.ranges.push(accum);
-        }
-
-        let mut prev_accum = KeySpaceAccum::new();
-        std::mem::swap(self, &mut prev_accum);
-
-        KeySpace {
-            ranges: prev_accum.ranges,
-        }
+        std::mem::take(self).to_keyspace()
    }

    pub fn size(&self) -> u64 {
@@ -279,6 +298,13 @@ impl KeySpaceRandomAccum {
        }
        KeySpace { ranges }
    }
+
+    pub fn consume_keyspace(&mut self) -> KeySpace {
+        let mut prev_accum = KeySpaceRandomAccum::new();
+        std::mem::swap(self, &mut prev_accum);
+
+        prev_accum.to_keyspace()
+    }
 }

 pub fn key_range_size(key_range: &Range<Key>) -> u32 {
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1,4 +1,7 @@
 pub mod partitioning;
+pub mod utilization;
+
+pub use utilization::PageserverUtilization;

 use std::{
    collections::HashMap,
@@ -180,7 +183,7 @@ pub enum TimelineState {
    Broken { reason: String, backtrace: String },
 }

-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Clone)]
 pub struct TimelineCreateRequest {
    pub new_timeline_id: TimelineId,
    #[serde(default)]
@@ -280,7 +283,6 @@ pub struct TenantConfig {
    pub eviction_policy: Option<EvictionPolicy>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
-    pub gc_feedback: Option<bool>,
    pub heatmap_period: Option<String>,
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
@@ -291,6 +293,7 @@ pub struct TenantConfig {
 pub enum EvictionPolicy {
    NoEviction,
    LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
+    OnlyImitiate(EvictionPolicyLayerAccessThreshold),
 }

 impl EvictionPolicy {
@@ -298,6 +301,7 @@ impl EvictionPolicy {
        match self {
            EvictionPolicy::NoEviction => "NoEviction",
            EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
+            EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate",
        }
    }
 }
@@ -335,14 +339,14 @@ impl ThrottleConfig {
    }
    /// The requests per second allowed  by the given config.
    pub fn steady_rps(&self) -> f64 {
-        (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3
+        (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64())
    }
 }

 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
 /// lists out all possible states (and the virtual "Detached" state)
 /// in a flat form rather than using rust-style enums.
-#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
 pub enum LocationConfigMode {
    AttachedSingle,
    AttachedMulti,
@@ -406,6 +410,12 @@ pub struct TenantLocationConfigRequest {
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantTimeTravelRequest {
+    pub shard_counts: Vec<ShardCount>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantShardLocation {
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -0,0 +1,70 @@
+use std::time::SystemTime;
+
+/// Pageserver's current utilization and a score for how good a candidate the pageserver would be
+/// for the next tenant.
+///
+/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth.
+///
+/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
+/// not handle full u64 values properly.
+#[derive(serde::Serialize, Debug)]
+pub struct PageserverUtilization {
+    /// Used disk space
+    #[serde(serialize_with = "ser_saturating_u63")]
+    pub disk_usage_bytes: u64,
+    /// Free disk space
+    #[serde(serialize_with = "ser_saturating_u63")]
+    pub free_space_bytes: u64,
+    /// Score for how good a candidate this pageserver would be for the next tenant; lower is better.
+    #[serde(serialize_with = "ser_saturating_u63")]
+    pub utilization_score: u64,
+    /// When was this snapshot captured, pageserver local time.
+    ///
+    /// Use millis to give confidence that the value is regenerated often enough.
+    #[serde(serialize_with = "ser_rfc3339_millis")]
+    pub captured_at: SystemTime,
+}
+
+fn ser_rfc3339_millis<S: serde::Serializer>(
+    ts: &SystemTime,
+    serializer: S,
+) -> Result<S::Ok, S::Error> {
+    serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
+}
+
+/// openapi knows only `format: int64`, so avoid outputting a value that generated clients cannot parse.
+///
+/// Use this instead of a newtype, because a newtype would require handling the deserialization of values
+/// with the highest bit set which is properly parsed by serde formats, but would create a
+/// conundrum on how to handle and again serialize such values at type level. It will be a few
+/// years until we can use more than `i64::MAX` bytes on a disk.
+fn ser_saturating_u63<S: serde::Serializer>(value: &u64, serializer: S) -> Result<S::Ok, S::Error> {
+    const MAX_FORMAT_INT64: u64 = i64::MAX as u64;
+
+    let value = (*value).min(MAX_FORMAT_INT64);
+
+    serializer.serialize_u64(value)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::time::Duration;
+
+    use super::*;
+
+    #[test]
+    fn u64_max_is_serialized_as_u63_max() {
+        let doc = PageserverUtilization {
+            disk_usage_bytes: u64::MAX,
+            free_space_bytes: 0,
+            utilization_score: u64::MAX,
+            captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
+        };
+
+        let s = serde_json::to_string(&doc).unwrap();
+
+        let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
+
+        assert_eq!(s, expected);
+    }
+}
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -80,6 +80,9 @@ pub const XLOG_XACT_ABORT: u8 = 0x20;
 pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
 pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;

+// From standbydefs.h
+pub const XLOG_RUNNING_XACTS: u8 = 0x10;
+
 // From srlu.h
 pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
 pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize;
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -119,11 +119,6 @@ pub fn generate_pg_control(
    // Generate new pg_control needed for bootstrap
    checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;

-    //reset some fields we don't want to preserve
-    //TODO Check this.
-    //We may need to determine the value from twophase data.
-    checkpoint.oldestActiveXid = 0;
-
    //save new values in pg_control
    pg_control.checkPoint = 0;
    pg_control.checkPointCopy = checkpoint;
--- a/libs/remote_storage/src/error.rs
+++ b/libs/remote_storage/src/error.rs
@@ -44,6 +44,26 @@ impl DownloadError {
    }
 }

+impl From<std::io::Error> for DownloadError {
+    fn from(value: std::io::Error) -> Self {
+        let needs_unwrap = value.kind() == std::io::ErrorKind::Other
+            && value
+                .get_ref()
+                .and_then(|x| x.downcast_ref::<DownloadError>())
+                .is_some();
+
+        if needs_unwrap {
+            *value
+                .into_inner()
+                .expect("just checked")
+                .downcast::<DownloadError>()
+                .expect("just checked")
+        } else {
+            DownloadError::Other(value.into())
+        }
+    }
+}
+
 #[derive(Debug)]
 pub enum TimeTravelError {
    /// Validation or other error happened due to user input.
@@ -142,13 +162,12 @@ impl std::fmt::Display for TimeoutOrCancel {
 impl std::error::Error for TimeoutOrCancel {}

 impl TimeoutOrCancel {
-    pub fn caused(error: &anyhow::Error) -> Option<&Self> {
-        error.root_cause().downcast_ref()
-    }
-
    /// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`].
    pub fn caused_by_cancel(error: &anyhow::Error) -> bool {
-        Self::caused(error).is_some_and(Self::is_cancel)
+        error
+            .root_cause()
+            .downcast_ref::<Self>()
+            .is_some_and(Self::is_cancel)
    }

    pub fn is_cancel(&self) -> bool {
--- a/libs/remote_storage/src/support.rs
+++ b/libs/remote_storage/src/support.rs
@@ -73,6 +73,8 @@ where
        if !*this.hit {
            if let Poll::Ready(e) = this.cancellation.poll(cx) {
                *this.hit = true;
+
+                // most likely this will be a std::io::Error wrapping a DownloadError
                let e = Err(std::io::Error::from(e));
                return Poll::Ready(Some(e));
            }
@@ -130,6 +132,8 @@ mod tests {
                .is_some_and(|e| matches!(e, DownloadError::Cancelled)),
            "{inner:?}"
        );
+        let e = DownloadError::from(e);
+        assert!(matches!(e, DownloadError::Cancelled), "{e:?}");

        tokio::select! {
            _ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
@@ -146,7 +150,7 @@ mod tests {
        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
        let mut stream = std::pin::pin!(stream);

-        // because the stream uses 120s timeout we are paused, we advance to 120s right away.
+        // because the stream uses 120s timeout and we are paused, we advance to 120s right away.
        let first = stream.next();

        let e = first.await.expect("there must be some").unwrap_err();
@@ -158,6 +162,8 @@ mod tests {
                .is_some_and(|e| matches!(e, DownloadError::Timeout)),
            "{inner:?}"
        );
+        let e = DownloadError::from(e);
+        assert!(matches!(e, DownloadError::Timeout), "{e:?}");

        cancel.cancel();

--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::{
    borrow::Cow,
    fs::{self, File},
-    io,
+    io::{self, Write},
 };

 use camino::{Utf8Path, Utf8PathBuf};
@@ -161,6 +161,48 @@ pub async fn durable_rename(
    Ok(())
 }

+/// Writes a file to the specified `final_path` in a crash safe fashion, using [`std::fs`].
+///
+/// The file is first written to the specified `tmp_path`, and in a second
+/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync
+/// and atomic rename guarantee that, if we crash at any point, there will never
+/// be a partially written file at `final_path` (but maybe at `tmp_path`).
+///
+/// Callers are responsible for serializing calls of this function for a given `final_path`.
+/// If they don't, there may be an error due to conflicting `tmp_path`, or there will
+/// be no error and the content of `final_path` will be the "winner" caller's `content`.
+/// I.e., the atomicity guarantees still hold.
+pub fn overwrite(
+    final_path: &Utf8Path,
+    tmp_path: &Utf8Path,
+    content: &[u8],
+) -> std::io::Result<()> {
+    let Some(final_path_parent) = final_path.parent() else {
+        return Err(std::io::Error::from_raw_os_error(
+            nix::errno::Errno::EINVAL as i32,
+        ));
+    };
+    std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
+    let mut file = std::fs::OpenOptions::new()
+        .write(true)
+        // Use `create_new` so that, if we race with ourselves or something else,
+        // we bail out instead of causing damage.
+        .create_new(true)
+        .open(tmp_path)?;
+    file.write_all(content)?;
+    file.sync_all()?;
+    drop(file); // don't keep the fd open for longer than we have to
+
+    std::fs::rename(tmp_path, final_path)?;
+
+    let final_parent_dirfd = std::fs::OpenOptions::new()
+        .read(true)
+        .open(final_path_parent)?;
+
+    final_parent_dirfd.sync_all()?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -217,6 +217,20 @@ impl Client {
        }
    }

+    pub async fn tenant_time_travel_remote_storage(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timestamp: &str,
+        done_if_after: &str,
+    ) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}",
+            self.mgmt_api_endpoint
+        );
+        self.request(Method::PUT, &uri, ()).await?;
+        Ok(())
+    }
+
    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
        self.request(Method::PUT, &uri, req).await?;
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -261,10 +261,7 @@ where
            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);

            for part in slru_partitions.parts {
-                let blocks = self
-                    .timeline
-                    .get_vectored(&part.ranges, self.lsn, self.ctx)
-                    .await?;
+                let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;

                for (key, block) in blocks {
                    slru_builder.add_block(&key, block?).await?;
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -33,12 +33,13 @@ use utils::{
 use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
+use crate::tenant::timeline::GetVectoredImpl;
 use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
 use crate::virtual_file;
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
+    IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };

@@ -84,6 +85,8 @@ pub mod defaults {

    pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";

+    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
+
    ///
    /// Default built-in configuration file.
    ///
@@ -121,6 +124,8 @@ pub mod defaults {

 #virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}'

+#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -135,7 +140,6 @@ pub mod defaults {

 #min_resident_size_override = .. # in bytes
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
-#gc_feedback = false

 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
 #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
@@ -256,6 +260,8 @@ pub struct PageServerConf {
    pub ingest_batch_size: u64,

    pub virtual_file_io_engine: virtual_file::IoEngineKind,
+
+    pub get_vectored_impl: GetVectoredImpl,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -342,6 +348,8 @@ struct PageServerConfigBuilder {
    ingest_batch_size: BuilderValue<u64>,

    virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,
+
+    get_vectored_impl: BuilderValue<GetVectoredImpl>,
 }

 impl Default for PageServerConfigBuilder {
@@ -419,6 +427,8 @@ impl Default for PageServerConfigBuilder {
            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),

            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),
+
+            get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
        }
    }
 }
@@ -579,6 +589,10 @@ impl PageServerConfigBuilder {
        self.virtual_file_io_engine = BuilderValue::Set(value);
    }

+    pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) {
+        self.get_vectored_impl = BuilderValue::Set(value);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_warmup = self
            .concurrent_tenant_warmup
@@ -689,6 +703,9 @@ impl PageServerConfigBuilder {
            virtual_file_io_engine: self
                .virtual_file_io_engine
                .ok_or(anyhow!("missing virtual_file_io_engine"))?,
+            get_vectored_impl: self
+                .get_vectored_impl
+                .ok_or(anyhow!("missing get_vectored_impl"))?,
        })
    }
 }
@@ -808,17 +825,6 @@ impl PageServerConf {
            .join(connection_id.to_string())
    }

-    /// Points to a place in pageserver's local directory,
-    /// where certain timeline's metadata file should be located.
-    pub fn metadata_path(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-    ) -> Utf8PathBuf {
-        self.timeline_path(tenant_shard_id, timeline_id)
-            .join(METADATA_FILE_NAME)
-    }
-
    /// Turns storage remote path of a file into its local path.
    pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
        remote_path.with_base(&self.workdir)
@@ -943,6 +949,9 @@ impl PageServerConf {
                "virtual_file_io_engine" => {
                    builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?)
                }
+                "get_vectored_impl" => {
+                    builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1017,6 +1026,7 @@ impl PageServerConf {
            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
+            get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
        }
    }
 }
@@ -1250,6 +1260,7 @@ background_task_maximum_delay = '334 s'
                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
+                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1314,6 +1325,7 @@ background_task_maximum_delay = '334 s'
                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
                ingest_batch_size: 100,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
+                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1548,17 +1560,50 @@ threshold = "20m"
                eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
            })
        );
+
        match &conf.default_tenant_conf.eviction_policy {
-            EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"),
-            EvictionPolicy::LayerAccessThreshold(eviction_thresold) => {
-                assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60));
-                assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60));
+            EvictionPolicy::LayerAccessThreshold(eviction_threshold) => {
+                assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60));
+                assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60));
            }
+            other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
        }

        Ok(())
    }

+    #[test]
+    fn parse_imitation_only_pageserver_config() {
+        let tempdir = tempdir().unwrap();
+        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap();
+
+        let pageserver_conf_toml = format!(
+            r#"pg_distrib_dir = "{pg_distrib_dir}"
+metric_collection_endpoint = "http://sample.url"
+metric_collection_interval = "10min"
+id = 222
+
+[tenant_config]
+evictions_low_residence_duration_metric_threshold = "20m"
+
+[tenant_config.eviction_policy]
+kind = "OnlyImitiate"
+period = "20m"
+threshold = "20m"
+"#,
+        );
+        let toml: Document = pageserver_conf_toml.parse().unwrap();
+        let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap();
+
+        match &conf.default_tenant_conf.eviction_policy {
+            EvictionPolicy::OnlyImitiate(t) => {
+                assert_eq!(t.period, Duration::from_secs(20 * 60));
+                assert_eq!(t.threshold, Duration::from_secs(20 * 60));
+            }
+            other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"),
+        }
+    }
+
    fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
        let tempdir_path = tempdir.path();

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -234,7 +234,7 @@ impl DeletionHeader {
        let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
        let header_path = conf.deletion_header_path();
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
-        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes)
+        VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes)
            .await
            .maybe_fatal_err("save deletion header")?;

@@ -325,7 +325,8 @@ impl DeletionList {
        let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);

        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
-        VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes)
+
+        VirtualFile::crashsafe_overwrite(path, temp_path, bytes)
            .await
            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -567,114 +567,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"
-
-  /v1/tenant/{tenant_id}/attach:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-    post:
-      description: |
-        Schedules attach operation to happen in the background for the given tenant.
-        As soon as the caller sends this request, it must assume the pageserver
-        starts writing to the tenant's S3 state unless it receives one of the
-        distinguished errors below that state otherwise.
-
-        If a client receives a not-distinguished response, e.g., a network timeout,
-        it MUST retry the /attach request and poll again for the tenant's
-        attachment status.
-
-        After the client has received a 202, it MUST poll the tenant's
-        attachment status (field `attachment_status`) to reach state `attached`.
-        If the `attachment_status` is missing, the client MUST retry the `/attach`
-        request (goto previous paragraph). This is a robustness measure in case the tenant
-        status endpoint is buggy, but the attach operation is ongoing.
-
-        There is no way to cancel an in-flight request.
-
-        In any case, the client
-        * MUST NOT ASSUME that the /attach request has been lost in the network,
-        * MUST NOT ASSUME that the request has been lost, based on the observation
-          that a subsequent tenant status request returns 404. The request may
-          still be in flight. It must be retried.
-
-        The client SHOULD supply a `TenantConfig` for the tenant in the request body.
-        Settings specified in the config override the pageserver's defaults.
-        It is guaranteed that the config settings are applied before the pageserver
-        starts operating on the tenant. E.g., if the config specifies a specific
-        PITR interval for a tenant, then that setting will be in effect before the
-        pageserver starts the garbage collection loop. This enables a client to
-        guarantee a specific PITR setting across detach/attach cycles.
-        The pageserver will reject the request if it cannot parse the config, or
-        if there are any unknown fields in it.
-
-        If the client does not supply a config, the pageserver will use its defaults.
-        This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282
-      requestBody:
-        required: false
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/TenantAttachRequest"
-      responses:
-        "202":
-          description: Tenant attaching scheduled
-        "400":
-          description: Bad Request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Timeline not found
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "409":
-          description: |
-            The tenant is already known to Pageserver in some way,
-            and hence this `/attach` call has been rejected.
-
-            Some examples of how this can happen:
-            - tenant was created on this pageserver
-            - tenant attachment was started by an earlier call to `/attach`.
-
-            Callers should poll the tenant status's `attachment_status` field,
-            like for status 202. See the longer description for `POST /attach`
-            for details.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
-
  /v1/tenant/{tenant_id}/location_config:
    parameters:
      - name: tenant_id
@@ -770,66 +662,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-
-  /v1/tenant/{tenant_id}/detach:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: detach_ignored
-        in: query
-        required: false
-        schema:
-          type: boolean
-        description: |
-          When true, allow to detach a tenant which state is ignored.
-    post:
-      description: |
-        Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
-        Files on the remote storage are not affected.
-      responses:
-        "200":
-          description: Tenant detached
-        "400":
-          description: Error when no tenant id found in path parameters
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Tenant not found
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
-
  /v1/tenant/{tenant_id}/ignore:
    parameters:
      - name: tenant_id
@@ -1379,6 +1211,25 @@ paths:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"

+  /v1/utilization:
+    get:
+      description: |
+        Returns the pageserver's current utilization and fitness score for new tenants.
+
+      responses:
+        "200":
+          description: Pageserver utilization and fitness score
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PageserverUtilization"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
 components:
  securitySchemes:
    JWT:
@@ -1445,16 +1296,6 @@ components:
        generation:
          type: integer
          description: Attachment generation number.
-    TenantAttachRequest:
-      type: object
-      required:
-        - config
-      properties:
-        config:
-          $ref: '#/components/schemas/TenantConfig'
-        generation:
-          type: integer
-          description: Attachment generation number.
    TenantConfigRequest:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
@@ -1691,6 +1532,33 @@ components:
          type: string
          enum: [past, present, future, nodata]

+    PageserverUtilization:
+      type: object
+      required:
+        - disk_usage_bytes
+        - free_space_bytes
+        - utilization_score
+      properties:
+        disk_usage_bytes:
+          type: integer
+          format: int64
+          minimum: 0
+          description: The amount of disk space currently utilized by layer files.
+        free_space_bytes:
+          type: integer
+          format: int64
+          minimum: 0
+          description: The amount of usable disk space left.
+        utilization_score:
+          type: integer
+          format: int64
+          minimum: 0
+          maximum: 9223372036854775807
+          default: 9223372036854775807
+          description: |
+            Score for how suitable this pageserver would be for the next tenant; lower is better.
+            The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.
+
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -100,6 +100,7 @@ pub struct State {
    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
    deletion_queue_client: DeletionQueueClient,
    secondary_controller: SecondaryController,
+    latest_utilization: tokio::sync::Mutex<Option<(std::time::Instant, bytes::Bytes)>>,
 }

 impl State {
@@ -128,6 +129,7 @@ impl State {
            disk_usage_eviction_state,
            deletion_queue_client,
            secondary_controller,
+            latest_utilization: Default::default(),
        })
    }
 }
@@ -1963,6 +1965,54 @@ async fn put_io_engine_handler(
    json_response(StatusCode::OK, ())
 }

+/// Polled by control plane via `GET /v1/utilization`; the response is cached for up to 1s.
+///
+/// See [`crate::utilization`].
+async fn get_utilization(
+    r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    // this probably could be completely public, but let's make that change later.
+    check_permission(&r, None)?;
+
+    let state = get_state(&r);
+    let mut g = state.latest_utilization.lock().await;
+
+    let regenerate_every = Duration::from_secs(1);
+    let still_valid = g
+        .as_ref()
+        .is_some_and(|(captured_at, _)| captured_at.elapsed() < regenerate_every);
+
+    // avoid needless statvfs calls, even though those should be fast and non-blocking.
+    // regenerate at most at 1Hz so that callers can poll at any rate.
+    if !still_valid {
+        let path = state.conf.tenants_path();
+        let doc = crate::utilization::regenerate(path.as_std_path())
+            .map_err(ApiError::InternalServerError)?;
+
+        let mut buf = Vec::new();
+        serde_json::to_writer(&mut buf, &doc)
+            .context("serialize")
+            .map_err(ApiError::InternalServerError)?;
+
+        let body = bytes::Bytes::from(buf);
+
+        *g = Some((std::time::Instant::now(), body));
+    }
+
+    // hyper 0.14 doesn't yet have Response::clone, so clone the cached body bytes instead
+    let cached = g.as_ref().expect("just set").1.clone();
+
+    Response::builder()
+        .header(hyper::http::header::CONTENT_TYPE, "application/json")
+        // considered adding an HTTP `Date` header, but its one-second precision would not give
+        // any debugging aid
+        .status(StatusCode::OK)
+        .body(hyper::Body::from(cached))
+        .context("build response")
+        .map_err(ApiError::InternalServerError)
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -2224,5 +2274,6 @@ pub fn make_router(
            |r| api_handler(r, timeline_collect_keyspace),
        )
        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
+        .get("/v1/utilization", |r| api_handler(r, get_utilization))
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -22,6 +22,7 @@ pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
 pub mod trace;
+pub mod utilization;
 pub mod virtual_file;
 pub mod walingest;
 pub mod walrecord;
@@ -168,15 +169,6 @@ pub fn is_delete_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
 }

-fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
-    if let Some(e) = e.io_error() {
-        if e.kind() == std::io::ErrorKind::NotFound {
-            return true;
-        }
-    }
-    false
-}
-
 /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
 /// blocking.
 ///
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -4,8 +4,8 @@ use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
    register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
-    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
-    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
+    IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
@@ -642,26 +642,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
    .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
 });

-// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
-// or in testing they estimate how much we would upload if we did.
-static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_created_persistent_files_total",
-        "Number of files created that are meant to be uploaded to cloud storage",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
-static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_written_persistent_bytes_total",
-        "Total bytes written that are meant to be uploaded to cloud storage",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_eviction_iteration_duration_seconds_global",
@@ -1266,13 +1246,12 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {

 // remote storage metrics

-/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`].
-static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "pageserver_remote_timeline_client_calls_unfinished",
-        "Number of ongoing calls to remote timeline client. \
-         Used to populate pageserver_remote_timeline_client_calls_started. \
-         This metric is not useful for sampling from Prometheus, but useful in tests.",
+static REMOTE_TIMELINE_CLIENT_CALLS: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "pageserver_remote_timeline_client_calls_started",
+        "Number of started calls to remote timeline client.",
+        "pageserver_remote_timeline_client_calls_finished",
+        "Number of finished calls to remote timeline client.",
        &[
            "tenant_id",
            "shard_id",
@@ -1281,23 +1260,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::
            "op_kind"
        ],
    )
-    .expect("failed to define a metric")
-});
-
-static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "pageserver_remote_timeline_client_calls_started",
-        "When calling a remote timeline client method, we record the current value \
-         of the calls_unfinished gauge in this histogram. Plot the histogram \
-         over time in a heatmap to visualize how many operations were ongoing \
-         at a given instant. It gives you a better idea of the queue depth \
-         than plotting the gauge directly, since operations may complete faster \
-         than the sampling interval.",
-        &["file_kind", "op_kind"],
-        // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
-        vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
-    )
-    .expect("failed to define a metric")
+    .unwrap()
 });

 static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
@@ -1819,8 +1782,6 @@ pub(crate) struct TimelineMetrics {
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
-    pub num_persistent_files_created: IntCounter,
-    pub persistent_bytes_written: IntCounter,
    pub evictions: IntCounter,
    pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
 }
@@ -1902,12 +1863,6 @@ impl TimelineMetrics {
        };
        let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
            Lazy::new(Box::new(directory_entries_count_gauge_closure));
-        let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
-        let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -1929,8 +1884,6 @@ impl TimelineMetrics {
            resident_physical_size_gauge,
            current_logical_size_gauge,
            directory_entries_count_gauge,
-            num_persistent_files_created,
-            persistent_bytes_written,
            evictions,
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
@@ -1940,8 +1893,6 @@ impl TimelineMetrics {

    pub(crate) fn record_new_file_metrics(&self, sz: u64) {
        self.resident_physical_size_add(sz);
-        self.num_persistent_files_created.inc_by(1);
-        self.persistent_bytes_written.inc_by(sz);
    }

    pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
@@ -1974,9 +1925,6 @@ impl Drop for TimelineMetrics {
        if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
            let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
        }
-        let _ =
-            NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
-        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
        let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);

        self.evictions_with_low_residence_duration
@@ -2078,7 +2026,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
    shard_id: String,
    timeline_id: String,
    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
-    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
+    calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
 }
@@ -2089,7 +2037,7 @@ impl RemoteTimelineClientMetrics {
            tenant_id: tenant_shard_id.tenant_id.to_string(),
            shard_id: format!("{}", tenant_shard_id.shard_slug()),
            timeline_id: timeline_id.to_string(),
-            calls_unfinished_gauge: Mutex::new(HashMap::default()),
+            calls: Mutex::new(HashMap::default()),
            bytes_started_counter: Mutex::new(HashMap::default()),
            bytes_finished_counter: Mutex::new(HashMap::default()),
            remote_physical_size_gauge: Mutex::new(None),
@@ -2129,15 +2077,15 @@ impl RemoteTimelineClientMetrics {
            .unwrap()
    }

-    fn calls_unfinished_gauge(
+    fn calls_counter_pair(
        &self,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
-    ) -> IntGauge {
-        let mut guard = self.calls_unfinished_gauge.lock().unwrap();
+    ) -> IntCounterPair {
+        let mut guard = self.calls.lock().unwrap();
        let key = (file_kind.as_str(), op_kind.as_str());
        let metric = guard.entry(key).or_insert_with(move || {
-            REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
+            REMOTE_TIMELINE_CLIENT_CALLS
                .get_metric_with_label_values(&[
                    &self.tenant_id,
                    &self.shard_id,
@@ -2150,17 +2098,6 @@ impl RemoteTimelineClientMetrics {
        metric.clone()
    }

-    fn calls_started_hist(
-        &self,
-        file_kind: &RemoteOpFileKind,
-        op_kind: &RemoteOpKind,
-    ) -> Histogram {
-        let key = (file_kind.as_str(), op_kind.as_str());
-        REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
-            .get_metric_with_label_values(&[key.0, key.1])
-            .unwrap()
-    }
-
    fn bytes_started_counter(
        &self,
        file_kind: &RemoteOpFileKind,
@@ -2231,7 +2168,7 @@ impl RemoteTimelineClientMetrics {
 #[must_use]
 pub(crate) struct RemoteTimelineClientCallMetricGuard {
    /// Decremented on drop.
-    calls_unfinished_metric: Option<IntGauge>,
+    calls_counter_pair: Option<IntCounterPair>,
    /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop.
    bytes_finished: Option<(IntCounter, u64)>,
 }
@@ -2241,10 +2178,10 @@ impl RemoteTimelineClientCallMetricGuard {
    /// The caller vouches to do the metric updates manually.
    pub fn will_decrement_manually(mut self) {
        let RemoteTimelineClientCallMetricGuard {
-            calls_unfinished_metric,
+            calls_counter_pair,
            bytes_finished,
        } = &mut self;
-        calls_unfinished_metric.take();
+        calls_counter_pair.take();
        bytes_finished.take();
    }
 }
@@ -2252,10 +2189,10 @@ impl RemoteTimelineClientCallMetricGuard {
 impl Drop for RemoteTimelineClientCallMetricGuard {
    fn drop(&mut self) {
        let RemoteTimelineClientCallMetricGuard {
-            calls_unfinished_metric,
+            calls_counter_pair,
            bytes_finished,
        } = self;
-        if let Some(guard) = calls_unfinished_metric.take() {
+        if let Some(guard) = calls_counter_pair.take() {
            guard.dec();
        }
        if let Some((bytes_finished_metric, value)) = bytes_finished {
@@ -2288,10 +2225,8 @@ impl RemoteTimelineClientMetrics {
        op_kind: &RemoteOpKind,
        size: RemoteTimelineClientMetricsCallTrackSize,
    ) -> RemoteTimelineClientCallMetricGuard {
-        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
-        self.calls_started_hist(file_kind, op_kind)
-            .observe(calls_unfinished_metric.get() as f64);
-        calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric
+        let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
+        calls_counter_pair.inc();

        let bytes_finished = match size {
            RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {
@@ -2305,7 +2240,7 @@ impl RemoteTimelineClientMetrics {
            }
        };
        RemoteTimelineClientCallMetricGuard {
-            calls_unfinished_metric: Some(calls_unfinished_metric),
+            calls_counter_pair: Some(calls_counter_pair),
            bytes_finished,
        }
    }
@@ -2319,12 +2254,8 @@ impl RemoteTimelineClientMetrics {
        op_kind: &RemoteOpKind,
        size: RemoteTimelineClientMetricsCallTrackSize,
    ) {
-        let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind);
-        debug_assert!(
-            calls_unfinished_metric.get() > 0,
-            "begin and end should cancel out"
-        );
-        calls_unfinished_metric.dec();
+        let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind);
+        calls_counter_pair.dec();
        match size {
            RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {}
            RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => {
@@ -2341,18 +2272,15 @@ impl Drop for RemoteTimelineClientMetrics {
            shard_id,
            timeline_id,
            remote_physical_size_gauge,
-            calls_unfinished_gauge,
+            calls,
            bytes_started_counter,
            bytes_finished_counter,
        } = self;
-        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
-            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
-                tenant_id,
-                shard_id,
-                timeline_id,
-                a,
-                b,
-            ]);
+        for ((a, b), _) in calls.get_mut().unwrap().drain() {
+            let mut res = [Ok(()), Ok(())];
+            REMOTE_TIMELINE_CLIENT_CALLS
+                .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]);
+            // don't care about results
        }
        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -29,7 +29,6 @@ use remote_storage::TimeoutOrCancel;
 use std::fmt;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
-use tokio::runtime::Handle;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
@@ -172,9 +171,6 @@ pub(crate) mod throttle;
 pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};

-// re-export for use in remote_timeline_client.rs
-pub use crate::tenant::metadata::save_metadata;
-
 // re-export for use in walreceiver
 pub use crate::tenant::timeline::WalReceiverInfo;

@@ -1151,17 +1147,6 @@ impl Tenant {
            None
        };

-        // timeline loading after attach expects to find metadata file for each metadata
-        save_metadata(
-            self.conf,
-            &self.tenant_shard_id,
-            &timeline_id,
-            &remote_metadata,
-        )
-        .await
-        .context("save_metadata")
-        .map_err(LoadLocalTimelineError::Load)?;
-
        self.timeline_init_and_sync(
            timeline_id,
            resources,
@@ -2588,19 +2573,24 @@ impl Tenant {
        legacy_config_path: &Utf8Path,
        location_conf: &LocationConf,
    ) -> anyhow::Result<()> {
-        // Forward compat: write out an old-style configuration that old versions can read, in case we roll back
-        Self::persist_tenant_config_legacy(
-            tenant_shard_id,
-            legacy_config_path,
-            &location_conf.tenant_conf,
-        )
-        .await?;
-
        if let LocationMode::Attached(attach_conf) = &location_conf.mode {
-            // Once we use LocationMode, generations are mandatory.  If we aren't using generations,
-            // then drop out after writing legacy-style config.
+            // The modern-style LocationConf config file requires a generation to be set. In case someone
+            // is running a pageserver without the infrastructure to set generations, write out the legacy-style
+            // config file that only contains TenantConf.
+            //
+            // This will eventually be removed in https://github.com/neondatabase/neon/issues/5388
+
            if attach_conf.generation.is_none() {
-                tracing::debug!("Running without generations, not writing new-style LocationConf");
+                tracing::info!(
+                    "Running without generations, writing legacy-style tenant config file"
+                );
+                Self::persist_tenant_config_legacy(
+                    tenant_shard_id,
+                    legacy_config_path,
+                    &location_conf.tenant_conf,
+                )
+                .await?;
+
                return Ok(());
            }
        }
@@ -2623,17 +2613,10 @@ impl Tenant {

        let tenant_shard_id = *tenant_shard_id;
        let config_path = config_path.to_owned();
-        tokio::task::spawn_blocking(move || {
-            Handle::current().block_on(async move {
-                let conf_content = conf_content.into_bytes();
-                VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
-                    .await
-                    .with_context(|| {
-                        format!("write tenant {tenant_shard_id} config to {config_path}")
-                    })
-            })
-        })
-        .await??;
+        let conf_content = conf_content.into_bytes();
+        VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content)
+            .await
+            .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?;

        Ok(())
    }
@@ -2660,17 +2643,12 @@ impl Tenant {

        let tenant_shard_id = *tenant_shard_id;
        let target_config_path = target_config_path.to_owned();
-        tokio::task::spawn_blocking(move || {
-            Handle::current().block_on(async move {
-                let conf_content = conf_content.into_bytes();
-                VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
-                    .await
-                    .with_context(|| {
-                        format!("write tenant {tenant_shard_id} config to {target_config_path}")
-                    })
-            })
-        })
-        .await??;
+        let conf_content = conf_content.into_bytes();
+        VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content)
+            .await
+            .with_context(|| {
+                format!("write tenant {tenant_shard_id} config to {target_config_path}")
+            })?;
        Ok(())
    }

@@ -3293,10 +3271,7 @@ impl Tenant {

        timeline_struct.init_empty_layer_map(start_lsn);

-        if let Err(e) = self
-            .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata)
-            .await
-        {
+        if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
            error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
            cleanup_timeline_directory(uninit_mark);
            return Err(e);
@@ -3313,26 +3288,13 @@ impl Tenant {
        ))
    }

-    async fn create_timeline_files(
-        &self,
-        timeline_path: &Utf8Path,
-        new_timeline_id: &TimelineId,
-        new_metadata: &TimelineMetadata,
-    ) -> anyhow::Result<()> {
+    async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
        crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;

        fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
            anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
        });

-        save_metadata(
-            self.conf,
-            &self.tenant_shard_id,
-            new_timeline_id,
-            new_metadata,
-        )
-        .await
-        .context("Failed to create timeline metadata")?;
        Ok(())
    }

@@ -3684,7 +3646,6 @@ pub(crate) mod harness {
                evictions_low_residence_duration_metric_threshold: Some(
                    tenant_conf.evictions_low_residence_duration_metric_threshold,
                ),
-                gc_feedback: Some(tenant_conf.gc_feedback),
                heatmap_period: Some(tenant_conf.heatmap_period),
                lazy_slru_download: Some(tenant_conf.lazy_slru_download),
                timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
@@ -3877,6 +3838,7 @@ mod tests {
    use bytes::BytesMut;
    use hex_literal::hex;
    use once_cell::sync::Lazy;
+    use pageserver_api::keyspace::KeySpace;
    use rand::{thread_rng, Rng};
    use tokio_util::sync::CancellationToken;

@@ -4514,6 +4476,61 @@ mod tests {
        Ok(())
    }

+    async fn bulk_insert_compact_gc(
+        timeline: Arc<Timeline>,
+        ctx: &RequestContext,
+        mut lsn: Lsn,
+        repeat: usize,
+        key_count: usize,
+    ) -> anyhow::Result<()> {
+        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let mut blknum = 0;
+
+        // Enforce that key range is monotonically increasing
+        let mut keyspace = KeySpaceAccum::new();
+
+        for _ in 0..repeat {
+            for _ in 0..key_count {
+                test_key.field6 = blknum;
+                let mut writer = timeline.writer().await;
+                writer
+                    .put(
+                        test_key,
+                        lsn,
+                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
+                        ctx,
+                    )
+                    .await?;
+                writer.finish_write(lsn);
+                drop(writer);
+
+                keyspace.add_key(test_key);
+
+                lsn = Lsn(lsn.0 + 0x10);
+                blknum += 1;
+            }
+
+            let cutoff = timeline.get_last_record_lsn();
+
+            timeline
+                .update_gc_info(
+                    Vec::new(),
+                    cutoff,
+                    Duration::ZERO,
+                    &CancellationToken::new(),
+                    ctx,
+                )
+                .await?;
+            timeline.freeze_and_flush().await?;
+            timeline
+                .compact(&CancellationToken::new(), EnumSet::empty(), ctx)
+                .await?;
+            timeline.gc().await?;
+        }
+
+        Ok(())
+    }
+
    //
    // Insert 1000 key-value pairs with increasing keys, flush, compact, GC.
    // Repeat 50 times.
@@ -4526,49 +4543,98 @@ mod tests {
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

-        let mut lsn = Lsn(0x10);
+        let lsn = Lsn(0x10);
+        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;

-        let mut keyspace = KeySpaceAccum::new();
+        Ok(())
+    }

-        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
-        let mut blknum = 0;
-        for _ in 0..50 {
-            for _ in 0..10000 {
-                test_key.field6 = blknum;
-                let mut writer = tline.writer().await;
-                writer
-                    .put(
-                        test_key,
-                        lsn,
-                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
-                        &ctx,
-                    )
-                    .await?;
-                writer.finish_write(lsn);
-                drop(writer);
+    // Test the real vectored get implementation against a simple sequential implementation.
+    //
+    // The test generates a keyspace by repeatedly flushing the in-memory layer and compacting.
+    // Projected to 2D the key space looks like below. Lsn grows upwards on the Y axis and keys
+    // grow to the right on the X axis.
+    //                       [Delta]
+    //                 [Delta]
+    //           [Delta]
+    //    [Delta]
+    // ------------ Image ---------------
+    //
+    // After layer generation we pick the ranges to query as follows:
+    // 1. The beginning of each delta layer
+    // 2. At the seam between two adjacent delta layers
+    //
+    // There's one major downside to this test: delta layers only contain images,
+    // so the search can stop at the first delta layer and doesn't traverse any deeper.
+    #[tokio::test]
+    async fn test_get_vectored() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_get_vectored")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
+            .await?;

-                keyspace.add_key(test_key);
+        let lsn = Lsn(0x10);
+        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;

-                lsn = Lsn(lsn.0 + 0x10);
-                blknum += 1;
+        let guard = tline.layers.read().await;
+        guard.layer_map().dump(true, &ctx).await?;
+
+        let mut reads = Vec::new();
+        let mut prev = None;
+        guard.layer_map().iter_historic_layers().for_each(|desc| {
+            if !desc.is_delta() {
+                prev = Some(desc.clone());
+                return;
            }

-            let cutoff = tline.get_last_record_lsn();
+            let start = desc.key_range.start;
+            let end = desc
+                .key_range
+                .start
+                .add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap());
+            reads.push(KeySpace {
+                ranges: vec![start..end],
+            });

+            if let Some(prev) = &prev {
+                if !prev.is_delta() {
+                    return;
+                }
+
+                let first_range = Key {
+                    field6: prev.key_range.end.field6 - 4,
+                    ..prev.key_range.end
+                }..prev.key_range.end;
+
+                let second_range = desc.key_range.start..Key {
+                    field6: desc.key_range.start.field6 + 4,
+                    ..desc.key_range.start
+                };
+
+                reads.push(KeySpace {
+                    ranges: vec![first_range, second_range],
+                });
+            };
+
+            prev = Some(desc.clone());
+        });
+
+        drop(guard);
+
+        // Pick a big LSN such that we query over all the changes.
+        // Technically, u64::MAX - 1 is the largest LSN supported by the read path,
+        // but there seems to be a bug on the non-vectored search path which surfaces
+        // in that case.
+        let reads_lsn = Lsn(u64::MAX - 1000);
+
+        for read in reads {
+            info!("Doing vectored read on {:?}", read);
+
+            let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await;
            tline
-                .update_gc_info(
-                    Vec::new(),
-                    cutoff,
-                    Duration::ZERO,
-                    &CancellationToken::new(),
-                    &ctx,
-                )
-                .await?;
-            tline.freeze_and_flush().await?;
-            tline
-                .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
-                .await?;
-            tline.gc().await?;
+                .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
+                .await;
        }

        Ok(())
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -339,7 +339,6 @@ pub struct TenantConf {
    // See the corresponding metric's help string.
    #[serde(with = "humantime_serde")]
    pub evictions_low_residence_duration_metric_threshold: Duration,
-    pub gc_feedback: bool,

    /// If non-zero, the period between uploads of a heatmap from attached tenants.  This
    /// may be disabled if a Tenant will not have secondary locations: only secondary
@@ -427,10 +426,6 @@ pub struct TenantConfOpt {
    #[serde(default)]
    pub evictions_low_residence_duration_metric_threshold: Option<Duration>,

-    #[serde(skip_serializing_if = "Option::is_none")]
-    #[serde(default)]
-    pub gc_feedback: Option<bool>,
-
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    #[serde(default)]
@@ -485,7 +480,6 @@ impl TenantConfOpt {
            evictions_low_residence_duration_metric_threshold: self
                .evictions_low_residence_duration_metric_threshold
                .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
-            gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
            heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
            lazy_slru_download: self
                .lazy_slru_download
@@ -530,7 +524,6 @@ impl Default for TenantConf {
                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
            )
            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
-            gc_feedback: false,
            heatmap_period: Duration::ZERO,
            lazy_slru_download: false,
            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
@@ -603,7 +596,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
            evictions_low_residence_duration_metric_threshold: value
                .evictions_low_residence_duration_metric_threshold
                .map(humantime),
-            gc_feedback: value.gc_feedback,
            heatmap_period: value.heatmap_period.map(humantime),
            lazy_slru_download: value.lazy_slru_download,
            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -52,8 +52,7 @@ use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
 use pageserver_api::keyspace::KeySpaceAccum;
-use std::cmp::Ordering;
-use std::collections::{BTreeMap, VecDeque};
+use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
 use std::sync::Arc;
@@ -147,43 +146,28 @@ impl Drop for BatchedUpdates<'_> {
 }

 /// Return value of LayerMap::search
-#[derive(Eq, PartialEq, Debug)]
+#[derive(Eq, PartialEq, Debug, Hash)]
 pub struct SearchResult {
    pub layer: Arc<PersistentLayerDesc>,
    pub lsn_floor: Lsn,
 }

-pub struct OrderedSearchResult(SearchResult);
-
-impl Ord for OrderedSearchResult {
-    fn cmp(&self, other: &Self) -> Ordering {
-        self.0.lsn_floor.cmp(&other.0.lsn_floor)
-    }
-}
-
-impl PartialOrd for OrderedSearchResult {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl PartialEq for OrderedSearchResult {
-    fn eq(&self, other: &Self) -> bool {
-        self.0.lsn_floor == other.0.lsn_floor
-    }
-}
-
-impl Eq for OrderedSearchResult {}
-
+/// Return value of [`LayerMap::range_search`]
+///
+/// Contains a mapping from a layer description to a keyspace
+/// accumulator that contains all the keys which intersect the layer
+/// from the original search space. Keys that were not found are accumulated
+/// in a separate key space accumulator.
+#[derive(Debug)]
 pub struct RangeSearchResult {
-    pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
+    pub found: HashMap<SearchResult, KeySpaceAccum>,
    pub not_found: KeySpaceAccum,
 }

 impl RangeSearchResult {
    fn new() -> Self {
        Self {
-            found: BTreeMap::new(),
+            found: HashMap::new(),
            not_found: KeySpaceAccum::new(),
        }
    }
@@ -314,7 +298,7 @@ where
            Some(search_result) => self
                .result
                .found
-                .entry(OrderedSearchResult(search_result))
+                .entry(search_result)
                .or_default()
                .add_range(covered_range),
            None => self.pad_range(covered_range),
@@ -362,6 +346,35 @@ where
    }
 }

+#[derive(PartialEq, Eq, Hash, Debug, Clone)]
+pub enum InMemoryLayerHandle {
+    Open {
+        lsn_floor: Lsn,
+        end_lsn: Lsn,
+    },
+    Frozen {
+        idx: usize,
+        lsn_floor: Lsn,
+        end_lsn: Lsn,
+    },
+}
+
+impl InMemoryLayerHandle {
+    pub fn get_lsn_floor(&self) -> Lsn {
+        match self {
+            InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor,
+            InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor,
+        }
+    }
+
+    pub fn get_end_lsn(&self) -> Lsn {
+        match self {
+            InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn,
+            InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn,
+        }
+    }
+}
+
 impl LayerMap {
    ///
    /// Find the latest layer (by lsn.end) that covers the given
@@ -556,6 +569,43 @@ impl LayerMap {
        self.historic.iter()
    }

+    /// Get a handle for the first in memory layer that matches the provided predicate.
+    /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer.
+    ///
+    /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during
+    /// the same exclusive region established by holding the layer manager lock.
+    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<InMemoryLayerHandle>
+    where
+        Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
+    {
+        if let Some(open) = &self.open_layer {
+            if pred(open) {
+                return Some(InMemoryLayerHandle::Open {
+                    lsn_floor: open.get_lsn_range().start,
+                    end_lsn: open.get_lsn_range().end,
+                });
+            }
+        }
+
+        let pos = self.frozen_layers.iter().rev().position(pred);
+        pos.map(|rev_idx| {
+            let idx = self.frozen_layers.len() - 1 - rev_idx;
+            InMemoryLayerHandle::Frozen {
+                idx,
+                lsn_floor: self.frozen_layers[idx].get_lsn_range().start,
+                end_lsn: self.frozen_layers[idx].get_lsn_range().end,
+            }
+        })
+    }
+
+    /// Get the layer pointed to by the provided handle.
+    pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option<Arc<InMemoryLayer>> {
+        match handle {
+            InMemoryLayerHandle::Open { .. } => self.open_layer.clone(),
+            InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(),
+        }
+    }
+
    ///
    /// Divide the whole given range of keys into sub-ranges based on the latest
    /// image layer that covers each range at the specified lsn (inclusive).
@@ -869,6 +919,8 @@ impl LayerMap {

 #[cfg(test)]
 mod tests {
+    use pageserver_api::keyspace::KeySpace;
+
    use super::*;

    #[derive(Clone)]
@@ -895,15 +947,15 @@ mod tests {

    fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
        assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
-        let lhs: Vec<_> = lhs
+        let lhs: HashMap<SearchResult, KeySpace> = lhs
            .found
            .into_iter()
-            .map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
+            .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
            .collect();
-        let rhs: Vec<_> = rhs
+        let rhs: HashMap<SearchResult, KeySpace> = rhs
            .found
            .into_iter()
-            .map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
+            .map(|(search_result, accum)| (search_result, accum.to_keyspace()))
            .collect();

        assert_eq!(lhs, rhs);
@@ -923,7 +975,7 @@ mod tests {
                Some(res) => {
                    range_search_result
                        .found
-                        .entry(OrderedSearchResult(res))
+                        .entry(res)
                        .or_default()
                        .add_key(key);
                }
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -8,20 +8,11 @@
 //!
 //! [`remote_timeline_client`]: super::remote_timeline_client

-use std::io::{self};
-
-use anyhow::{ensure, Context};
-use pageserver_api::shard::TenantShardId;
+use anyhow::ensure;
 use serde::{de::Error, Deserialize, Serialize, Serializer};
-use thiserror::Error;
 use utils::bin_ser::SerializeError;
-use utils::crashsafe::path_with_suffix_extension;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};

-use crate::config::PageServerConf;
-use crate::virtual_file::VirtualFile;
-use crate::TEMP_FILE_SUFFIX;
-
 /// Use special format number to enable backward compatibility.
 const METADATA_FORMAT_VERSION: u16 = 4;

@@ -268,32 +259,6 @@ impl Serialize for TimelineMetadata {
    }
 }

-/// Save timeline metadata to file
-#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
-pub async fn save_metadata(
-    conf: &'static PageServerConf,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
-    data: &TimelineMetadata,
-) -> anyhow::Result<()> {
-    let path = conf.metadata_path(tenant_shard_id, timeline_id);
-    let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
-    let metadata_bytes = data.to_bytes().context("serialize metadata")?;
-    VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
-        .await
-        .context("write metadata")?;
-    Ok(())
-}
-
-#[derive(Error, Debug)]
-pub enum LoadMetadataError {
-    #[error(transparent)]
-    Read(#[from] io::Error),
-
-    #[error(transparent)]
-    Decode(#[from] anyhow::Error),
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -42,7 +42,7 @@ use crate::tenant::config::{
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
-use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext::PathExt;
@@ -359,12 +359,6 @@ fn load_tenant_config(
        return Ok(None);
    }

-    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
-    if tenant_ignore_mark_file.exists() {
-        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
-        return Ok(None);
-    }
-
    let tenant_shard_id = match tenant_dir_path
        .file_name()
        .unwrap_or_default()
@@ -377,6 +371,59 @@ fn load_tenant_config(
        }
    };

+    // Clean up legacy `metadata` files.
+    // Doing it here because every single tenant directory is visited here.
+    // In any later code, there's different treatment of tenant dirs
+    // ... depending on whether the tenant is in re-attach response or not
+    // ... depending on whether the tenant is ignored or not
+    assert_eq!(
+        &conf.tenant_path(&tenant_shard_id),
+        &tenant_dir_path,
+        "later use of conf....path() methods would be dubious"
+    );
+    let timelines: Vec<TimelineId> = match conf.timelines_path(&tenant_shard_id).read_dir_utf8() {
+        Ok(iter) => {
+            let mut timelines = Vec::new();
+            for res in iter {
+                let p = res?;
+                let Some(timeline_id) = p.file_name().parse::<TimelineId>().ok() else {
+                    // skip any entries that aren't TimelineId, such as
+                    // - *.___temp dirs
+                    // - unfinished initdb uploads (test_non_uploaded_root_timeline_is_deleted_after_restart)
+                    continue;
+                };
+                timelines.push(timeline_id);
+            }
+            timelines
+        }
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => vec![],
+        Err(e) => return Err(anyhow::anyhow!(e)),
+    };
+    for timeline_id in timelines {
+        let timeline_path = &conf.timeline_path(&tenant_shard_id, &timeline_id);
+        let metadata_path = timeline_path.join(METADATA_FILE_NAME);
+        match std::fs::remove_file(&metadata_path) {
+            Ok(()) => {
+                crashsafe::fsync(timeline_path)
+                    .context("fsync timeline dir after removing legacy metadata file")?;
+                info!("removed legacy metadata file at {metadata_path}");
+            }
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                // something removed the file earlier, or it was never there
+                // We don't care, this software version doesn't write it again, so, we're good.
+            }
+            Err(e) => {
+                anyhow::bail!("remove legacy metadata file: {e}: {metadata_path}");
+            }
+        }
+    }
+
+    let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
+    if tenant_ignore_mark_file.exists() {
+        info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
+        return Ok(None);
+    }
+
    Ok(Some((
        tenant_shard_id,
        Tenant::load_tenant_config(conf, &tenant_shard_id),
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -614,7 +614,7 @@ impl RemoteTimelineClient {
            metadata,
        );
        let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
-        self.calls_unfinished_metric_begin(&op);
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
        upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;

@@ -654,7 +654,7 @@ impl RemoteTimelineClient {
            metadata.generation, metadata.shard
        );
        let op = UploadOp::UploadLayer(layer, metadata);
-        self.calls_unfinished_metric_begin(&op);
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }

@@ -823,10 +823,14 @@ impl RemoteTimelineClient {
        }

        // schedule the actual deletions
+        if with_metadata.is_empty() {
+            // avoid scheduling the op & bumping the metric
+            return;
+        }
        let op = UploadOp::Delete(Delete {
            layers: with_metadata,
        });
-        self.calls_unfinished_metric_begin(&op);
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }

@@ -1516,10 +1520,10 @@ impl RemoteTimelineClient {
                .await;
        }

-        self.calls_unfinished_metric_end(&task.op);
+        self.metric_end(&task.op);
    }

-    fn calls_unfinished_metric_impl(
+    fn metric_impl(
        &self,
        op: &UploadOp,
    ) -> Option<(
@@ -1556,17 +1560,17 @@ impl RemoteTimelineClient {
        Some(res)
    }

-    fn calls_unfinished_metric_begin(&self, op: &UploadOp) {
-        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
+    fn metric_begin(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
            Some(x) => x,
            None => return,
        };
        let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
-        guard.will_decrement_manually(); // in unfinished_ops_metric_end()
+        guard.will_decrement_manually(); // in metric_end(), see right below
    }

-    fn calls_unfinished_metric_end(&self, op: &UploadOp) {
-        let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) {
+    fn metric_end(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
            Some(x) => x,
            None => return,
        };
@@ -1651,7 +1655,7 @@ impl RemoteTimelineClient {

                // Tear down queued ops
                for op in qi.queued_operations.into_iter() {
-                    self.calls_unfinished_metric_end(&op);
+                    self.metric_end(&op);
                    // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
                    // which is exactly what we want to happen.
                    drop(op);
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -81,27 +81,14 @@ pub async fn download_layer_file<'a>(
                .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                .map_err(DownloadError::Other)?;

-            let download = storage
-                .download(&remote_path, cancel)
-                .await
-                .with_context(|| {
-                    format!(
-                        "open a download stream for layer with remote storage path '{remote_path:?}'"
-                    )
-                })
-                .map_err(DownloadError::Other)?;
+            let download = storage.download(&remote_path, cancel).await?;

            let mut destination_file =
                tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);

            let mut reader = tokio_util::io::StreamReader::new(download.download_stream);

-            let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file)
-                .await
-                .with_context(|| format!(
-                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
-                ))
-                .map_err(DownloadError::Other);
+            let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await;

            match bytes_amount {
                Ok(bytes_amount) => {
@@ -113,7 +100,7 @@ pub async fn download_layer_file<'a>(
                        on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
                    }

-                    Err(e)
+                    Err(e.into())
                }
            }
        },
@@ -251,10 +238,7 @@ async fn do_download_index_part(
            let stream = download.download_stream;
            let mut stream = StreamReader::new(stream);

-            tokio::io::copy_buf(&mut stream, &mut bytes)
-                .await
-                .with_context(|| format!("download index part at {remote_path:?}"))
-                .map_err(DownloadError::Other)?;
+            tokio::io::copy_buf(&mut stream, &mut bytes).await?;

            Ok(bytes)
        },
@@ -434,14 +418,7 @@ pub(crate) async fn download_initdb_tar_zst(
            let mut download = tokio_util::io::StreamReader::new(download.download_stream);
            let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file);

-            // TODO: this consumption of the response body should be subject to timeout + cancellation, but
-            // not without thinking carefully about how to recover safely from cancelling a write to
-            // local storage (e.g. by writing into a temp file as we do in download_layer)
-            // FIXME: flip the weird error wrapping
-            tokio::io::copy_buf(&mut download, &mut writer)
-                .await
-                .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
-                .map_err(DownloadError::Other)?;
+            tokio::io::copy_buf(&mut download, &mut writer).await?;

            let mut file = writer.into_inner();

--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -37,6 +37,7 @@ use crate::tenant::{
    remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
 };

+use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
 use pageserver_api::shard::TenantShardId;
@@ -44,7 +45,7 @@ use rand::Rng;
 use remote_storage::{DownloadError, GenericRemoteStorage};

 use tokio_util::sync::CancellationToken;
-use tracing::{info_span, instrument, Instrument};
+use tracing::{info_span, instrument, warn, Instrument};
 use utils::{
    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
 };
@@ -438,8 +439,14 @@ impl From<std::io::Error> for UpdateError {
    fn from(value: std::io::Error) -> Self {
        if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
            UpdateError::NoSpace
+        } else if value
+            .get_ref()
+            .and_then(|x| x.downcast_ref::<DownloadError>())
+            .is_some()
+        {
+            UpdateError::from(DownloadError::from(value))
        } else {
-            // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
+            // An I/O error from e.g. tokio::io::copy_buf is most likely a remote storage issue
            UpdateError::Other(anyhow::anyhow!(value))
        }
    }
@@ -484,14 +491,9 @@ impl<'a> TenantDownloader<'a> {
        let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
        let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
        let heatmap_path_bg = heatmap_path.clone();
-        tokio::task::spawn_blocking(move || {
-            tokio::runtime::Handle::current().block_on(async move {
-                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await
-            })
-        })
-        .await
-        .expect("Blocking task is never aborted")
-        .maybe_fatal_err(&context_msg)?;
+        VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes)
+            .await
+            .maybe_fatal_err(&context_msg)?;

        tracing::debug!("Wrote local heatmap to {}", heatmap_path);

@@ -672,20 +674,17 @@ impl<'a> TenantDownloader<'a> {
            .await
            {
                Ok(bytes) => bytes,
-                Err(e) => {
-                    if let DownloadError::NotFound = e {
-                        // A heatmap might be out of date and refer to a layer that doesn't exist any more.
-                        // This is harmless: continue to download the next layer. It is expected during compaction
-                        // GC.
-                        tracing::debug!(
-                            "Skipped downloading missing layer {}, raced with compaction/gc?",
-                            layer.name
-                        );
-                        continue;
-                    } else {
-                        return Err(e.into());
-                    }
+                Err(DownloadError::NotFound) => {
+                    // A heatmap might be out of date and refer to a layer that doesn't exist any more.
+                    // This is harmless: continue to download the next layer. It is expected during compaction
+                    // GC.
+                    tracing::debug!(
+                        "Skipped downloading missing layer {}, raced with compaction/gc?",
+                        layer.name
+                    );
+                    continue;
                }
+                Err(e) => return Err(e.into()),
            };

            if downloaded_bytes != layer.metadata.file_size {
@@ -775,19 +774,33 @@ async fn init_timeline_state(
        .await
        .fatal_err(&format!("Listing {timeline_path}"))
    {
-        let dentry_file_name = dentry.file_name();
-        let file_name = dentry_file_name.to_string_lossy();
-        let local_meta = dentry.metadata().await.fatal_err(&format!(
-            "Read metadata on {}",
-            dentry.path().to_string_lossy()
-        ));
+        let Ok(file_path) = Utf8PathBuf::from_path_buf(dentry.path()) else {
+            tracing::warn!("Malformed filename at {}", dentry.path().to_string_lossy());
+            continue;
+        };
+        let local_meta = dentry
+            .metadata()
+            .await
+            .fatal_err(&format!("Read metadata on {}", file_path));

-        // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+        let file_name = file_path.file_name().expect("created it from the dentry");
        if file_name == METADATA_FILE_NAME {
+            // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+            warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
+            continue;
+        } else if crate::is_temporary(&file_path) {
+            // Temporary files are frequently left behind from restarting during downloads
+            tracing::info!("Cleaning up temporary file {file_path}");
+            if let Err(e) = tokio::fs::remove_file(&file_path)
+                .await
+                .or_else(fs_ext::ignore_not_found)
+            {
+                tracing::error!("Failed to remove temporary file {file_path}: {e}");
+            }
            continue;
        }

-        match LayerFileName::from_str(&file_name) {
+        match LayerFileName::from_str(file_name) {
            Ok(name) => {
                let remote_meta = heatmap_metadata.get(&name);
                match remote_meta {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,15 +8,21 @@ pub(crate) mod layer;
 mod layer_desc;

 use crate::context::{AccessStatsBehavior, RequestContext};
+use crate::repository::Value;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
 use bytes::Bytes;
 use enum_map::EnumMap;
 use enumset::EnumSet;
 use once_cell::sync::Lazy;
+use pageserver_api::key::Key;
+use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::models::{
    LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
+use std::cmp::{Ordering, Reverse};
+use std::collections::hash_map::Entry;
+use std::collections::{BinaryHeap, HashMap};
 use std::ops::Range;
 use std::sync::Mutex;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
@@ -34,6 +40,11 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};

 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};

+use super::layer_map::InMemoryLayerHandle;
+use super::timeline::layer_manager::LayerManager;
+use super::timeline::GetVectoredError;
+use super::PageReconstructError;
+
 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
    T: PartialOrd<T>,
@@ -67,6 +78,277 @@ pub struct ValueReconstructState {
    pub img: Option<(Lsn, Bytes)>,
 }

+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
+pub(crate) enum ValueReconstructSituation {
+    Complete, // set by `update_key` once no further values are needed for the key
+    #[default]
+    Continue, // default state: more values may still be collected for the key
+}
+
+/// Reconstruct data accumulated for a single key during a vectored get
+#[derive(Debug, Default, Clone)]
+pub(crate) struct VectoredValueReconstructState {
+    pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
+    pub(crate) img: Option<(Lsn, Bytes)>,
+
+    situation: ValueReconstructSituation,
+}
+
+impl VectoredValueReconstructState {
+    fn get_cached_lsn(&self) -> Option<Lsn> {
+        self.img.as_ref().map(|img| img.0)
+    }
+}
+
+impl From<VectoredValueReconstructState> for ValueReconstructState {
+    fn from(mut state: VectoredValueReconstructState) -> Self {
+        // walredo expects the records to be descending in terms of Lsn
+        state.records.sort_by_key(|(lsn, _)| Reverse(*lsn));
+
+        ValueReconstructState {
+            records: state.records,
+            img: state.img,
+        }
+    }
+}
+
+/// Bag of data accumulated during a vectored get
+pub(crate) struct ValuesReconstructState {
+    pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
+
+    keys_done: KeySpaceRandomAccum,
+}
+
+impl ValuesReconstructState {
+    pub(crate) fn new() -> Self {
+        Self {
+            keys: HashMap::new(),
+            keys_done: KeySpaceRandomAccum::new(),
+        }
+    }
+
+    /// Record the error encountered for a key; a key with in-progress state is also marked as done
+    pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) {
+        let previous = self.keys.insert(key, Err(err));
+        if let Some(Ok(state)) = previous { // NOTE(review): a key with no prior `Ok` state is not added to `keys_done` - confirm intended
+            if state.situation == ValueReconstructSituation::Continue {
+                self.keys_done.add_key(key);
+            }
+        }
+    }
+
+    /// Update the state collected for a given key.
+    /// Returns `Complete` if this was the last value needed for the key and `Continue` otherwise.
+    /// A key which already has an error recorded is left untouched and reported as `Complete`.
+    /// If the key is done after the update, mark it as such.
+    pub(crate) fn update_key(
+        &mut self,
+        key: &Key,
+        lsn: Lsn,
+        value: Value,
+    ) -> ValueReconstructSituation {
+        let state = self
+            .keys
+            .entry(*key)
+            .or_insert(Ok(VectoredValueReconstructState::default()));
+
+        if let Ok(state) = state {
+            let key_done = match state.situation {
+                ValueReconstructSituation::Complete => unreachable!(), // panics if an already complete key is updated again
+                ValueReconstructSituation::Continue => match value {
+                    Value::Image(img) => {
+                        state.img = Some((lsn, img));
+                        true
+                    }
+                    Value::WalRecord(rec) => {
+                        let reached_cache = // true when `lsn` is exactly one past the cached image's Lsn
+                            state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn);
+                        let will_init = rec.will_init();
+                        state.records.push((lsn, rec));
+                        will_init || reached_cache
+                    }
+                },
+            };
+
+            if key_done && state.situation == ValueReconstructSituation::Continue {
+                state.situation = ValueReconstructSituation::Complete;
+                self.keys_done.add_key(*key);
+            }
+
+            state.situation
+        } else {
+            ValueReconstructSituation::Complete // an error is already recorded for this key
+        }
+    }
+
+    /// Returns the Lsn at which this key is cached if one exists.
+    /// The read path should go no further than this Lsn for the given key.
+    pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option<Lsn> {
+        self.keys
+            .get(key)
+            .and_then(|k| k.as_ref().ok())
+            .and_then(|state| state.get_cached_lsn())
+    }
+
+    /// Returns the key space describing the keys that have
+    /// been marked as completed since the last call to this function.
+    pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
+        self.keys_done.consume_keyspace()
+    }
+}
+
+impl Default for ValuesReconstructState {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Description of layer to be read - the layer map can turn
+/// this description into the actual layer.
+#[derive(PartialEq, Eq, Hash, Debug, Clone)]
+pub(crate) enum ReadableLayerDesc {
+    Persistent {
+        desc: PersistentLayerDesc,
+        lsn_floor: Lsn,
+        lsn_ceil: Lsn,
+    },
+    InMemory {
+        handle: InMemoryLayerHandle,
+        lsn_ceil: Lsn,
+    },
+}
+
+/// Wrapper for 'ReadableLayerDesc' sorted by Lsn
+#[derive(Debug)]
+struct ReadableLayerDescOrdered(ReadableLayerDesc);
+
+/// Data structure which maintains a fringe of layers for the
+/// read path. The fringe is the set of layers which intersects
+/// the current keyspace that the search is descending on.
+/// Each layer tracks the keyspace that intersects it.
+///
+/// The fringe must appear sorted by Lsn. Hence, it uses
+/// a two layer indexing scheme.
+#[derive(Debug)]
+pub(crate) struct LayerFringe {
+    layers_by_lsn: BinaryHeap<ReadableLayerDescOrdered>,
+    layers: HashMap<ReadableLayerDesc, KeySpace>,
+}
+
+impl LayerFringe {
+    /// Create an empty fringe.
+    pub(crate) fn new() -> Self {
+        LayerFringe {
+            layers_by_lsn: BinaryHeap::new(),
+            layers: HashMap::new(),
+        }
+    }
+
+    /// Pop the greatest entry of the Lsn-ordered heap and return it together
+    /// with the keyspace accumulated for it; `None` once the fringe is empty.
+    pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> {
+        let handle = self.layers_by_lsn.pop()?;
+        // `update` pushes to the heap only when it also inserts into the map.
+        match self.layers.remove_entry(&handle.0) {
+            Some(entry) => Some(entry),
+            None => unreachable!("fringe internals are always consistent"),
+        }
+    }
+
+    /// Add `keyspace` to the fringe for `layer`: merge it into the keyspace
+    /// already tracked for that layer, or start tracking the layer.
+    pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) {
+        match self.layers.entry(layer) {
+            Entry::Occupied(mut entry) => {
+                entry.get_mut().merge(&keyspace);
+            }
+            Entry::Vacant(entry) => {
+                self.layers_by_lsn
+                    .push(ReadableLayerDescOrdered(entry.key().clone()));
+                entry.insert(keyspace);
+            }
+        }
+    }
+}
+
+impl Default for LayerFringe {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Ord for ReadableLayerDescOrdered {
+    fn cmp(&self, other: &Self) -> Ordering {
+        let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil()); // primary key: the Lsn ceiling
+        if ord == std::cmp::Ordering::Equal {
+            self.0
+                .get_lsn_floor()
+                .cmp(&other.0.get_lsn_floor())
+                .reverse() // tie-break: the lower Lsn floor compares as greater
+        } else {
+            ord
+        }
+    }
+}
+
+impl PartialOrd for ReadableLayerDescOrdered {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl PartialEq for ReadableLayerDescOrdered {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.get_lsn_floor() == other.0.get_lsn_floor()
+            && self.0.get_lsn_ceil() == other.0.get_lsn_ceil()
+    }
+}
+
+impl Eq for ReadableLayerDescOrdered {}
+
+impl ReadableLayerDesc {
+    pub(crate) fn get_lsn_floor(&self) -> Lsn {
+        match self {
+            ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor,
+            ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
+        }
+    }
+
+    pub(crate) fn get_lsn_ceil(&self) -> Lsn {
+        match self {
+            ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil,
+            ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
+        }
+    }
+
+    pub(crate) async fn get_values_reconstruct_data(
+        &self,
+        layer_manager: &LayerManager,
+        keyspace: KeySpace,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        match self {
+            ReadableLayerDesc::Persistent { desc, lsn_ceil, .. } => {
+                let layer = layer_manager.get_from_desc(desc);
+                layer
+                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .await
+            }
+            ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
+                let layer = layer_manager
+                    .layer_map()
+                    .get_in_memory_layer(handle)
+                    .unwrap();
+
+                layer
+                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .await
+            }
+        }
+    }
+}
+
 /// Return value from [`Layer::get_value_reconstruct_data`]
 #[derive(Clone, Copy, Debug)]
 pub enum ValueReconstructResult {
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -35,16 +35,19 @@ use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
-use crate::tenant::Timeline;
+use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
+use std::collections::BTreeMap;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
@@ -59,7 +62,10 @@ use utils::{
    lsn::Lsn,
 };

-use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
+use super::{
+    AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValueReconstructSituation,
+    ValuesReconstructState,
+};

 ///
 /// Header stored in the beginning of the file
@@ -818,6 +824,133 @@ impl DeltaLayerInner {
        }
    }

+    // Look up the keys in the provided keyspace and update
+    // the reconstruct state with whatever is found.
+    //
+    // If the key is cached, go no further than the cached Lsn.
+    //
+    // Currently, the index is visited for each range, but this
+    // can be further optimised to visit the index only once.
+    pub(super) async fn get_values_reconstruct_data(
+        &self,
+        keyspace: KeySpace,
+        end_lsn: Lsn,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        let file = &self.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            file,
+        );
+
+        let mut offsets: BTreeMap<Key, Vec<(Lsn, u64)>> = BTreeMap::new();
+
+        for range in keyspace.ranges.iter() {
+            let mut ignore_key = None;
+
+            // Scan the page versions backwards, starting from the last key in the range,
+            // to collect all the offsets which need to be read.
+            let end_key = DeltaKey::from_key_lsn(&range.end, Lsn(end_lsn.0 - 1));
+            tree_reader
+                .visit(
+                    &end_key.0,
+                    VisitDirection::Backwards,
+                    |raw_key, value| {
+                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+                        let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key);
+
+                        if entry_lsn >= end_lsn {
+                            return true;
+                        }
+
+                        if key < range.start {
+                            return false;
+                        }
+
+                        if key >= range.end {
+                            return true;
+                        }
+
+                        if Some(key) == ignore_key {
+                            return true;
+                        }
+
+                        if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) {
+                            if entry_lsn <= cached_lsn {
+                                return key != range.start;
+                            }
+                        }
+
+                        let blob_ref = BlobRef(value);
+                        let lsns_at = offsets.entry(key).or_default();
+                        lsns_at.push((entry_lsn, blob_ref.pos()));
+
+                        if blob_ref.will_init() {
+                            if key == range.start {
+                                return false;
+                            } else {
+                                ignore_key = Some(key);
+                                return true;
+                            }
+                        }
+
+                        true
+                    },
+                    &RequestContextBuilder::extend(ctx)
+                        .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
+                        .build(),
+                )
+                .await
+                .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
+        }
+
+        let ctx = &RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::DeltaLayerValue)
+            .build();
+
+        let cursor = file.block_cursor();
+        let mut buf = Vec::new();
+        for (key, lsns_at) in offsets {
+            for (lsn, block_offset) in lsns_at {
+                let res = cursor.read_blob_into_buf(block_offset, &mut buf, ctx).await;
+
+                if let Err(e) = res {
+                    reconstruct_state.on_key_error(
+                        key,
+                        PageReconstructError::from(anyhow!(e).context(format!(
+                            "Failed to read blob from virtual file {}",
+                            file.file.path
+                        ))),
+                    );
+
+                    break;
+                }
+
+                let value = Value::des(&buf);
+                if let Err(e) = value {
+                    reconstruct_state.on_key_error(
+                        key,
+                        PageReconstructError::from(anyhow!(e).context(format!(
+                            "Failed to deserialize file blob from virtual file {}",
+                            file.file.path
+                        ))),
+                    );
+
+                    break;
+                }
+
+                let key_situation = reconstruct_state.update_key(&key, lsn, value.unwrap());
+                if key_situation == ValueReconstructSituation::Complete {
+                    break;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
    pub(super) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -26,20 +26,22 @@
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::page_cache::PAGE_SZ;
-use crate::repository::{Key, KEY_SIZE};
+use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
-use crate::tenant::Timeline;
+use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
-use anyhow::{bail, ensure, Context, Result};
+use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
@@ -59,7 +61,7 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
+use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer, ValuesReconstructState};

 ///
 /// Header stored in the beginning of the file
@@ -438,6 +440,74 @@ impl ImageLayerInner {
            Ok(ValueReconstructResult::Missing)
        }
    }
+
+    // Look up the keys in the provided keyspace and update
+    // the reconstruct state with whatever is found.
+    pub(super) async fn get_values_reconstruct_data(
+        &self,
+        keyspace: KeySpace,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        let file = &self.file;
+        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
+
+        let mut offsets = Vec::new();
+
+        for range in keyspace.ranges.iter() {
+            let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+            range.start.write_to_byte_slice(&mut search_key);
+
+            tree_reader
+                .visit(
+                    &search_key,
+                    VisitDirection::Forwards,
+                    |raw_key, value| {
+                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+                        assert!(key >= range.start);
+
+                        if !range.contains(&key) {
+                            return false;
+                        }
+
+                        offsets.push((key, value));
+
+                        true
+                    },
+                    &RequestContextBuilder::extend(ctx)
+                        .page_content_kind(PageContentKind::ImageLayerBtreeNode)
+                        .build(),
+                )
+                .await
+                .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
+        }
+
+        let ctx = &RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::ImageLayerValue)
+            .build();
+
+        let cursor = file.block_cursor();
+        let mut buf = Vec::new();
+        for (key, offset) in offsets {
+            let res = cursor.read_blob_into_buf(offset, &mut buf, ctx).await;
+            if let Err(e) = res {
+                reconstruct_state.on_key_error(
+                    key,
+                    PageReconstructError::from(anyhow!(e).context(format!(
+                        "Failed to read blob from virtual file {}",
+                        file.file.path
+                    ))),
+                );
+
+                continue;
+            }
+
+            let blob = Bytes::copy_from_slice(buf.as_slice());
+            reconstruct_state.update_key(&key, self.lsn, Value::Image(blob));
+        }
+
+        Ok(())
+    }
 }

 /// A builder object for constructing a new image layer.
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -9,13 +9,15 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::repository::{Key, Value};
 use crate::tenant::block_io::BlockReader;
 use crate::tenant::ephemeral_file::EphemeralFile;
-use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
-use crate::tenant::Timeline;
+use crate::tenant::storage_layer::ValueReconstructResult;
+use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::{PageReconstructError, Timeline};
 use crate::walrecord;
-use anyhow::{ensure, Result};
+use anyhow::{anyhow, ensure, Result};
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
-use std::collections::HashMap;
+use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::sync::{Arc, OnceLock};
 use tracing::*;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
@@ -25,7 +27,10 @@ use std::fmt::Write as _;
 use std::ops::Range;
 use tokio::sync::{RwLock, RwLockWriteGuard};

-use super::{DeltaLayerWriter, ResidentLayer};
+use super::{
+    DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState,
+    ValuesReconstructState,
+};

 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
@@ -202,6 +207,91 @@ impl InMemoryLayer {
            Ok(ValueReconstructResult::Complete)
        }
    }
+
+    // Look up the keys in the provided keyspace and update
+    // the reconstruct state with whatever is found.
+    //
+    // If the key is cached, go no further than the cached Lsn.
+    pub(crate) async fn get_values_reconstruct_data(
+        &self,
+        keyspace: KeySpace,
+        end_lsn: Lsn,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        let ctx = RequestContextBuilder::extend(ctx)
+            .page_content_kind(PageContentKind::InMemoryLayer)
+            .build();
+
+        let inner = self.inner.read().await;
+        let reader = inner.file.block_cursor();
+
+        #[derive(Eq, PartialEq, Ord, PartialOrd)]
+        struct BlockRead {
+            key: Key,
+            lsn: Lsn,
+            block_offset: u64,
+        }
+
+        let mut planned_block_reads = BinaryHeap::new();
+
+        for range in keyspace.ranges.iter() {
+            let mut key = range.start;
+            while key < range.end {
+                if let Some(vec_map) = inner.index.get(&key) {
+                    let lsn_range = match reconstruct_state.get_cached_lsn(&key) {
+                        Some(cached_lsn) => (cached_lsn + 1)..end_lsn,
+                        None => self.start_lsn..end_lsn,
+                    };
+
+                    let slice = vec_map.slice_range(lsn_range);
+                    for (entry_lsn, pos) in slice.iter().rev() {
+                        planned_block_reads.push(BlockRead {
+                            key,
+                            lsn: *entry_lsn,
+                            block_offset: *pos,
+                        });
+                    }
+                }
+
+                key = key.next();
+            }
+        }
+
+        let keyspace_size = keyspace.total_size();
+
+        let mut completed_keys = HashSet::new();
+        while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
+            let block_read = planned_block_reads.pop().unwrap();
+            if completed_keys.contains(&block_read.key) {
+                continue;
+            }
+
+            let buf = reader.read_blob(block_read.block_offset, &ctx).await;
+            if let Err(e) = buf {
+                reconstruct_state
+                    .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
+                completed_keys.insert(block_read.key);
+                continue;
+            }
+
+            let value = Value::des(&buf.unwrap());
+            if let Err(e) = value {
+                reconstruct_state
+                    .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
+                completed_keys.insert(block_read.key);
+                continue;
+            }
+
+            let key_situation =
+                reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap());
+            if key_situation == ValueReconstructSituation::Complete {
+                completed_keys.insert(block_read.key);
+            }
+        }
+
+        Ok(())
+    }
 }

 impl std::fmt::Display for InMemoryLayer {
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1,5 +1,6 @@
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::{
    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
 };
@@ -16,13 +17,14 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::Key;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

 use super::delta_layer::{self, DeltaEntry};
 use super::image_layer;
 use super::{
    AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc,
-    ValueReconstructResult, ValueReconstructState,
+    ValueReconstructResult, ValueReconstructState, ValuesReconstructState,
 };

 use utils::generation::Generation;
@@ -262,6 +264,29 @@ impl Layer {
            .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
    }

+    pub(crate) async fn get_values_reconstruct_data(
+        &self,
+        keyspace: KeySpace,
+        end_lsn: Lsn,
+        reconstruct_data: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        let layer = self
+            .0
+            .get_or_maybe_download(true, Some(ctx))
+            .await
+            .map_err(|err| GetVectoredError::Other(anyhow::anyhow!(err)))?;
+
+        self.0
+            .access_stats
+            .record_access(LayerAccessKind::GetValueReconstructData, ctx);
+
+        layer
+            .get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, &self.0, ctx)
+            .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
+            .await
+    }
+
    /// Download the layer if evicted.
    ///
    /// Will not error when the layer is already downloaded.
@@ -1177,7 +1202,7 @@ pub(crate) enum EvictionError {

 /// Error internal to the [`LayerInner::get_or_maybe_download`]
 #[derive(Debug, thiserror::Error)]
-enum DownloadError {
+pub(crate) enum DownloadError {
    #[error("timeline has already shutdown")]
    TimelineShutdown,
    #[error("no remote storage configured")]
@@ -1337,6 +1362,28 @@ impl DownloadedLayer {
        }
    }

+    async fn get_values_reconstruct_data(
+        &self,
+        keyspace: KeySpace,
+        end_lsn: Lsn,
+        reconstruct_data: &mut ValuesReconstructState,
+        owner: &Arc<LayerInner>,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        use LayerKind::*;
+
+        match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
+            Delta(d) => {
+                d.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, ctx)
+                    .await
+            }
+            Image(i) => {
+                i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx)
+                    .await
+            }
+        }
+    }
+
    async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
        use LayerKind::*;
        match self.get(owner, ctx).await? {
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -15,7 +15,7 @@ use utils::id::TenantId;
 /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
 /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)]
 pub struct PersistentLayerDesc {
    pub tenant_shard_id: TenantShardId,
    pub timeline_id: TimelineId,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -16,7 +16,7 @@ use futures::stream::StreamExt;
 use itertools::Itertools;
 use once_cell::sync::Lazy;
 use pageserver_api::{
-    keyspace::{key_range_size, KeySpaceAccum},
+    keyspace::KeySpaceAccum,
    models::{
        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
        LayerMapInfo, TimelineState,
@@ -54,7 +54,7 @@ use crate::pgdatadir_mapping::DirectoryKind;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
-    metadata::{save_metadata, TimelineMetadata},
+    metadata::TimelineMetadata,
    par_fsync,
 };
 use crate::{
@@ -67,7 +67,7 @@ use crate::{
    tenant::storage_layer::{
        AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
        LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
-        ValueReconstructState,
+        ValueReconstructState, ValuesReconstructState,
    },
 };
 use crate::{
@@ -76,7 +76,7 @@ use crate::{
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};

 use crate::config::PageServerConf;
-use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
+use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::{
    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
@@ -111,11 +111,11 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

-use super::config::TenantConf;
-use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
+use super::{config::TenantConf, storage_layer::ReadableLayerDesc};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
+use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
@@ -210,17 +210,6 @@ pub struct Timeline {
    /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
    pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,

-    /// Set of key ranges which should be covered by image layers to
-    /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
-    /// It is used by compaction task when it checks if new image layer should be created.
-    /// Newly created image layer doesn't help to remove the delta layer, until the
-    /// newly created image layer falls off the PITR horizon. So on next GC cycle,
-    /// gc_timeline may still want the new image layer to be created. To avoid redundant
-    /// image layers creation we should check if image layer exists but beyond PITR horizon.
-    /// This is why we need remember GC cutoff LSN.
-    ///
-    wanted_image_layers: Mutex<Option<(Lsn, KeySpace)>>,
-
    last_freeze_at: AtomicLsn,
    // Atomic would be more appropriate here.
    last_freeze_ts: RwLock<Instant>,
@@ -303,7 +292,7 @@ pub struct Timeline {
    pub initdb_lsn: Lsn,

    /// When did we last calculate the partitioning?
-    partitioning: Mutex<(KeyPartitioning, Lsn)>,
+    partitioning: tokio::sync::Mutex<(KeyPartitioning, Lsn)>,

    /// Configuration: how often should the partitioning be recalculated.
    repartition_threshold: u64,
@@ -345,7 +334,7 @@ pub struct Timeline {
    ///
    /// Must only be taken in two places:
    /// - [`Timeline::compact`] (this file)
-    /// - [`delete::delete_local_layer_files`]
+    /// - [`delete::delete_local_timeline_directory`]
    ///
    /// Timeline deletion will acquire both compaction and gc locks in whatever order.
    compaction_lock: tokio::sync::Mutex<()>,
@@ -354,7 +343,7 @@ pub struct Timeline {
    ///
    /// Must only be taken in two places:
    /// - [`Timeline::gc`] (this file)
-    /// - [`delete::delete_local_layer_files`]
+    /// - [`delete::delete_local_timeline_directory`]
    ///
    /// Timeline deletion will acquire both compaction and gc locks in whatever order.
    gc_lock: tokio::sync::Mutex<()>,
@@ -472,6 +461,15 @@ pub(crate) enum GetVectoredError {

    #[error("Requested at invalid LSN: {0}")]
    InvalidLsn(Lsn),
+
+    #[error("Requested key {0} not found")]
+    MissingKey(Key),
+
+    #[error(transparent)]
+    GetReadyAncestorError(GetReadyAncestorError),
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
 }

 #[derive(thiserror::Error, Debug)]
@@ -579,6 +577,23 @@ impl From<GetReadyAncestorError> for PageReconstructError {
    }
 }

+#[derive(
+    Eq,
+    PartialEq,
+    Debug,
+    Copy,
+    Clone,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+pub enum GetVectoredImpl {
+    Sequential,
+    Vectored,
+}
+
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -708,7 +723,7 @@ impl Timeline {
    /// which actually vectorizes the read path.
    pub(crate) async fn get_vectored(
        &self,
-        key_ranges: &[Range<Key>],
+        keyspace: KeySpace,
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
@@ -716,10 +731,7 @@ impl Timeline {
            return Err(GetVectoredError::InvalidLsn(lsn));
        }

-        let key_count = key_ranges
-            .iter()
-            .map(|range| key_range_size(range) as u64)
-            .sum();
+        let key_count = keyspace.total_size().try_into().unwrap();
        if key_count > Timeline::MAX_GET_VECTORED_KEYS {
            return Err(GetVectoredError::Oversized(key_count));
        }
@@ -728,33 +740,163 @@ impl Timeline {
            .throttle(ctx, key_count as usize)
            .await;

-        let _timer = crate::metrics::GET_VECTORED_LATENCY
-            .for_task_kind(ctx.task_kind())
-            .map(|t| t.start_timer());
-
-        let mut values = BTreeMap::new();
-        for range in key_ranges {
+        for range in &keyspace.ranges {
            let mut key = range.start;
            while key != range.end {
                assert!(!self.shard_identity.is_key_disposable(&key));
-
-                let block = self.get(key, lsn, ctx).await;
-
-                if matches!(
-                    block,
-                    Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
-                ) {
-                    return Err(GetVectoredError::Cancelled);
-                }
-
-                values.insert(key, block);
                key = key.next();
            }
        }

+        trace!(
+            "get vectored request for {:?}@{} from task kind {:?} will use {} implementation",
+            keyspace,
+            lsn,
+            ctx.task_kind(),
+            self.conf.get_vectored_impl
+        );
+
+        let _timer = crate::metrics::GET_VECTORED_LATENCY
+            .for_task_kind(ctx.task_kind())
+            .map(|t| t.start_timer());
+
+        match self.conf.get_vectored_impl {
+            GetVectoredImpl::Sequential => {
+                self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
+            }
+            GetVectoredImpl::Vectored => {
+                let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await;
+
+                self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
+                    .await;
+
+                vectored_res
+            }
+        }
+    }
+
+    pub(super) async fn get_vectored_sequential_impl(
+        &self,
+        keyspace: KeySpace,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        let mut values = BTreeMap::new();
+        for range in keyspace.ranges {
+            let mut key = range.start;
+            while key != range.end {
+                let block = self.get(key, lsn, ctx).await;
+
+                use PageReconstructError::*;
+                match block {
+                    Err(Cancelled | AncestorStopping(_)) => {
+                        return Err(GetVectoredError::Cancelled)
+                    }
+                    Err(Other(err)) if err.to_string().contains("could not find data for key") => {
+                        return Err(GetVectoredError::MissingKey(key))
+                    }
+                    _ => {
+                        values.insert(key, block);
+                        key = key.next();
+                    }
+                }
+            }
+        }
+
        Ok(values)
    }

+    pub(super) async fn get_vectored_impl(
+        &self,
+        keyspace: KeySpace,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        let mut reconstruct_state = ValuesReconstructState::new();
+
+        self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
+            .await?;
+
+        let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
+        for (key, res) in reconstruct_state.keys {
+            match res {
+                Err(err) => {
+                    results.insert(key, Err(err));
+                }
+                Ok(state) => {
+                    let state = ValueReconstructState::from(state);
+
+                    let reconstruct_res = self.reconstruct_value(key, lsn, state).await;
+                    results.insert(key, reconstruct_res);
+                }
+            }
+        }
+
+        Ok(results)
+    }
+
+    pub(super) async fn validate_get_vectored_impl(
+        &self,
+        vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>,
+        keyspace: KeySpace,
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) {
+        let sequential_res = self
+            .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx)
+            .await;
+
+        fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool {
+            use GetVectoredError::*;
+            match (lhs, rhs) {
+                (Cancelled, Cancelled) => true,
+                (_, Cancelled) => true,
+                (Oversized(l), Oversized(r)) => l == r,
+                (InvalidLsn(l), InvalidLsn(r)) => l == r,
+                (MissingKey(l), MissingKey(r)) => l == r,
+                (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true,
+                (Other(_), Other(_)) => true,
+                _ => false,
+            }
+        }
+
+        match (&sequential_res, vectored_res) {
+            (Err(seq_err), Ok(_)) => {
+                panic!(concat!("Sequential get failed with {}, but vectored get did not",
+                               " - keyspace={:?} lsn={}"),
+                       seq_err, keyspace, lsn) },
+            (Ok(_), Err(vec_err)) => {
+                panic!(concat!("Vectored get failed with {}, but sequential get did not",
+                               " - keyspace={:?} lsn={}"),
+                       vec_err, keyspace, lsn) },
+            (Err(seq_err), Err(vec_err)) => {
+                assert!(errors_match(seq_err, vec_err),
+                        "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")},
+            (Ok(seq_values), Ok(vec_values)) => {
+                seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| {
+                    assert_eq!(seq_key, vec_key);
+                    match (seq_res, vec_res) {
+                        (Ok(seq_blob), Ok(vec_blob)) => {
+                            assert_eq!(seq_blob, vec_blob,
+                                       "Image mismatch for key {seq_key} - keyspace={keyspace:?} lsn={lsn}");
+                        },
+                        (Err(err), Ok(_)) => {
+                            panic!(
+                                concat!("Sequential get failed with {} for key {}, but vectored get did not",
+                                        " - keyspace={:?} lsn={}"),
+                                err, seq_key, keyspace, lsn) },
+                        (Ok(_), Err(err)) => {
+                            panic!(
+                                concat!("Vectored get failed with {} for key {}, but sequential get did not",
+                                        " - keyspace={:?} lsn={}"),
+                                err, seq_key, keyspace, lsn) },
+                        (Err(_), Err(_)) => {}
+                    }
+                })
+            }
+        }
+    }
+
    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
    pub(crate) fn get_last_record_lsn(&self) -> Lsn {
        self.last_record_lsn.load().last
@@ -1363,13 +1505,6 @@ impl Timeline {
            .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
    }

-    fn get_gc_feedback(&self) -> bool {
-        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf.clone();
-        tenant_conf
-            .gc_feedback
-            .unwrap_or(self.conf.default_tenant_conf.gc_feedback)
-    }
-
    pub(super) fn tenant_conf_updated(&self) {
        // NB: Most tenant conf options are read by background loops, so,
        // changes will automatically be picked up.
@@ -1443,7 +1578,6 @@ impl Timeline {
                shard_identity,
                pg_version,
                layers: Default::default(),
-                wanted_image_layers: Mutex::new(None),

                walredo_mgr,
                walreceiver: Mutex::new(None),
@@ -1506,7 +1640,7 @@ impl Timeline {
                    // initial logical size is 0.
                    LogicalSize::empty_initial()
                },
-                partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
+                partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
                repartition_threshold: 0,

                last_received_wal: Mutex::new(None),
@@ -1692,7 +1826,11 @@ impl Timeline {
                            discovered_layers.push((file_name, file_size));
                            continue;
                        }
-                        Discovered::Metadata | Discovered::IgnoredBackup => {
+                        Discovered::Metadata => {
+                            warn!("found legacy metadata file, these should have been removed in load_tenant_config");
+                            continue;
+                        }
+                        Discovered::IgnoredBackup => {
                            continue;
                        }
                        Discovered::Unknown(file_name) => {
@@ -2199,7 +2337,7 @@ impl Timeline {
        fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| {
            if !self
                .conf
-                .metadata_path(&self.tenant_shard_id, &self.timeline_id)
+                .timeline_path(&self.tenant_shard_id, &self.timeline_id)
                .exists()
            {
                error!("timeline-calculate-logical-size-pre metadata file does not exist")
@@ -2547,6 +2685,170 @@ impl Timeline {
        }
    }

+    /// Get the data needed to reconstruct all keys in the provided keyspace
+    ///
+    /// The algorithm is as follows:
+    /// 1.   While some keys are still not done and there's a timeline to visit:
+    /// 2.   Visit the timeline (see [`Timeline::get_vectored_reconstruct_data_timeline`]):
+    /// 2.1. Build the fringe for the current keyspace
+    /// 2.2. Visit the newest layer from the fringe to collect all values for the range it
+    ///      intersects
+    /// 2.3. Pop the timeline from the fringe
+    /// 2.4. If the fringe is empty, go back to 1
+    async fn get_vectored_reconstruct_data(
+        &self,
+        mut keyspace: KeySpace,
+        request_lsn: Lsn,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<(), GetVectoredError> {
+        let mut timeline_owned: Arc<Timeline>;
+        let mut timeline = self;
+
+        let mut cont_lsn = Lsn(request_lsn.0 + 1);
+
+        loop {
+            if self.cancel.is_cancelled() {
+                return Err(GetVectoredError::Cancelled);
+            }
+
+            let completed = Self::get_vectored_reconstruct_data_timeline(
+                timeline,
+                keyspace.clone(),
+                cont_lsn,
+                reconstruct_state,
+                &self.cancel,
+                ctx,
+            )
+            .await?;
+
+            keyspace.remove_overlapping_with(&completed);
+            if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() {
+                break;
+            }
+
+            cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
+            timeline_owned = timeline
+                .get_ready_ancestor_timeline(ctx)
+                .await
+                .map_err(GetVectoredError::GetReadyAncestorError)?;
+            timeline = &*timeline_owned;
+        }
+
+        if keyspace.total_size() != 0 {
+            return Err(GetVectoredError::MissingKey(keyspace.start().unwrap()));
+        }
+
+        Ok(())
+    }
+
+    /// Collect the reconstruct data for a keyspace from the specified timeline.
+    ///
+    /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
+    /// the current keyspace. The current keyspace of the search at any given timeline
+    /// is the original keyspace minus all the keys that have been completed minus
+    /// any keys for which we couldn't find an intersecting layer. It's not tracked explicitly,
+    /// but if you merge all the keyspaces in the fringe, you get the "current keyspace".
+    ///
+    /// This is basically a depth-first search visitor implementation where a vertex
+    /// is the (layer, lsn range, key space) tuple. The fringe acts as the stack.
+    ///
+    /// At each iteration pop the top of the fringe (the layer with the highest Lsn)
+    /// and get all the required reconstruct data from the layer in one go.
+    async fn get_vectored_reconstruct_data_timeline(
+        timeline: &Timeline,
+        keyspace: KeySpace,
+        mut cont_lsn: Lsn,
+        reconstruct_state: &mut ValuesReconstructState,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<KeySpace, GetVectoredError> {
+        let mut unmapped_keyspace = keyspace.clone();
+        let mut fringe = LayerFringe::new();
+
+        let mut completed_keyspace = KeySpace::default();
+
+        // Hold the layer map whilst visiting the timeline to prevent
+        // compaction, eviction and flushes from rendering the layers unreadable.
+        //
+        // TODO: Do we actually need to do this? In theory holding on
+        // to [`tenant::storage_layer::Layer`] should be enough. However,
+        // [`Timeline::get`] also holds the lock during IO, so more investigation
+        // is needed.
+        let guard = timeline.layers.read().await;
+        let layers = guard.layer_map();
+
+        'outer: loop {
+            if cancel.is_cancelled() {
+                return Err(GetVectoredError::Cancelled);
+            }
+
+            let keys_done_last_step = reconstruct_state.consume_done_keys();
+            unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
+            completed_keyspace.merge(&keys_done_last_step);
+
+            let in_memory_layer = layers.find_in_memory_layer(|l| {
+                let start_lsn = l.get_lsn_range().start;
+                cont_lsn > start_lsn
+            });
+
+            match in_memory_layer {
+                Some(l) => {
+                    fringe.update(
+                        ReadableLayerDesc::InMemory {
+                            handle: l,
+                            lsn_ceil: cont_lsn,
+                        },
+                        unmapped_keyspace.clone(),
+                    );
+                }
+                None => {
+                    for range in unmapped_keyspace.ranges.iter() {
+                        let results = match layers.range_search(range.clone(), cont_lsn) {
+                            Some(res) => res,
+                            None => {
+                                break 'outer;
+                            }
+                        };
+
+                        results
+                            .found
+                            .into_iter()
+                            .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                                (
+                                    ReadableLayerDesc::Persistent {
+                                        desc: (*layer).clone(),
+                                        lsn_floor,
+                                        lsn_ceil: cont_lsn,
+                                    },
+                                    keyspace_accum.to_keyspace(),
+                                )
+                            })
+                            .for_each(|(layer, keyspace)| fringe.update(layer, keyspace));
+                    }
+                }
+            }
+
+            if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() {
+                layer_to_read
+                    .get_values_reconstruct_data(
+                        &guard,
+                        keyspace_to_read.clone(),
+                        reconstruct_state,
+                        ctx,
+                    )
+                    .await?;
+
+                unmapped_keyspace = keyspace_to_read;
+                cont_lsn = layer_to_read.get_lsn_floor();
+            } else {
+                break;
+            }
+        }
+
+        Ok(completed_keyspace)
+    }
+
    /// # Cancel-safety
    ///
    /// This method is cancellation-safe.
@@ -2890,7 +3192,7 @@ impl Timeline {
        // The new on-disk layers are now in the layer map. We can remove the
        // in-memory layer from the map now. The flushed layer is stored in
        // the mapping in `create_delta_layer`.
-        let metadata = {
+        {
            let mut guard = self.layers.write().await;

            if self.cancel.is_cancelled() {
@@ -2904,9 +3206,7 @@ impl Timeline {
                self.disk_consistent_lsn.store(disk_consistent_lsn);

                // Schedule remote uploads that will reflect our new disk_consistent_lsn
-                Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?)
-            } else {
-                None
+                self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
            }
            // release lock on 'layers'
        };
@@ -2921,22 +3221,6 @@ impl Timeline {
        // This failpoint is used by another test case `test_pageserver_recovery`.
        fail_point!("flush-frozen-exit");

-        // Update the metadata file, with new 'disk_consistent_lsn'
-        //
-        // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
-        // *all* the layers, to avoid fsyncing the file multiple times.
-
-        // If we updated our disk_consistent_lsn, persist the updated metadata to local disk.
-        if let Some(metadata) = metadata {
-            save_metadata(
-                self.conf,
-                &self.tenant_shard_id,
-                &self.timeline_id,
-                &metadata,
-            )
-            .await
-            .context("save_metadata")?;
-        }
        Ok(())
    }

@@ -2992,25 +3276,6 @@ impl Timeline {
        Ok(metadata)
    }

-    async fn update_metadata_file(
-        &self,
-        disk_consistent_lsn: Lsn,
-        layers_to_upload: impl IntoIterator<Item = ResidentLayer>,
-    ) -> anyhow::Result<()> {
-        let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?;
-
-        save_metadata(
-            self.conf,
-            &self.tenant_shard_id,
-            &self.timeline_id,
-            &metadata,
-        )
-        .await
-        .context("save_metadata")?;
-
-        Ok(())
-    }
-
    pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
        if let Some(remote_client) = &self.remote_client {
            remote_client
@@ -3089,30 +3354,34 @@ impl Timeline {
        flags: EnumSet<CompactFlags>,
        ctx: &RequestContext,
    ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
-        {
-            let partitioning_guard = self.partitioning.lock().unwrap();
-            let distance = lsn.0 - partitioning_guard.1 .0;
-            if partitioning_guard.1 != Lsn(0)
-                && distance <= self.repartition_threshold
-                && !flags.contains(CompactFlags::ForceRepartition)
-            {
-                debug!(
-                    distance,
-                    threshold = self.repartition_threshold,
-                    "no repartitioning needed"
-                );
-                return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
-            }
+        let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
+            // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
+            // The other is the initdb optimization in flush_frozen_layer, used by `bootstrap_timeline`, which runs before `.activate()`
+            // and hence before the compaction task starts.
+            anyhow::bail!("repartition() called concurrently, this should not happen");
+        };
+        if lsn < partitioning_guard.1 {
+            anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
        }
+
+        let distance = lsn.0 - partitioning_guard.1 .0;
+        if partitioning_guard.1 != Lsn(0)
+            && distance <= self.repartition_threshold
+            && !flags.contains(CompactFlags::ForceRepartition)
+        {
+            debug!(
+                distance,
+                threshold = self.repartition_threshold,
+                "no repartitioning needed"
+            );
+            return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
+        }
+
        let keyspace = self.collect_keyspace(lsn, ctx).await?;
        let partitioning = keyspace.partition(partition_size);

-        let mut partitioning_guard = self.partitioning.lock().unwrap();
-        if lsn > partitioning_guard.1 {
-            *partitioning_guard = (partitioning, lsn);
-        } else {
-            warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless");
-        }
+        *partitioning_guard = (partitioning, lsn);
+
        Ok((partitioning_guard.0.clone(), partitioning_guard.1))
    }

@@ -3124,31 +3393,6 @@ impl Timeline {
        let layers = guard.layer_map();

        let mut max_deltas = 0;
-        {
-            let wanted_image_layers = self.wanted_image_layers.lock().unwrap();
-            if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers {
-                let img_range =
-                    partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
-                if wanted.overlaps(&img_range) {
-                    //
-                    // gc_timeline only pays attention to image layers that are older than the GC cutoff,
-                    // but create_image_layers creates image layers at last-record-lsn.
-                    // So it's possible that gc_timeline wants a new image layer to be created for a key range,
-                    // but the range is already covered by image layers at more recent LSNs. Before we
-                    // create a new image layer, check if the range is already covered at more recent LSNs.
-                    if !layers
-                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))
-                    {
-                        debug!(
-                            "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
-                            img_range.start, img_range.end, cutoff_lsn, lsn
-                        );
-                        return true;
-                    }
-                }
-            }
-        }
-
        for part_range in &partition.ranges {
            let image_coverage = layers.image_coverage(part_range, lsn);
            for (img_range, last_img) in image_coverage {
@@ -3263,7 +3507,7 @@ impl Timeline {
                        || last_key_in_range
                    {
                        let results = self
-                            .get_vectored(&key_request_accum.consume_keyspace().ranges, lsn, ctx)
+                            .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
                            .await?;

                        for (img_key, img) in results {
@@ -3319,12 +3563,6 @@ impl Timeline {
                tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
            }
        }
-        // All layers that the GC wanted us to create have now been created.
-        //
-        // It's possible that another GC cycle happened while we were compacting, and added
-        // something new to wanted_image_layers, and we now clear that before processing it.
-        // That's OK, because the next GC iteration will put it back in.
-        *self.wanted_image_layers.lock().unwrap() = None;

        // Sync the new layer to disk before adding it to the layer map, to make sure
        // we don't garbage collect something based on the new layer, before it has
@@ -4234,7 +4472,6 @@ impl Timeline {
        debug!("retain_lsns: {:?}", retain_lsns);

        let mut layers_to_remove = Vec::new();
-        let mut wanted_image_layers = KeySpaceRandomAccum::default();

        // Scan all layers in the timeline (remote or on-disk).
        //
@@ -4316,15 +4553,6 @@ impl Timeline {
                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
            {
                debug!("keeping {} because it is the latest layer", l.filename());
-                // Collect delta key ranges that need image layers to allow garbage
-                // collecting the layers.
-                // It is not so obvious whether we need to propagate information only about
-                // delta layers. Image layers can form "stairs" preventing old image from been deleted.
-                // But image layers are in any case less sparse than delta layers. Also we need some
-                // protection from replacing recent image layers with new one after each GC iteration.
-                if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&l) {
-                    wanted_image_layers.add_range(l.get_key_range());
-                }
                result.layers_not_updated += 1;
                continue 'outer;
            }
@@ -4337,24 +4565,13 @@ impl Timeline {
            );
            layers_to_remove.push(l);
        }
-        self.wanted_image_layers
-            .lock()
-            .unwrap()
-            .replace((new_gc_cutoff, wanted_image_layers.to_keyspace()));

        if !layers_to_remove.is_empty() {
-            // Persist the new GC cutoff value in the metadata file, before
-            // we actually remove anything.
-            //
-            // This does not in fact have any effect as we no longer consider local metadata unless
-            // running without remote storage.
-            //
+            // Persist the new GC cutoff value before we actually remove anything.
            // This unconditionally schedules also an index_part.json update, even though, we will
            // be doing one a bit later with the unlinked gc'd layers.
-            //
-            // TODO: remove when implementing <https://github.com/neondatabase/neon/issues/4099>.
-            self.update_metadata_file(self.disk_consistent_lsn.load(), None)
-                .await?;
+            let disk_consistent_lsn = self.disk_consistent_lsn.load();
+            self.schedule_uploads(disk_consistent_lsn, None)?;

            let gc_layers = layers_to_remove
                .iter()
@@ -4875,11 +5092,15 @@ impl<'a> TimelineWriter<'a> {

        // Rolling the open layer can be triggered by:
        // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that
-        //    the safekeepers need to store.
+        //    the safekeepers need to store.  For sharded tenants, we multiply by shard count to
+        //    account for how writes are distributed across shards: we expect each node to consume
+        //    1/count of the LSN on average.
        // 2. The size of the currently open layer.
        // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught
        //    up and suspend activity.
-        if distance >= self.get_checkpoint_distance().into() {
+        if distance
+            >= self.get_checkpoint_distance() as i128 * self.shard_identity.count.count() as i128
+        {
            info!(
                "Will roll layer at {} with layer size {} due to LSN distance ({})",
                lsn, state.current_size, distance
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -6,7 +6,7 @@ use std::{
 use anyhow::Context;
 use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
-use tracing::{debug, error, info, instrument, warn, Instrument};
+use tracing::{debug, error, info, instrument, Instrument};
 use utils::{crashsafe, fs_ext, id::TimelineId};

 use crate::{
@@ -124,7 +124,7 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
 /// No timeout here, GC & Compaction should be responsive to the
 /// `TimelineState::Stopping` change.
 // pub(super): documentation link
-pub(super) async fn delete_local_layer_files(
+pub(super) async fn delete_local_timeline_directory(
    conf: &PageServerConf,
    tenant_shard_id: TenantShardId,
    timeline: &Timeline,
@@ -149,8 +149,6 @@ pub(super) async fn delete_local_layer_files(
    // NB: This need not be atomic because the deleted flag in the IndexPart
    // will be observed during tenant/timeline load. The deletion will be resumed there.
    //
-    // For configurations without remote storage, we guarantee crash-safety by persising delete mark file.
-    //
    // Note that here we do not bail out on std::io::ErrorKind::NotFound.
    // This can happen if we're called a second time, e.g.,
    // because of a previous failure/cancellation at/after
@@ -158,72 +156,21 @@ pub(super) async fn delete_local_layer_files(
    //
    // ErrorKind::NotFound can also happen if we race with tenant detach, because,
    // no locks are shared.
-    //
-    // For now, log and continue.
-    // warn! level is technically not appropriate for the
-    // first case because we should expect retries to happen.
-    // But the error is so rare, it seems better to get attention if it happens.
-    //
-    // Note that metadata removal is skipped, this is not technically needed,
-    // but allows to reuse timeline loading code during resumed deletion.
-    // (we always expect that metadata is in place when timeline is being loaded)
+    tokio::fs::remove_dir_all(local_timeline_directory)
+        .await
+        .or_else(fs_ext::ignore_not_found)
+        .context("remove local timeline directory")?;

-    #[cfg(feature = "testing")]
-    let mut counter = 0;
-
-    // Timeline directory may not exist if we failed to delete mark file and request was retried.
-    if !local_timeline_directory.exists() {
-        return Ok(());
-    }
-
-    let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id);
-
-    for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
-        #[cfg(feature = "testing")]
-        {
-            counter += 1;
-            if counter == 2 {
-                fail::fail_point!("timeline-delete-during-rm", |_| {
-                    Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
-                });
-            }
-        }
-
-        let entry = entry?;
-        if entry.path() == metadata_path {
-            debug!("found metadata, skipping");
-            continue;
-        }
-
-        if entry.path() == local_timeline_directory {
-            // Keeping directory because metedata file is still there
-            debug!("found timeline dir itself, skipping");
-            continue;
-        }
-
-        let metadata = match entry.metadata() {
-            Ok(metadata) => metadata,
-            Err(e) => {
-                if crate::is_walkdir_io_not_found(&e) {
-                    warn!(
-                        timeline_dir=?local_timeline_directory,
-                        path=?entry.path().display(),
-                        "got not found err while removing timeline dir, proceeding anyway"
-                    );
-                    continue;
-                }
-                anyhow::bail!(e);
-            }
-        };
-
-        if metadata.is_dir() {
-            warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
-            tokio::fs::remove_dir(entry.path()).await
-        } else {
-            tokio::fs::remove_file(entry.path()).await
-        }
-        .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
-    }
+    // Make sure previous deletions are ordered before mark removal.
+    // Otherwise there is no guarantee that they reach the disk before mark deletion.
+    // So it's possible for mark to reach disk first and for other deletions
+    // to be reordered later and thus missed if a crash occurs.
+    // Note that we don't need to sync after mark file is removed
+    // because we can tolerate the case when mark file reappears on startup.
+    let timeline_path = conf.timelines_path(&tenant_shard_id);
+    crashsafe::fsync_async(timeline_path)
+        .await
+        .context("fsync_pre_mark_remove")?;

    info!("finished deleting layer files, releasing locks");
    drop(guards);
@@ -254,39 +201,6 @@ async fn cleanup_remaining_timeline_fs_traces(
    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
 ) -> anyhow::Result<()> {
-    // Remove local metadata
-    tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("remove metadata")?;
-
-    fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-after-rm-metadata"
-        ))?
-    });
-
-    // Remove timeline dir
-    tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("timeline dir")?;
-
-    fail::fail_point!("timeline-delete-after-rm-dir", |_| {
-        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
-    });
-
-    // Make sure previous deletions are ordered before mark removal.
-    // Otherwise there is no guarantee that they reach the disk before mark deletion.
-    // So its possible for mark to reach disk first and for other deletions
-    // to be reordered later and thus missed if a crash occurs.
-    // Note that we dont need to sync after mark file is removed
-    // because we can tolerate the case when mark file reappears on startup.
-    let timeline_path = conf.timelines_path(&tenant_shard_id);
-    crashsafe::fsync_async(timeline_path)
-        .await
-        .context("fsync_pre_mark_remove")?;
-
    // Remove delete mark
    // TODO: once we are confident that no more exist in the field, remove this
    // line.  It cleans up a legacy marker file that might in rare cases be present.
@@ -552,15 +466,12 @@ impl DeleteTimelineFlow {
        tenant: &Tenant,
        timeline: &Timeline,
    ) -> Result<(), DeleteTimelineError> {
-        delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?;
+        delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;

        delete_remote_layers_and_index(timeline).await?;

        pausable_failpoint!("in_progress_delete");

-        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id)
-            .await?;
-
        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

        *guard = Self::Finished;
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -85,6 +85,7 @@ impl Timeline {
            let policy = self.get_eviction_policy();
            let period = match policy {
                EvictionPolicy::LayerAccessThreshold(lat) => lat.period,
+                EvictionPolicy::OnlyImitiate(lat) => lat.period,
                EvictionPolicy::NoEviction => Duration::from_secs(10),
            };
            if random_init_delay(period, &cancel).await.is_err() {
@@ -119,33 +120,45 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> ControlFlow<(), Instant> {
        debug!("eviction iteration: {policy:?}");
-        match policy {
+        let start = Instant::now();
+        let (period, threshold) = match policy {
            EvictionPolicy::NoEviction => {
                // check again in 10 seconds; XXX config watch mechanism
-                ControlFlow::Continue(Instant::now() + Duration::from_secs(10))
+                return ControlFlow::Continue(Instant::now() + Duration::from_secs(10));
            }
            EvictionPolicy::LayerAccessThreshold(p) => {
-                let start = Instant::now();
                match self.eviction_iteration_threshold(p, cancel, ctx).await {
                    ControlFlow::Break(()) => return ControlFlow::Break(()),
                    ControlFlow::Continue(()) => (),
                }
-                let elapsed = start.elapsed();
-                crate::tenant::tasks::warn_when_period_overrun(
-                    elapsed,
-                    p.period,
-                    BackgroundLoopKind::Eviction,
-                );
-                crate::metrics::EVICTION_ITERATION_DURATION
-                    .get_metric_with_label_values(&[
-                        &format!("{}", p.period.as_secs()),
-                        &format!("{}", p.threshold.as_secs()),
-                    ])
-                    .unwrap()
-                    .observe(elapsed.as_secs_f64());
-                ControlFlow::Continue(start + p.period)
+                (p.period, p.threshold)
            }
-        }
+            EvictionPolicy::OnlyImitiate(p) => {
+                if self.imitiate_only(p, cancel, ctx).await.is_break() {
+                    return ControlFlow::Break(());
+                }
+                (p.period, p.threshold)
+            }
+        };
+
+        let elapsed = start.elapsed();
+        crate::tenant::tasks::warn_when_period_overrun(
+            elapsed,
+            period,
+            BackgroundLoopKind::Eviction,
+        );
+        // FIXME: if we were to mix policies on a pageserver, we would have no way to sense this. I
+        // don't think that is a relevant fear however, and regardless the imitation should be the
+        // most costly part.
+        crate::metrics::EVICTION_ITERATION_DURATION
+            .get_metric_with_label_values(&[
+                &format!("{}", period.as_secs()),
+                &format!("{}", threshold.as_secs()),
+            ])
+            .unwrap()
+            .observe(elapsed.as_secs_f64());
+
+        ControlFlow::Continue(start + period)
    }

    async fn eviction_iteration_threshold(
@@ -167,30 +180,6 @@ impl Timeline {
            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
        };

-        // If we evict layers but keep cached values derived from those layers, then
-        // we face a storm of on-demand downloads after pageserver restart.
-        // The reason is that the restart empties the caches, and so, the values
-        // need to be re-computed by accessing layers, which we evicted while the
-        // caches were filled.
-        //
-        // Solutions here would be one of the following:
-        // 1. Have a persistent cache.
-        // 2. Count every access to a cached value to the access stats of all layers
-        //    that were accessed to compute the value in the first place.
-        // 3. Invalidate the caches at a period of < p.threshold/2, so that the values
-        //    get re-computed from layers, thereby counting towards layer access stats.
-        // 4. Make the eviction task imitate the layer accesses that typically hit caches.
-        //
-        // We follow approach (4) here because in Neon prod deployment:
-        // - page cache is quite small => high churn => low hit rate
-        //   => eviction gets correct access stats
-        // - value-level caches such as logical size & repatition have a high hit rate,
-        //   especially for inactive tenants
-        //   => eviction sees zero accesses for these
-        //   => they cause the on-demand download storm on pageserver restart
-        //
-        // We should probably move to persistent caches in the future, or avoid
-        // having inactive tenants attached to pageserver in the first place.
        match self.imitate_layer_accesses(p, cancel, ctx).await {
            ControlFlow::Break(()) => return ControlFlow::Break(()),
            ControlFlow::Continue(()) => (),
@@ -307,6 +296,52 @@ impl Timeline {
        ControlFlow::Continue(())
    }

+    /// Runs only the layer-access imitation of `eviction_iteration_threshold`, without evicting
+    /// anything. Eviction will be done by disk usage based eviction task.
+    async fn imitiate_only(
+        self: &Arc<Self>,
+        p: &EvictionPolicyLayerAccessThreshold,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> ControlFlow<()> {
+        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
+            BackgroundLoopKind::Eviction,
+            ctx,
+        );
+        // Stop waiting for the permit (and break) as soon as either cancellation token fires.
+        let _permit = tokio::select! {
+            permit = acquire_permit => permit,
+            _ = cancel.cancelled() => return ControlFlow::Break(()),
+            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
+        };
+        // `_permit` stays alive until this function returns, i.e. for the whole imitation.
+        self.imitate_layer_accesses(p, cancel, ctx).await
+    }
+
+    /// If we evict layers but keep cached values derived from those layers, then
+    /// we face a storm of on-demand downloads after pageserver restart.
+    /// The reason is that the restart empties the caches, and so, the values
+    /// need to be re-computed by accessing layers, which we evicted while the
+    /// caches were filled.
+    ///
+    /// Solutions here would be one of the following:
+    /// 1. Have a persistent cache.
+    /// 2. Count every access to a cached value to the access stats of all layers
+    ///    that were accessed to compute the value in the first place.
+    /// 3. Invalidate the caches at a period of < p.threshold/2, so that the values
+    ///    get re-computed from layers, thereby counting towards layer access stats.
+    /// 4. Make the eviction task imitate the layer accesses that typically hit caches.
+    ///
+    /// We follow approach (4) here because in Neon prod deployment:
+    /// - page cache is quite small => high churn => low hit rate
+    ///   => eviction gets correct access stats
+    /// - value-level caches such as logical size & repartition have a high hit rate,
+    ///   especially for inactive tenants
+    ///   => eviction sees zero accesses for these
+    ///   => they cause the on-demand download storm on pageserver restart
+    ///
+    /// We should probably move to persistent caches in the future, or avoid
+    /// having inactive tenants attached to pageserver in the first place.
    #[instrument(skip_all)]
    async fn imitate_layer_accesses(
        &self,
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -130,7 +130,7 @@ pub(super) struct UploadQueueStopped {
 pub(crate) enum NotInitialized {
    #[error("queue is in state Uninitialized")]
    Uninitialized,
-    #[error("queue is in state Stopping")]
+    #[error("queue is in state Stopped")]
    Stopped,
    #[error("queue is shutting down")]
    ShuttingDown,
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -0,0 +1,49 @@
+//! A utilization metric which is used to decide on which pageserver to put the next tenant.
+//!
+//! The metric is exposed via `GET /v1/utilization`. Refer to and maintain its OpenAPI spec as
+//! the source of truth.
+
+use anyhow::Context;
+use std::path::Path;
+
+use pageserver_api::models::PageserverUtilization;
+
+/// Builds a fresh [`PageserverUtilization`] snapshot.
+///
+/// Free space is taken from `statvfs(tenants_path)`, used space from the
+/// `RESIDENT_PHYSICAL_SIZE_GLOBAL` metric, and the capture time from the system clock.
+///
+/// Returns an error if the `statvfs` call on `tenants_path` fails.
+pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtilization> {
+    // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough
+
+    let statvfs = nix::sys::statvfs::statvfs(tenants_path)
+        .map_err(std::io::Error::from)
+        .context("statvfs tenants directory")?;
+
+    // POSIX counts `f_bavail` in units of the fragment size (`f_frsize`), not of the preferred
+    // I/O block size (`f_bsize`). Use the block size only if no fragment size is reported.
+    let blocksz = match statvfs.fragment_size() {
+        0 => statvfs.block_size(),
+        frsize => frsize,
+    };
+
+    #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))]
+    let free = statvfs.blocks_available() as u64 * blocksz;
+    let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get();
+    let captured_at = std::time::SystemTime::now();
+
+    let doc = PageserverUtilization {
+        disk_usage_bytes: used,
+        free_space_bytes: free,
+        // lower is better; start with a constant
+        //
+        // note that u64::MAX will be output as i64::MAX as u64, but that should not matter
+        utilization_score: u64::MAX,
+        captured_at,
+    };
+
+    // TODO: make utilization_score into a metric
+
+    Ok(doc)
+}
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -19,14 +19,13 @@ use once_cell::sync::OnceCell;
 use pageserver_api::shard::TenantShardId;
 use std::fs::{self, File};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
-use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
+use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};

 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
-use utils::fs_ext;

 pub use pageserver_api::models::virtual_file as api;
 pub(crate) mod io_engine;
@@ -404,47 +403,34 @@ impl VirtualFile {
        Ok(vfile)
    }

-    /// Writes a file to the specified `final_path` in a crash safe fasion
+    /// Async version of [`::utils::crashsafe::overwrite`].
    ///
-    /// The file is first written to the specified tmp_path, and in a second
-    /// step, the tmp path is renamed to the final path. As renames are
-    /// atomic, a crash during the write operation will never leave behind a
-    /// partially written file.
-    pub async fn crashsafe_overwrite<B: BoundedBuf>(
-        final_path: &Utf8Path,
-        tmp_path: &Utf8Path,
+    /// # NB:
+    ///
+    /// Doesn't actually use the [`VirtualFile`] file descriptor cache, but,
+    /// it did at an earlier time.
+    /// And it will use this module's [`io_engine`] in the near future, so, leaving it here.
+    pub async fn crashsafe_overwrite<B: BoundedBuf<Buf = Buf> + Send, Buf: IoBuf + Send>(
+        final_path: Utf8PathBuf,
+        tmp_path: Utf8PathBuf,
        content: B,
    ) -> std::io::Result<()> {
-        let Some(final_path_parent) = final_path.parent() else {
-            return Err(std::io::Error::from_raw_os_error(
-                nix::errno::Errno::EINVAL as i32,
-            ));
-        };
-        std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
-        let mut file = Self::open_with_options(
-            tmp_path,
-            OpenOptions::new()
-                .write(true)
-                // Use `create_new` so that, if we race with ourselves or something else,
-                // we bail out instead of causing damage.
-                .create_new(true),
-        )
-        .await?;
-        let (_content, res) = file.write_all(content).await;
-        res?;
-        file.sync_all().await?;
-        drop(file); // before the rename, that's important!
-                    // renames are atomic
-        std::fs::rename(tmp_path, final_path)?;
-        // Only open final path parent dirfd now, so that this operation only
-        // ever holds one VirtualFile fd at a time.  That's important because
-        // the current `find_victim_slot` impl might pick the same slot for both
-        // VirtualFile., and it eventually does a blocking write lock instead of
-        // try_lock.
-        let final_parent_dirfd =
-            Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
-        final_parent_dirfd.sync_all().await?;
-        Ok(())
+        // TODO: use tokio_epoll_uring if configured as `io_engine`.
+        // See https://github.com/neondatabase/neon/issues/6663
+
+        tokio::task::spawn_blocking(move || {
+            let slice_storage;
+            let content_len = content.bytes_init();
+            let content = if content.bytes_init() > 0 {
+                slice_storage = Some(content.slice(0..content_len));
+                slice_storage.as_deref().expect("just set it to Some()")
+            } else {
+                &[]
+            };
+            utils::crashsafe::overwrite(&final_path, &tmp_path, content)
+        })
+        .await
+        .expect("blocking task is never aborted")
    }

    /// Call File::sync_all() on the underlying File.
@@ -1337,7 +1323,7 @@ mod tests {
        let path = testdir.join("myfile");
        let tmp_path = testdir.join("myfile.tmp");

-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1346,7 +1332,7 @@ mod tests {
        assert!(!tmp_path.exists());
        drop(file);

-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
@@ -1368,7 +1354,7 @@ mod tests {
        std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
        assert!(tmp_path.exists());

-        VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
+        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
            .await
            .unwrap();

--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -334,6 +334,12 @@ impl WalIngest {
                    {
                        self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
                    }
+                    trace!(
+                        "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}",
+                        xlog_checkpoint.oldestActiveXid,
+                        self.checkpoint.oldestActiveXid
+                    );
+                    self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid;

                    // Write a new checkpoint key-value pair on every checkpoint record, even
                    // if nothing really changed. Not strictly required, but it seems nice to
@@ -360,6 +366,13 @@ impl WalIngest {
                    }
                }
            }
+            pg_constants::RM_STANDBY_ID => {
+                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+                if info == pg_constants::XLOG_RUNNING_XACTS {
+                    let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf);
+                    self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid;
+                }
+            }
            _x => {
                // TODO: should probably log & fail here instead of blindly
                // doing something without understanding the protocol
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -773,6 +773,42 @@ impl XlLogicalMessage {
    }
 }

+/// Decoded `xl_running_xacts` WAL record (`RM_STANDBY_ID` / `XLOG_RUNNING_XACTS`).
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlRunningXacts {
+    pub xcnt: u32,
+    pub subxcnt: u32,
+    pub subxid_overflow: bool,
+    pub next_xid: TransactionId,
+    pub oldest_running_xid: TransactionId,
+    pub latest_completed_xid: TransactionId,
+    pub xids: Vec<TransactionId>,
+}
+
+impl XlRunningXacts {
+    /// Decodes the fixed header and the `xcnt + subxcnt` xids; panics if `buf` is too short.
+    pub fn decode(buf: &mut Bytes) -> XlRunningXacts {
+        let xcnt = buf.get_u32_le();
+        let subxcnt = buf.get_u32_le();
+        let mut record = XlRunningXacts {
+            xcnt,
+            subxcnt,
+            // C `bool` plus three padding bytes with unspecified contents: test byte 0 only.
+            subxid_overflow: (buf.get_u32_le() & 0xff) != 0,
+            next_xid: buf.get_u32_le(),
+            oldest_running_xid: buf.get_u32_le(),
+            latest_completed_xid: buf.get_u32_le(),
+            xids: Vec::new(),
+        };
+        // Widen before adding so that the sum of the two counts cannot overflow `u32`.
+        for _ in 0..u64::from(xcnt) + u64::from(subxcnt) {
+            record.xids.push(buf.get_u32_le());
+        }
+        record
+    }
+}
+
 /// Main routine to decode a WAL record and figure out which blocks are modified
 //
 // See xlogrecord.h for details
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl

 EXTENSION = neon
-DATA = neon--1.0.sql neon--1.0--1.1.sql
+DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql
 PGFILEDESC = "neon - cloud storage for PostgreSQL"

 EXTRA_CLEAN = \
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -35,16 +35,16 @@
 #include "utils/memutils.h"
 #include "utils/jsonb.h"

+#include "neon_utils.h"
+
 static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;

+static const char *jwt_token = NULL;
+
 /* GUCs */
 static char *ConsoleURL = NULL;
 static bool ForwardDDL = true;

-/* Curl structures for sending the HTTP requests */
-static CURL *CurlHandle;
-static struct curl_slist *ContentHeader = NULL;
-
 /*
 * CURL docs say that this buffer must exist until we call curl_easy_cleanup
 * (which we never do), so we make this a static
@@ -226,6 +226,8 @@ ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
 static void
 SendDeltasToControlPlane()
 {
+	static CURL		*handle = NULL;
+
 	if (!RootTable.db_table && !RootTable.role_table)
 		return;
 	if (!ConsoleURL)
@@ -236,29 +238,57 @@ SendDeltasToControlPlane()
 	if (!ForwardDDL)
 		return;

-	char	   *message = ConstructDeltaMessage();
-	ErrorString str = {};
+	if (handle == NULL)
+	{
+		struct curl_slist *headers = NULL;

-	curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH");
-	curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader);
-	curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message);
-	curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL);
-	curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf);
-	curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ );
-	curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str);
-	curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback);
+		headers = curl_slist_append(headers, "Content-Type: application/json");
+		if (headers == NULL)
+		{
+			elog(ERROR, "Failed to set Content-Type header");
+		}
+
+		if (jwt_token)
+		{
+			char		auth_header[8192];
+
+			snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token);
+			headers = curl_slist_append(headers, auth_header);
+			if (headers == NULL)
+			{
+				elog(ERROR, "Failed to set Authorization header");
+			}
+		}
+
+		handle = alloc_curl_handle();
+
+		curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "PATCH");
+		curl_easy_setopt(handle, CURLOPT_HTTPHEADER, headers);
+		curl_easy_setopt(handle, CURLOPT_URL, ConsoleURL);
+		curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, CurlErrorBuf);
+		curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ );
+		curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback);
+	}
+
+	char	   *message = ConstructDeltaMessage();
+	ErrorString str;
+
+	str.size = 0;
+
+	curl_easy_setopt(handle, CURLOPT_POSTFIELDS, message);
+	curl_easy_setopt(handle, CURLOPT_WRITEDATA, &str);

 	const int	num_retries = 5;
-	int			curl_status;
+	CURLcode	curl_status;

 	for (int i = 0; i < num_retries; i++)
 	{
-		if ((curl_status = curl_easy_perform(CurlHandle)) == 0)
+		if ((curl_status = curl_easy_perform(handle)) == 0)
 			break;
 		elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf);
 		pg_usleep(1000 * 1000);
 	}
-	if (curl_status != 0)
+	if (curl_status != CURLE_OK)
 	{
 		elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf);
 	}
@@ -266,13 +296,11 @@ SendDeltasToControlPlane()
 	{
 		long		response_code;

-		if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION)
+		if (curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION)
 		{
-			bool		error_exists = str.size != 0;
-
 			if (response_code != 200)
 			{
-				if (error_exists)
+				if (str.size != 0)
 				{
 					elog(ERROR,
 						 "Received HTTP code %ld from control plane: %s",
@@ -835,34 +863,10 @@ InitControlPlaneConnector()
 							 NULL,
 							 NULL);

-	const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
-
+	jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
 	if (!jwt_token)
 	{
 		elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated");
 	}

-	if (curl_global_init(CURL_GLOBAL_DEFAULT))
-	{
-		elog(ERROR, "Failed to initialize curl");
-	}
-	if ((CurlHandle = curl_easy_init()) == NULL)
-	{
-		elog(ERROR, "Failed to initialize curl handle");
-	}
-	if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL)
-	{
-		elog(ERROR, "Failed to initialize content header");
-	}
-
-	if (jwt_token)
-	{
-		char		auth_header[8192];
-
-		snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token);
-		if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL)
-		{
-			elog(ERROR, "Failed to initialize authorization header");
-		}
-	}
 }
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -14,6 +14,8 @@

 #include "utils/guc.h"

+#include "neon_utils.h"
+
 static int	extension_server_port = 0;

 static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
@@ -31,15 +33,19 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL
 static bool
 neon_download_extension_file_http(const char *filename, bool is_library)
 {
-	CURL	   *curl;
+	static CURL	   *handle = NULL;
+
 	CURLcode	res;
 	char	   *compute_ctl_url;
 	char	   *postdata;
 	bool		ret = false;

-	if ((curl = curl_easy_init()) == NULL)
+	if (handle == NULL)
 	{
-		elog(ERROR, "Failed to initialize curl handle");
+		handle = alloc_curl_handle();
+
+		curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST");
+		curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ );
 	}

 	compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
@@ -47,28 +53,22 @@ neon_download_extension_file_http(const char *filename, bool is_library)

 	elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);

-	curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
-	curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
-	curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ );
+	curl_easy_setopt(handle, CURLOPT_URL, compute_ctl_url);

-	if (curl)
+	/* Perform the request, res will get the return code */
+	res = curl_easy_perform(handle);
+	/* Check for errors */
+	if (res == CURLE_OK)
 	{
-		/* Perform the request, res will get the return code */
-		res = curl_easy_perform(curl);
-		/* Check for errors */
-		if (res == CURLE_OK)
-		{
-			ret = true;
-		}
-		else
-		{
-			/* Don't error here because postgres will try to find the file */
-			/* and will fail with some proper error message if it's not found. */
-			elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
-		}
-
-		/* always cleanup */
-		curl_easy_cleanup(curl);
+		ret = true;
+	}
+	else
+	{
+		/*
+		 * Don't error here because postgres will try to find the file and will
+		 * fail with some proper error message if it's not found.
+		 */
+		elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
 	}

 	return ret;
--- a/pgxn/neon/neon--1.1--1.2.sql
+++ b/pgxn/neon/neon--1.1--1.2.sql
@@ -0,0 +1,29 @@
+\echo Use "ALTER EXTENSION neon UPDATE TO '1.2'" to load this file. \quit
+
+-- Create a convenient view similar to pg_stat_database
+-- that exposes all lfc stat values in one row.
+CREATE OR REPLACE VIEW NEON_STAT_FILE_CACHE AS 
+   WITH lfc_stats AS (
+   SELECT 
+     stat_name, 
+     count
+   FROM neon_get_lfc_stats() AS t(stat_name text, count bigint)
+   ),
+   lfc_values AS (
+   SELECT 
+     MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE NULL END) AS file_cache_misses,
+     MAX(CASE WHEN stat_name = 'file_cache_hits'   THEN count ELSE NULL END) AS file_cache_hits,
+     MAX(CASE WHEN stat_name = 'file_cache_used'   THEN count ELSE NULL END) AS file_cache_used,
+     MAX(CASE WHEN stat_name = 'file_cache_writes' THEN count ELSE NULL END) AS file_cache_writes,
+     -- Calculate the file_cache_hit_ratio within the same CTE for simplicity
+     CASE 
+        WHEN MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) = 0 THEN NULL
+        ELSE ROUND((MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END)::DECIMAL / 
+        (MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END))) * 100, 2)
+     END AS file_cache_hit_ratio
+   FROM lfc_stats
+   )
+SELECT file_cache_misses, file_cache_hits, file_cache_used, file_cache_writes, file_cache_hit_ratio from lfc_values;
+
+-- externalize the view to all users in role pg_monitor
+GRANT SELECT ON NEON_STAT_FILE_CACHE TO PG_MONITOR;
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -37,7 +37,8 @@
 PG_MODULE_MAGIC;
 void		_PG_init(void);

-static int	logical_replication_max_time_lag = 3600;
+static int	logical_replication_max_snap_files = 300;
+bool primary_is_running = false;

 static void
 InitLogicalReplicationMonitor(void)
@@ -45,14 +46,14 @@ InitLogicalReplicationMonitor(void)
 	BackgroundWorker bgw;

 	DefineCustomIntVariable(
-		"neon.logical_replication_max_time_lag",
-		"Threshold for dropping unused logical replication slots",
-		NULL,
-		&logical_replication_max_time_lag,
-		3600, 0, INT_MAX,
-		PGC_SIGHUP,
-		GUC_UNIT_S,
-		NULL, NULL, NULL);
+							"neon.logical_replication_max_snap_files",
+							"Maximum allowed logical replication .snap files",
+							NULL,
+							&logical_replication_max_snap_files,
+							300, 0, INT_MAX,
+							PGC_SIGHUP,
+							0,
+							NULL, NULL, NULL);

 	memset(&bgw, 0, sizeof(bgw));
 	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
@@ -68,22 +69,99 @@ InitLogicalReplicationMonitor(void)
 	RegisterBackgroundWorker(&bgw);
 }

-typedef struct
+static int
+LsnDescComparator(const void *a, const void *b)
 {
-	NameData    name;
-	bool        dropped;
-	XLogRecPtr  confirmed_flush_lsn;
-	TimestampTz last_updated;
-} SlotStatus;
+	XLogRecPtr	lsn1 = *((const XLogRecPtr *) a);
+	XLogRecPtr	lsn2 = *((const XLogRecPtr *) b);
+
+	if (lsn1 < lsn2)
+		return 1;
+	else if (lsn1 == lsn2)
+		return 0;
+	else
+		return -1;
+}
+
+/*
+ * Look at .snap files and calculate minimum allowed restart_lsn of slot so that
+ * next gc would leave not more than logical_replication_max_snap_files; all
+ * slots having lower restart_lsn should be dropped. Returns 0 if no slot has
+ * to be dropped; a non-positive limit is treated as "monitor disabled".
+ */
+static XLogRecPtr
+get_num_snap_files_lsn_threshold(void)
+{
+	DIR		   *dirdesc;
+	struct dirent *de;
+	const char *snap_path = "pg_logical/snapshots/";
+	int			lsns_allocated = 1024;
+	int			lsns_num = 0;
+	XLogRecPtr *lsns;
+	XLogRecPtr	cutoff;
+
+	/* a limit of 0 would read lsns[-1] below, so reject it like negatives */
+	if (logical_replication_max_snap_files <= 0)
+		return 0;
+
+	lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated);
+
+	/* find all .snap files and get their lsns */
+	dirdesc = AllocateDir(snap_path);
+	while ((de = ReadDir(dirdesc, snap_path)) != NULL)
+	{
+		XLogRecPtr	lsn;
+		uint32		hi;
+		uint32		lo;
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2)
+		{
+			ereport(LOG,
+					(errmsg("could not parse file name as .snap file \"%s\"", de->d_name)));
+			continue;
+		}
+
+		lsn = ((uint64) hi) << 32 | lo;
+		elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn));
+		if (lsns_allocated == lsns_num)
+		{
+			lsns_allocated *= 2;
+			lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated);
+		}
+		lsns[lsns_num++] = lsn;
+	}
+	/* sort by lsn desc */
+	qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator);
+	/* and take cutoff at logical_replication_max_snap_files */
+	if (logical_replication_max_snap_files > lsns_num)
+		cutoff = 0;				/* have less files than the limit */
+	else
+	{
+		cutoff = lsns[logical_replication_max_snap_files - 1];
+		elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d",
+			 LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files);
+	}
+	pfree(lsns);
+	FreeDir(dirdesc);
+	return cutoff;
+}
+
+#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */

 /*
 * Unused logical replication slots pins WAL and prevents deletion of snapshots.
+ * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which
+ * need too many .snap files.
 */
 PGDLLEXPORT void
 LogicalSlotsMonitorMain(Datum main_arg)
 {
-	SlotStatus* slots;
-	TimestampTz now, last_checked;
+	TimestampTz now,
+				last_checked;

 	/* Establish signal handlers. */
 	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
@@ -92,75 +170,105 @@ LogicalSlotsMonitorMain(Datum main_arg)

 	BackgroundWorkerUnblockSignals();

-	slots = (SlotStatus*)calloc(max_replication_slots, sizeof(SlotStatus));
-	last_checked = GetCurrentTimestamp();
-
 	for (;;)
 	{
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
-						 logical_replication_max_time_lag*1000/2,
-						 PG_WAIT_EXTENSION);
-		ResetLatch(MyLatch);
-		CHECK_FOR_INTERRUPTS();
+		XLogRecPtr	cutoff_lsn;

-		now = GetCurrentTimestamp();
-
-		if (now - last_checked > logical_replication_max_time_lag*USECS_PER_SEC)
+		/*
+		 * If there are too many .snap files, just drop all logical slots to
+		 * prevent aux files bloat.
+		 */
+		cutoff_lsn = get_num_snap_files_lsn_threshold();
+		if (cutoff_lsn > 0)
 		{
-			int n_active_slots = 0;
-			last_checked = now;
-
-			LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
 			for (int i = 0; i < max_replication_slots; i++)
 			{
+				char		slot_name[NAMEDATALEN];
 				ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
+				XLogRecPtr	restart_lsn;

+				/* find the name */
+				LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
 				/* Consider only logical repliction slots */
 				if (!s->in_use || !SlotIsLogical(s))
-					continue;
-
-				if (s->active_pid != 0)
 				{
-					n_active_slots += 1;
+					LWLockRelease(ReplicationSlotControlLock);
 					continue;
 				}

-				/* Check if there was some activity with the slot since last check */
-				if (s->data.confirmed_flush != slots[i].confirmed_flush_lsn)
+				/* do we need to drop it? */
+				SpinLockAcquire(&s->mutex);
+				restart_lsn = s->data.restart_lsn;
+				SpinLockRelease(&s->mutex);
+				if (restart_lsn >= cutoff_lsn)
 				{
-					slots[i].confirmed_flush_lsn = s->data.confirmed_flush;
-					slots[i].last_updated = now;
+					LWLockRelease(ReplicationSlotControlLock);
+					continue;
 				}
-				else if (now - slots[i].last_updated > logical_replication_max_time_lag*USECS_PER_SEC)
-				{
-					slots[i].name = s->data.name;
-					slots[i].dropped = true;
-				}
-			}
-			LWLockRelease(ReplicationSlotControlLock);

-			/*
-			 * If there are no active subscriptions, then no new snapshots are generated
-			 * and so no need to force slot deletion.
-			 */
-			if (n_active_slots != 0)
-			{
-				for (int i = 0; i < max_replication_slots; i++)
+				strlcpy(slot_name, s->data.name.data, NAMEDATALEN);
+				elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X",
+					 slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn));
+				LWLockRelease(ReplicationSlotControlLock);
+
+				/* now try to drop it, killing owner before if any */
+				for (;;)
 				{
-					if (slots[i].dropped)
+					pid_t		active_pid;
+
+					SpinLockAcquire(&s->mutex);
+					active_pid = s->active_pid;
+					SpinLockRelease(&s->mutex);
+
+					if (active_pid == 0)
 					{
-						elog(LOG, "Drop logical replication slot because it was not update more than %ld seconds",
-							 (now - slots[i].last_updated)/USECS_PER_SEC);
-						ReplicationSlotDrop(slots[i].name.data, true);
-						slots[i].dropped = false;
+						/*
+						 * Slot is released, try to drop it. Though of course
+						 * it could have been reacquired, so drop can ERROR
+						 * out. Similarly it could have been dropped in the
+						 * meantime.
+						 *
+						 * In principle we could remove pg_try/pg_catch, that
+						 * would restart the whole bgworker.
+						 */
+						ConditionVariableCancelSleep();
+						PG_TRY();
+						{
+							ReplicationSlotDrop(slot_name, true);
+							elog(LOG, "ls_monitor: slot %s dropped", slot_name);
+						}
+						PG_CATCH();
+						{
+							/* log ERROR and reset elog stack */
+							EmitErrorReport();
+							FlushErrorState();
+							elog(LOG, "ls_monitor: failed to drop slot %s", slot_name);
+						}
+						PG_END_TRY();
+						break;
+					}
+					else
+					{
+						/* kill the owner and wait for release */
+						elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid);
+						(void) kill(active_pid, SIGTERM);
+						/* We shouldn't get stuck, but to be safe add timeout. */
+						ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP);
 					}
 				}
 			}
 		}
+
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
+						 LS_MONITOR_CHECK_INTERVAL,
+						 PG_WAIT_EXTENSION);
+		ResetLatch(MyLatch);
+		CHECK_FOR_INTERRUPTS();
 	}
 }

+
 void
 _PG_init(void)
 {
@@ -181,6 +289,15 @@ _PG_init(void)

 	pg_init_extension_server();

+	DefineCustomBoolVariable(
+		"neon.primary_is_running",
+		"true if the primary was running at replica startup. false otherwise",
+		NULL,
+		&primary_is_running,
+		false,
+		PGC_POSTMASTER,
+		0,
+		NULL, NULL, NULL);
 	/*
 	 * Important: This must happen after other parts of the extension are
 	 * loaded, otherwise any settings to GUCs that were set before the
--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -1,5 +1,6 @@
 # neon extension
 comment = 'cloud storage for PostgreSQL'
-default_version = '1.1'
+default_version = '1.2'
 module_pathname = '$libdir/neon'
 relocatable = true
+trusted = true
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -1,6 +1,9 @@
-
 #include <sys/resource.h>

+#ifndef WALPROPOSER_LIB
+#include <curl/curl.h>
+#endif
+
 #include "postgres.h"

 #include "lib/stringinfo.h"
@@ -114,3 +117,48 @@ disable_core_dump()
 		fprintf(stderr, "WARNING: disable cores setrlimit failed: %s", strerror(save_errno));
 	}
 }
+
+#ifndef WALPROPOSER_LIB
+
+/*
+ * Return a new CURL easy handle, running curl_global_init() on first use.
+ *
+ * On macOS with a libcurl that has IPv6 support, curl_global_init() calls
+ * SCDynamicStoreCopyProxies(), which makes the program multithreaded. Neon
+ * has to be in shared_preload_libraries, which are loaded in the Postmaster,
+ * and the Postmaster must never become multithreaded, so _PG_init() is not
+ * an option. Postgres has no known good hook for initializing per-backend
+ * structures either, hence the check on every allocation of a CURL handle.
+ *
+ * Raises ERROR if libcurl or the handle cannot be initialized. Free the
+ * returned handle with curl_easy_cleanup(3).
+ *
+ * https://developer.apple.com/documentation/systemconfiguration/1517088-scdynamicstorecopyproxies
+ */
+CURL *
+alloc_curl_handle(void)
+{
+	static bool global_init_done = false;
+
+	CURL	   *curl;
+
+	if (unlikely(!global_init_done))
+	{
+		/* Protected by mutex internally */
+		CURLcode	rc = curl_global_init(CURL_GLOBAL_DEFAULT);
+
+		if (rc != CURLE_OK)
+			elog(ERROR, "Failed to initialize curl");
+
+		global_init_done = true;
+	}
+
+	/* curl_easy_init() reports failure by returning NULL */
+	curl = curl_easy_init();
+	if (curl == NULL)
+		elog(ERROR, "Failed to initialize curl handle");
+
+	return curl;
+}
+
+#endif
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -1,6 +1,12 @@
 #ifndef __NEON_UTILS_H__
 #define __NEON_UTILS_H__

+#include "lib/stringinfo.h"
+
+#ifndef WALPROPOSER_LIB
+#include <curl/curl.h>
+#endif
+
 bool		HexDecodeString(uint8 *result, char *input, int nbytes);
 uint32		pq_getmsgint32_le(StringInfo msg);
 uint64		pq_getmsgint64_le(StringInfo msg);
@@ -8,4 +14,10 @@ void		pq_sendint32_le(StringInfo buf, uint32 i);
 void		pq_sendint64_le(StringInfo buf, uint64 i);
 extern void disable_core_dump();

+#ifndef WALPROPOSER_LIB
+
+CURL *		alloc_curl_handle(void);
+
+#endif
+
 #endif							/* __NEON_UTILS_H__ */
--- a/poetry.lock
+++ b/poetry.lock
@@ -158,6 +158,28 @@ files = [
 attrs = ">=16.0.0"
 pluggy = ">=0.4.0"

+[[package]]
+name = "anyio"
+version = "4.3.0"
+description = "High level compatibility layer for multiple asynchronous event loop implementations"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"},
+    {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"},
+]
+
+[package.dependencies]
+exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
+idna = ">=2.8"
+sniffio = ">=1.1"
+typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
+
+[package.extras]
+doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
+test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"]
+trio = ["trio (>=0.23)"]
+
 [[package]]
 name = "async-timeout"
 version = "4.0.3"
@@ -836,43 +858,43 @@ files = [

 [[package]]
 name = "cryptography"
-version = "42.0.2"
+version = "42.0.4"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:701171f825dcab90969596ce2af253143b93b08f1a716d4b2a9d2db5084ef7be"},
-    {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:61321672b3ac7aade25c40449ccedbc6db72c7f5f0fdf34def5e2f8b51ca530d"},
-    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea2c3ffb662fec8bbbfce5602e2c159ff097a4631d96235fcf0fb00e59e3ece4"},
-    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b15c678f27d66d247132cbf13df2f75255627bcc9b6a570f7d2fd08e8c081d2"},
-    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8e88bb9eafbf6a4014d55fb222e7360eef53e613215085e65a13290577394529"},
-    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a047682d324ba56e61b7ea7c7299d51e61fd3bca7dad2ccc39b72bd0118d60a1"},
-    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:36d4b7c4be6411f58f60d9ce555a73df8406d484ba12a63549c88bd64f7967f1"},
-    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a00aee5d1b6c20620161984f8ab2ab69134466c51f58c052c11b076715e72929"},
-    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b97fe7d7991c25e6a31e5d5e795986b18fbbb3107b873d5f3ae6dc9a103278e9"},
-    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5fa82a26f92871eca593b53359c12ad7949772462f887c35edaf36f87953c0e2"},
-    {file = "cryptography-42.0.2-cp37-abi3-win32.whl", hash = "sha256:4b063d3413f853e056161eb0c7724822a9740ad3caa24b8424d776cebf98e7ee"},
-    {file = "cryptography-42.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:841ec8af7a8491ac76ec5a9522226e287187a3107e12b7d686ad354bb78facee"},
-    {file = "cryptography-42.0.2-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:55d1580e2d7e17f45d19d3b12098e352f3a37fe86d380bf45846ef257054b242"},
-    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28cb2c41f131a5758d6ba6a0504150d644054fd9f3203a1e8e8d7ac3aea7f73a"},
-    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9097a208875fc7bbeb1286d0125d90bdfed961f61f214d3f5be62cd4ed8a446"},
-    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:44c95c0e96b3cb628e8452ec060413a49002a247b2b9938989e23a2c8291fc90"},
-    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2f9f14185962e6a04ab32d1abe34eae8a9001569ee4edb64d2304bf0d65c53f3"},
-    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:09a77e5b2e8ca732a19a90c5bca2d124621a1edb5438c5daa2d2738bfeb02589"},
-    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad28cff53f60d99a928dfcf1e861e0b2ceb2bc1f08a074fdd601b314e1cc9e0a"},
-    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:130c0f77022b2b9c99d8cebcdd834d81705f61c68e91ddd614ce74c657f8b3ea"},
-    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:fa3dec4ba8fb6e662770b74f62f1a0c7d4e37e25b58b2bf2c1be4c95372b4a33"},
-    {file = "cryptography-42.0.2-cp39-abi3-win32.whl", hash = "sha256:3dbd37e14ce795b4af61b89b037d4bc157f2cb23e676fa16932185a04dfbf635"},
-    {file = "cryptography-42.0.2-cp39-abi3-win_amd64.whl", hash = "sha256:8a06641fb07d4e8f6c7dda4fc3f8871d327803ab6542e33831c7ccfdcb4d0ad6"},
-    {file = "cryptography-42.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:087887e55e0b9c8724cf05361357875adb5c20dec27e5816b653492980d20380"},
-    {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a7ef8dd0bf2e1d0a27042b231a3baac6883cdd5557036f5e8df7139255feaac6"},
-    {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4383b47f45b14459cab66048d384614019965ba6c1a1a141f11b5a551cace1b2"},
-    {file = "cryptography-42.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:fbeb725c9dc799a574518109336acccaf1303c30d45c075c665c0793c2f79a7f"},
-    {file = "cryptography-42.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:320948ab49883557a256eab46149df79435a22d2fefd6a66fe6946f1b9d9d008"},
-    {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5ef9bc3d046ce83c4bbf4c25e1e0547b9c441c01d30922d812e887dc5f125c12"},
-    {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:52ed9ebf8ac602385126c9a2fe951db36f2cb0c2538d22971487f89d0de4065a"},
-    {file = "cryptography-42.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:141e2aa5ba100d3788c0ad7919b288f89d1fe015878b9659b307c9ef867d3a65"},
-    {file = "cryptography-42.0.2.tar.gz", hash = "sha256:e0ec52ba3c7f1b7d813cd52649a5b3ef1fc0d433219dc8c93827c57eab6cf888"},
+    {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"},
+    {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"},
+    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"},
+    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"},
+    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"},
+    {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"},
+    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"},
+    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"},
+    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"},
+    {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"},
+    {file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"},
+    {file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"},
+    {file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"},
+    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"},
+    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"},
+    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"},
+    {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"},
+    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"},
+    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"},
+    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"},
+    {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"},
+    {file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"},
+    {file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"},
+    {file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"},
+    {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"},
+    {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"},
+    {file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"},
+    {file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"},
+    {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"},
+    {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"},
+    {file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"},
+    {file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"},
 ]

 [package.dependencies]
@@ -1073,6 +1095,100 @@ files = [
    {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"},
 ]

+[[package]]
+name = "h11"
+version = "0.14.0"
+description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
+    {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
+]
+
+[[package]]
+name = "h2"
+version = "4.1.0"
+description = "HTTP/2 State-Machine based protocol implementation"
+optional = false
+python-versions = ">=3.6.1"
+files = [
+    {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"},
+    {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"},
+]
+
+[package.dependencies]
+hpack = ">=4.0,<5"
+hyperframe = ">=6.0,<7"
+
+[[package]]
+name = "hpack"
+version = "4.0.0"
+description = "Pure-Python HPACK header compression"
+optional = false
+python-versions = ">=3.6.1"
+files = [
+    {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"},
+    {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"},
+]
+
+[[package]]
+name = "httpcore"
+version = "1.0.3"
+description = "A minimal low-level HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"},
+    {file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"},
+]
+
+[package.dependencies]
+certifi = "*"
+h11 = ">=0.13,<0.15"
+
+[package.extras]
+asyncio = ["anyio (>=4.0,<5.0)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+trio = ["trio (>=0.22.0,<0.24.0)"]
+
+[[package]]
+name = "httpx"
+version = "0.26.0"
+description = "The next generation HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"},
+    {file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"},
+]
+
+[package.dependencies]
+anyio = "*"
+certifi = "*"
+h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""}
+httpcore = "==1.*"
+idna = "*"
+sniffio = "*"
+
+[package.extras]
+brotli = ["brotli", "brotlicffi"]
+cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+
+[[package]]
+name = "hyperframe"
+version = "6.0.1"
+description = "HTTP/2 framing layer for Python"
+optional = false
+python-versions = ">=3.6.1"
+files = [
+    {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"},
+    {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"},
+]
+
 [[package]]
 name = "idna"
 version = "3.3"
@@ -1909,6 +2025,20 @@ pytest = [
    {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
 ]

+[[package]]
+name = "pytest-repeat"
+version = "0.9.3"
+description = "pytest plugin for repeating tests"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"},
+    {file = "pytest_repeat-0.9.3.tar.gz", hash = "sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"},
+]
+
+[package.dependencies]
+pytest = "*"
+
 [[package]]
 name = "pytest-rerunfailures"
 version = "13.0"
@@ -2142,28 +2272,28 @@ pyasn1 = ">=0.1.3"

 [[package]]
 name = "ruff"
-version = "0.1.11"
+version = "0.2.2"
 description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"},
-    {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"},
-    {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"},
-    {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"},
-    {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"},
-    {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"},
-    {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"},
-    {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"},
+    {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"},
+    {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"},
+    {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"},
+    {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"},
+    {file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"},
+    {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"},
+    {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"},
+    {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"},
+    {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"},
+    {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"},
+    {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"},
 ]

 [[package]]
@@ -2225,6 +2355,17 @@ files = [
    {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]

+[[package]]
+name = "sniffio"
+version = "1.3.0"
+description = "Sniff out which async library your code is running under"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"},
+    {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
+]
+
 [[package]]
 name = "sshpubkeys"
 version = "3.3.1"
@@ -2678,4 +2819,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "e99954cbbfef8dcc5e13cea7103c87657639a192f2372983bdb8c5d624c2e447"
+content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9"
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -21,7 +21,7 @@ use crate::{
    console,
    error::{ReportableError, UserFacingError},
 };
-use std::io;
+use std::{io, net::IpAddr};
 use thiserror::Error;

 /// Convenience wrapper for the authentication error.
@@ -62,10 +62,11 @@ pub enum AuthErrorImpl {
    Io(#[from] io::Error),

    #[error(
-        "This IP address is not allowed to connect to this endpoint. \
-        Please add it to the allowed list in the Neon console."
+        "This IP address {0} is not allowed to connect to this endpoint. \
+        Please add it to the allowed list in the Neon console. \
+        Make sure to check for IPv4 or IPv6 addresses."
    )]
-    IpAddressNotAllowed,
+    IpAddressNotAllowed(IpAddr),

    #[error("Too many connections to this endpoint. Please try again later.")]
    TooManyConnections,
@@ -87,8 +88,8 @@ impl AuthError {
        AuthErrorImpl::AuthFailed(user.into()).into()
    }

-    pub fn ip_address_not_allowed() -> Self {
-        AuthErrorImpl::IpAddressNotAllowed.into()
+    pub fn ip_address_not_allowed(ip: IpAddr) -> Self {
+        AuthErrorImpl::IpAddressNotAllowed(ip).into()
    }

    pub fn too_many_connections() -> Self {
@@ -122,7 +123,7 @@ impl UserFacingError for AuthError {
            MalformedPassword(_) => self.to_string(),
            MissingEndpointName => self.to_string(),
            Io(_) => "Internal error".to_string(),
-            IpAddressNotAllowed => self.to_string(),
+            IpAddressNotAllowed(_) => self.to_string(),
            TooManyConnections => self.to_string(),
            UserTimeout(_) => self.to_string(),
        }
@@ -141,7 +142,7 @@ impl ReportableError for AuthError {
            MalformedPassword(_) => crate::error::ErrorKind::User,
            MissingEndpointName => crate::error::ErrorKind::User,
            Io(_) => crate::error::ErrorKind::ClientDisconnect,
-            IpAddressNotAllowed => crate::error::ErrorKind::User,
+            IpAddressNotAllowed(_) => crate::error::ErrorKind::User,
            TooManyConnections => crate::error::ErrorKind::RateLimit,
            UserTimeout(_) => crate::error::ErrorKind::User,
        }
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -209,7 +209,7 @@ async fn auth_quirks(

    // check allowed list
    if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
-        return Err(auth::AuthError::ip_address_not_allowed());
+        return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr));
    }
    let cached_secret = match maybe_secret {
        Some(secret) => secret,
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -171,16 +171,8 @@ async fn task_main(
                    .context("failed to set socket option")?;

                info!(%peer_addr, "serving");
-                let mut ctx =
-                    RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
-                handle_client(
-                    &mut ctx,
-                    dest_suffix,
-                    tls_config,
-                    tls_server_end_point,
-                    socket,
-                )
-                .await
+                let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
+                handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
            }
            .unwrap_or_else(|e| {
                // Acknowledge that the task has finished with an error.
@@ -248,7 +240,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
 }

 async fn handle_client(
-    ctx: &mut RequestMonitoring,
+    mut ctx: RequestMonitoring,
    dest_suffix: Arc<String>,
    tls_config: Arc<rustls::ServerConfig>,
    tls_server_end_point: TlsServerEndPoint,
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -168,12 +168,11 @@ impl CancelClosure {
            cancel_token,
        }
    }
-
    /// Cancels the query running on user's compute node.
-    async fn try_cancel_query(self) -> Result<(), CancelError> {
+    pub async fn try_cancel_query(self) -> Result<(), CancelError> {
        let socket = TcpStream::connect(self.socket_addr).await?;
        self.cancel_token.cancel_query_raw(socket, NoTls).await?;
-
+        info!("query was cancelled");
        Ok(())
    }
 }
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -87,6 +87,22 @@ pub mod errors {
    impl ReportableError for ApiError {
        fn get_error_kind(&self) -> crate::error::ErrorKind {
            match self {
+                ApiError::Console {
+                    status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
+                    ..
+                } => crate::error::ErrorKind::User,
+                ApiError::Console {
+                    status: http::StatusCode::LOCKED,
+                    text,
+                } if text.contains("quota exceeded")
+                    || text.contains("the limit for current plan reached") =>
+                {
+                    crate::error::ErrorKind::User
+                }
+                ApiError::Console {
+                    status: http::StatusCode::TOO_MANY_REQUESTS,
+                    ..
+                } => crate::error::ErrorKind::ServiceRateLimit,
                ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
                ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
            }
@@ -222,7 +238,7 @@ pub mod errors {
            match self {
                WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
                WakeComputeError::ApiError(e) => e.get_error_kind(),
-                WakeComputeError::TimeoutError => crate::error::ErrorKind::RateLimit,
+                WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
            }
        }
    }
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -147,15 +147,13 @@ impl RequestMonitoring {
        self.success = true;
    }

-    pub fn log(&mut self) {
+    pub fn log(self) {}
+}
+
+impl Drop for RequestMonitoring {
+    fn drop(&mut self) {
        if let Some(tx) = self.sender.take() {
            let _: Result<(), _> = tx.send(self.clone());
        }
    }
 }
-
-impl Drop for RequestMonitoring {
-    fn drop(&mut self) {
-        self.log()
-    }
-}
--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -37,9 +37,12 @@ pub enum ErrorKind {
    /// Network error between user and proxy. Not necessarily user error
    ClientDisconnect,

-    /// Proxy self-imposed rate limits
+    /// Proxy self-imposed user rate limits
    RateLimit,

+    /// Proxy self-imposed service-wise rate limits
+    ServiceRateLimit,
+
    /// internal errors
    Service,

@@ -54,25 +57,12 @@ pub enum ErrorKind {
 }

 impl ErrorKind {
-    pub fn to_str(&self) -> &'static str {
-        match self {
-            ErrorKind::User => "request failed due to user error",
-            ErrorKind::ClientDisconnect => "client disconnected",
-            ErrorKind::RateLimit => "request cancelled due to rate limit",
-            ErrorKind::Service => "internal service error",
-            ErrorKind::ControlPlane => "non-retryable control plane error",
-            ErrorKind::Postgres => "postgres error",
-            ErrorKind::Compute => {
-                "non-retryable compute connection error (or exhausted retry capacity)"
-            }
-        }
-    }
-
    pub fn to_metric_label(&self) -> &'static str {
        match self {
            ErrorKind::User => "user",
            ErrorKind::ClientDisconnect => "clientdisconnect",
            ErrorKind::RateLimit => "ratelimit",
+            ErrorKind::ServiceRateLimit => "serviceratelimit",
            ErrorKind::Service => "service",
            ErrorKind::ControlPlane => "controlplane",
            ErrorKind::Postgres => "postgres",
@@ -85,12 +75,6 @@ pub trait ReportableError: fmt::Display + Send + 'static {
    fn get_error_kind(&self) -> ErrorKind;
 }

-impl ReportableError for tokio::time::error::Elapsed {
-    fn get_error_kind(&self) -> ErrorKind {
-        ErrorKind::RateLimit
-    }
-}
-
 impl ReportableError for tokio_postgres::error::Error {
    fn get_error_kind(&self) -> ErrorKind {
        if self.as_db_error().is_some() {
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -1,4 +1,5 @@
 use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
+use tracing::info;

 use std::future::poll_fn;
 use std::io;
@@ -39,42 +40,51 @@ where
    }
 }

-pub(super) async fn copy_bidirectional<A, B>(
-    a: &mut A,
-    b: &mut B,
+#[tracing::instrument(skip_all)]
+pub(super) async fn copy_bidirectional_client_compute<Client, Compute>(
+    client: &mut Client,
+    compute: &mut Compute,
 ) -> Result<(u64, u64), std::io::Error>
 where
-    A: AsyncRead + AsyncWrite + Unpin + ?Sized,
-    B: AsyncRead + AsyncWrite + Unpin + ?Sized,
+    Client: AsyncRead + AsyncWrite + Unpin + ?Sized,
+    Compute: AsyncRead + AsyncWrite + Unpin + ?Sized,
 {
-    let mut a_to_b = TransferState::Running(CopyBuffer::new());
-    let mut b_to_a = TransferState::Running(CopyBuffer::new());
+    let mut client_to_compute = TransferState::Running(CopyBuffer::new());
+    let mut compute_to_client = TransferState::Running(CopyBuffer::new());

    poll_fn(|cx| {
-        let mut a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?;
-        let mut b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?;
+        let mut client_to_compute_result =
+            transfer_one_direction(cx, &mut client_to_compute, client, compute)?;
+        let mut compute_to_client_result =
+            transfer_one_direction(cx, &mut compute_to_client, compute, client)?;

-        // Early termination checks
-        if let TransferState::Done(_) = a_to_b {
-            if let TransferState::Running(buf) = &b_to_a {
+        // Early termination checks from compute to client.
+        if let TransferState::Done(_) = compute_to_client {
+            if let TransferState::Running(buf) = &client_to_compute {
+                info!("Compute is done, terminate client");
                // Initiate shutdown
-                b_to_a = TransferState::ShuttingDown(buf.amt);
-                b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?;
+                client_to_compute = TransferState::ShuttingDown(buf.amt);
+                client_to_compute_result =
+                    transfer_one_direction(cx, &mut client_to_compute, client, compute)?;
            }
        }
-        if let TransferState::Done(_) = b_to_a {
-            if let TransferState::Running(buf) = &a_to_b {
+
+        // Early termination checks from client to compute.
+        if let TransferState::Done(_) = client_to_compute {
+            if let TransferState::Running(buf) = &compute_to_client {
+                info!("Client is done, terminate compute");
                // Initiate shutdown
-                a_to_b = TransferState::ShuttingDown(buf.amt);
-                a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?;
+                compute_to_client = TransferState::ShuttingDown(buf.amt);
+                compute_to_client_result =
+                    transfer_one_direction(cx, &mut compute_to_client, compute, client)?;
            }
        }

        // It is not a problem if ready! returns early ... (comment remains the same)
-        let a_to_b = ready!(a_to_b_result);
-        let b_to_a = ready!(b_to_a_result);
+        let client_to_compute = ready!(client_to_compute_result);
+        let compute_to_client = ready!(compute_to_client_result);

-        Poll::Ready(Ok((a_to_b, b_to_a)))
+        Poll::Ready(Ok((client_to_compute, compute_to_client)))
    })
    .await
 }
@@ -219,38 +229,46 @@ mod tests {
    use tokio::io::AsyncWriteExt;

    #[tokio::test]
-    async fn test_early_termination_a_to_d() {
-        let (mut a_mock, mut b_mock) = tokio::io::duplex(8); // Create a mock duplex stream
-        let (mut c_mock, mut d_mock) = tokio::io::duplex(32); // Create a mock duplex stream
+    async fn test_client_to_compute() {
+        let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream
+        let (mut compute_proxy, mut compute_client) = tokio::io::duplex(32); // Create a mock duplex stream

        // Simulate 'a' finishing while there's still data for 'b'
-        a_mock.write_all(b"hello").await.unwrap();
-        a_mock.shutdown().await.unwrap();
-        d_mock.write_all(b"Neon Serverless Postgres").await.unwrap();
+        client_client.write_all(b"hello").await.unwrap();
+        client_client.shutdown().await.unwrap();
+        compute_client.write_all(b"Neon").await.unwrap();
+        compute_client.shutdown().await.unwrap();

-        let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap();
+        let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy)
+            .await
+            .unwrap();

        // Assert correct transferred amounts
-        let (a_to_d_count, d_to_a_count) = result;
-        assert_eq!(a_to_d_count, 5); // 'hello' was transferred
-        assert!(d_to_a_count <= 8); // response only partially transferred or not at all
+        let (client_to_compute_count, compute_to_client_count) = result;
+        assert_eq!(client_to_compute_count, 5); // 'hello' was transferred
+        assert_eq!(compute_to_client_count, 4); // 'Neon' was transferred in full
    }

    #[tokio::test]
-    async fn test_early_termination_d_to_a() {
-        let (mut a_mock, mut b_mock) = tokio::io::duplex(32); // Create a mock duplex stream
-        let (mut c_mock, mut d_mock) = tokio::io::duplex(8); // Create a mock duplex stream
+    async fn test_compute_to_client() {
+        let (mut client_client, mut client_proxy) = tokio::io::duplex(32); // Create a mock duplex stream
+        let (mut compute_proxy, mut compute_client) = tokio::io::duplex(8); // Create a mock duplex stream

        // Simulate 'a' finishing while there's still data for 'b'
-        d_mock.write_all(b"hello").await.unwrap();
-        d_mock.shutdown().await.unwrap();
-        a_mock.write_all(b"Neon Serverless Postgres").await.unwrap();
+        compute_client.write_all(b"hello").await.unwrap();
+        compute_client.shutdown().await.unwrap();
+        client_client
+            .write_all(b"Neon Serverless Postgres")
+            .await
+            .unwrap();

-        let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap();
+        let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy)
+            .await
+            .unwrap();

        // Assert correct transferred amounts
-        let (a_to_d_count, d_to_a_count) = result;
-        assert_eq!(d_to_a_count, 5); // 'hello' was transferred
-        assert!(a_to_d_count <= 8); // response only partially transferred or not at all
+        let (client_to_compute_count, compute_to_client_count) = result;
+        assert_eq!(compute_to_client_count, 5); // 'hello' was transferred
+        assert!(client_to_compute_count <= 8); // response only partially transferred or not at all
    }
 }
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -46,7 +46,11 @@ pub async fn proxy_pass(

    // Starting from here we only proxy the client's traffic.
    info!("performing the proxy pass...");
-    let _ = crate::proxy::copy_bidirectional::copy_bidirectional(&mut client, &mut compute).await?;
+    let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute(
+        &mut client,
+        &mut compute,
+    )
+    .await?;

    Ok(())
 }
@@ -63,6 +67,8 @@ pub struct ProxyPassthrough<S> {

 impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
    pub async fn proxy_pass(self) -> anyhow::Result<()> {
-        proxy_pass(self.client, self.compute.stream, self.aux).await
+        let res = proxy_pass(self.client, self.compute.stream, self.aux).await;
+        let cancelled = self.compute.cancel_closure.try_cancel_query().await;
+        res.and(cancelled.map_err(anyhow::Error::from)) // a proxy-pass error takes precedence
    }
 }
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -88,7 +88,10 @@ pub async fn task_main(
            return Ok(());
        }
    };
-    let tls_acceptor: tokio_rustls::TlsAcceptor = tls_config.to_server_config().into();
+    let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config());
+    // prefer http2, but support http/1.1
+    tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()];
+    let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into();

    let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
    let _ = addr_incoming.set_nodelay(true);
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -32,7 +32,7 @@ impl PoolingBackend {
        let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone());
        let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
        if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
-            return Err(AuthError::ip_address_not_allowed());
+            return Err(AuthError::ip_address_not_allowed(ctx.peer_addr));
        }
        let cached_secret = match maybe_secret {
            Some(secret) => secret,
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -12,7 +12,7 @@ use hyper::StatusCode;
 use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Value;
-use tokio::join;
+use tokio::try_join;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
 use tokio_postgres::GenericClient;
@@ -32,11 +32,9 @@ use crate::auth::ComputeUserInfoParseError;
 use crate::config::ProxyConfig;
 use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
-use crate::error::ReportableError;
 use crate::metrics::HTTP_CONTENT_LENGTH;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
-use crate::serverless::backend::HttpConnError;
 use crate::DbName;
 use crate::RoleName;

@@ -166,9 +164,12 @@ fn get_conn_info(
    let mut options = Option::None;

    for (key, value) in pairs {
-        if key == "options" {
-            options = Some(NeonOptions::parse_options_raw(&value));
-            break;
+        match &*key {
+            "options" => {
+                options = Some(NeonOptions::parse_options_raw(&value));
+            }
+            "application_name" => ctx.set_application(Some(value.into())),
+            _ => {}
        }
    }

@@ -284,8 +285,10 @@ pub async fn handle(
                )?
            }
        },
-        Err(e) => {
-            ctx.set_error_kind(e.get_error_kind());
+        Err(_) => {
+            // TODO: when http error classification is done, distinguish between
+            // timeout on sql vs timeout in proxy/cplane
+            // ctx.set_error_kind(crate::error::ErrorKind::RateLimit);

            let message = format!(
                "HTTP-Connection timed out, execution time exeeded {} seconds",
@@ -399,16 +402,11 @@ async fn handle_inner(
        // not strictly necessary to mark success here,
        // but it's just insurance for if we forget it somewhere else
        ctx.latency_timer.success();
-        Ok::<_, HttpConnError>(client)
+        Ok::<_, anyhow::Error>(client)
    };

    // Run both operations in parallel
-    let (payload_result, auth_and_connect_result) =
-        join!(fetch_and_process_request, authenticate_and_connect,);
-
-    // Handle the results
-    let payload = payload_result?; // Handle errors appropriately
-    let mut client = auth_and_connect_result?; // Handle errors appropriately
+    let (payload, mut client) = try_join!(fetch_and_process_request, authenticate_and_connect)?;

    let mut response = Response::builder()
        .status(StatusCode::OK)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,17 +38,22 @@ pytest-rerunfailures = "^13.0"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"
 zstandard = "^0.21.0"
+httpx = {extras = ["http2"], version = "^0.26.0"}
+pytest-repeat = "^0.9.3"

 [tool.poetry.group.dev.dependencies]
 mypy = "==1.3.0"
-ruff = "^0.1.11"
+ruff = "^0.2.2"

 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"

 [tool.mypy]
-exclude = "^vendor/"
+exclude = [
+    "^vendor/",
+    "^target/",
+]
 check_untyped_defs = true
 # Help mypy find imports when running against list of individual files.
 # Without this line it would behave differently when executed on the entire project.
@@ -72,7 +77,13 @@ ignore_missing_imports = true

 [tool.ruff]
 target-version = "py39"
-extend-exclude = ["vendor/"]
+extend-exclude = [
+    "vendor/",
+    "target/",
+]
+line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter
+
+[tool.ruff.lint]
 ignore = [
    "E501", # Line too long, we don't want to be too strict about it
 ]
@@ -83,4 +94,3 @@ select = [
    "W", # pycodestyle
    "B", # bugbear
 ]
-line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -166,6 +166,10 @@ struct Args {
    /// useful for debugging.
    #[arg(long)]
    current_thread_runtime: bool,
+    /// Keep horizon for walsenders, i.e. don't remove WAL segments that are
+    /// still needed for existing replication connection.
+    #[arg(long)]
+    walsenders_keep_horizon: bool,
 }

 // Like PathBufValueParser, but allows empty string.
@@ -295,6 +299,7 @@ async fn main() -> anyhow::Result<()> {
        pg_tenant_only_auth,
        http_auth,
        current_thread_runtime: args.current_thread_runtime,
+        walsenders_keep_horizon: args.walsenders_keep_horizon,
    };

    // initialize sentry if SENTRY_DSN is provided
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -78,6 +78,7 @@ pub struct SafeKeeperConf {
    pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
    pub http_auth: Option<Arc<SwappableJwtAuth>>,
    pub current_thread_runtime: bool,
+    pub walsenders_keep_horizon: bool,
 }

 impl SafeKeeperConf {
@@ -121,6 +122,7 @@ impl SafeKeeperConf {
            heartbeat_timeout: Duration::new(5, 0),
            max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
            current_thread_runtime: false,
+            walsenders_keep_horizon: false,
        }
    }
 }
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -695,9 +695,11 @@ impl Collector for TimelineCollector {

        // report total number of timelines
        self.timelines_count.set(timelines_count as i64);
+        mfs.extend(self.timelines_count.collect());
+
        self.active_timelines_count
            .set(active_timelines_count as i64);
-        mfs.extend(self.timelines_count.collect());
+        mfs.extend(self.active_timelines_count.collect());

        mfs
    }
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -4,7 +4,7 @@ use anyhow::{bail, Context, Result};
 use byteorder::{LittleEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};

-use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE};
+use postgres_ffi::{TimeLineID, MAX_SEND_SIZE};
 use serde::{Deserialize, Serialize};
 use std::cmp::max;
 use std::cmp::min;
@@ -946,28 +946,12 @@ where
        }
        Ok(())
    }
-
-    /// Get oldest segno we still need to keep. We hold WAL till it is consumed
-    /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
-    /// offloading.
-    /// While it is safe to use inmem values for determining horizon,
-    /// we use persistent to make possible normal states less surprising.
-    pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo {
-        let mut horizon_lsn = min(
-            self.state.remote_consistent_lsn,
-            self.state.peer_horizon_lsn,
-        );
-        if wal_backup_enabled {
-            horizon_lsn = min(horizon_lsn, self.state.backup_lsn);
-        }
-        horizon_lsn.segment_number(self.state.server.wal_seg_size as usize)
-    }
 }

 #[cfg(test)]
 mod tests {
    use futures::future::BoxFuture;
-    use postgres_ffi::WAL_SEGMENT_SIZE;
+    use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE};

    use super::*;
    use crate::{
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -136,6 +136,21 @@ impl WalSenders {
        self.mutex.lock().slots.iter().flatten().cloned().collect()
    }

+    /// Get LSN of the most lagging pageserver receiver. Return None if there are no
+    /// active walsenders.
+    pub fn laggard_lsn(self: &Arc<WalSenders>) -> Option<Lsn> {
+        self.mutex
+            .lock()
+            .slots
+            .iter()
+            .flatten()
+            .filter_map(|s| match s.feedback {
+                ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn),
+                ReplicationFeedback::Standby(_) => None,
+            })
+            .min()
+    }
+
    /// Get aggregated pageserver feedback.
    pub fn get_ps_feedback(self: &Arc<WalSenders>) -> PageserverFeedback {
        self.mutex.lock().agg_ps_feedback
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -286,6 +286,29 @@ impl SharedState {
            .cloned()
            .collect()
    }
+
+    /// Get oldest segno we still need to keep. We hold WAL till it is consumed
+    /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3
+    /// offloading 4) `extra_horizon_lsn`, if the caller passes one.
+    /// While it is safe to use inmem values for determining horizon,
+    /// we use persistent to make possible normal states less surprising.
+    fn get_horizon_segno(
+        &self,
+        wal_backup_enabled: bool,
+        extra_horizon_lsn: Option<Lsn>,
+    ) -> XLogSegNo {
+        let state = &self.sk.state;
+
+        use std::cmp::min;
+        let mut horizon_lsn = min(state.remote_consistent_lsn, state.peer_horizon_lsn);
+        if wal_backup_enabled {
+            horizon_lsn = min(horizon_lsn, state.backup_lsn);
+        }
+        if let Some(extra_horizon_lsn) = extra_horizon_lsn {
+            horizon_lsn = min(horizon_lsn, extra_horizon_lsn);
+        }
+        horizon_lsn.segment_number(state.server.wal_seg_size as usize)
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -353,6 +376,12 @@ pub struct Timeline {

    /// Directory where timeline state is stored.
    pub timeline_dir: Utf8PathBuf,
+
+    /// Should we keep WAL on disk for active replication connections.
+    /// Especially useful for sharding, when different shards process WAL
+    /// with different speed.
+    // TODO: add `Arc<SafeKeeperConf>` here instead of adding each field separately.
+    walsenders_keep_horizon: bool,
 }

 impl Timeline {
@@ -386,6 +415,7 @@ impl Timeline {
            cancellation_rx,
            cancellation_tx,
            timeline_dir: conf.timeline_dir(&ttid),
+            walsenders_keep_horizon: conf.walsenders_keep_horizon,
        })
    }

@@ -418,6 +448,7 @@ impl Timeline {
            cancellation_rx,
            cancellation_tx,
            timeline_dir: conf.timeline_dir(&ttid),
+            walsenders_keep_horizon: conf.walsenders_keep_horizon,
        })
    }

@@ -817,10 +848,20 @@ impl Timeline {
            bail!(TimelineError::Cancelled(self.ttid));
        }

+        // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon.
+        // This allows to get better read speed for pageservers that are lagging behind,
+        // at the cost of keeping more WAL on disk.
+        let replication_horizon_lsn = if self.walsenders_keep_horizon {
+            self.walsenders.laggard_lsn()
+        } else {
+            None
+        };
+
        let horizon_segno: XLogSegNo;
        let remover = {
            let shared_state = self.write_shared_state().await;
-            horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled);
+            horizon_segno =
+                shared_state.get_horizon_segno(wal_backup_enabled, replication_horizon_lsn);
            if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
                return Ok(()); // nothing to do
            }
--- a/Show More
+++ b/Show More