migrate script

2026-05-22 15:41:15 +00:00 · 2023-12-25 23:04:34 +03:00
115 changed files with 2048 additions and 6716 deletions
--- a/.config/nextest.toml
+++ b/.config/nextest.toml
@@ -1,2 +0,0 @@
-[profile.default]
-slow-timeout = "1m"
--- a/.github/workflows/build_and_push_docker_image.yml
+++ b/.github/workflows/build_and_push_docker_image.yml
@@ -1,105 +0,0 @@
-name: Build and Push Docker Image
-
-on:
-  workflow_call:
-    inputs:
-      dockerfile-path:
-        required: true
-        type: string
-      image-name:
-        required: true
-        type: string
-    outputs:
-      build-tools-tag:
-        description: "tag generated for build tools"
-        value: ${{ jobs.tag.outputs.build-tools-tag }}
-
-jobs:
-  check-if-build-tools-dockerfile-changed:
-    runs-on: ubuntu-latest
-    outputs:
-      docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }}
-    steps:
-      - name: Check if Dockerfile.buildtools has changed
-        id: dockerfile
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then
-            echo "docker_file_changed=false" >> $GITHUB_OUTPUT
-            exit
-          fi
-          updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only)
-          if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then
-            echo "docker_file_changed=true" >> $GITHUB_OUTPUT
-          fi
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  tag:
-    runs-on: ubuntu-latest
-    needs: [ check-if-build-tools-dockerfile-changed ]
-    outputs:
-      build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}}
-
-    steps:
-      - name: Get buildtools tag
-        env:
-          DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }}
-        run: |
-          if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then
-            IMAGE_TAG=$GITHUB_RUN_ID
-          else
-            IMAGE_TAG=pinned
-          fi
-
-          echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
-        shell: bash
-        id: buildtools-tag
-
-  kaniko:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, x64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64
-
-  kaniko-arm:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    needs: [ tag, check-if-build-tools-dockerfile-changed ]
-    runs-on: [ self-hosted, dev, arm64 ]
-    container: gcr.io/kaniko-project/executor:v1.7.0-debug
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v1
-
-      - name: Configure ECR login
-        run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
-
-      - name: Kaniko build
-        run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-  manifest:
-    if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true'
-    name: 'manifest'
-    runs-on: [ self-hosted, dev, x64 ]
-    needs:
-      - tag
-      - kaniko
-      - kaniko-arm
-      - check-if-build-tools-dockerfile-changed
-
-    steps:
-      - name: Create manifest
-        run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64
-
-      - name: Push manifest
-        run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -44,6 +44,7 @@ jobs:

        exit 1

+
  tag:
    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
@@ -73,19 +74,11 @@ jobs:
        shell: bash
        id: build-tag

-  build-buildtools-image:
-    needs: [ check-permissions ]
-    uses: ./.github/workflows/build_and_push_docker_image.yml
-    with:
-      dockerfile-path: Dockerfile.buildtools
-      image-name: build-tools
-    secrets: inherit
-
  check-codestyle-python:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

    steps:
@@ -115,10 +108,10 @@ jobs:
        run: poetry run mypy .

  check-codestyle-rust:
-    needs: [ check-permissions, build-buildtools-image ]
+    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

    steps:
@@ -182,10 +175,10 @@ jobs:
        run: cargo deny check --hide-inclusion-graph

  build-neon:
-    needs: [ check-permissions, tag, build-buildtools-image ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    strategy:
      fail-fast: false
@@ -339,16 +332,16 @@ jobs:
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

-      - name: Run rust tests
+      - name: Run cargo test
        run: |
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3

          # Run separate tests for real Azure Blob Storage
          # XXX: replace region with `eu-central-1`-like region
@@ -358,7 +351,7 @@ jobs:
          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure

      - name: Install rust binaries
        run: |
@@ -415,10 +408,10 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    needs: [ check-permissions, build-neon, build-buildtools-image, tag ]
+    needs: [ check-permissions, build-neon, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
    strategy:
@@ -454,10 +447,10 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  benchmarks:
-    needs: [ check-permissions, build-neon, build-buildtools-image ]
+    needs: [ check-permissions, build-neon ]
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      # Default shared memory is 64mb
      options: --init --shm-size=512mb
    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
@@ -486,12 +479,12 @@ jobs:
      # while coverage is currently collected for the debug ones

  create-test-report:
-    needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests, coverage-report, benchmarks ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}

    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init

    steps:
@@ -533,10 +526,11 @@ jobs:
            })

  coverage-report:
-    needs: [ check-permissions, regress-tests, build-buildtools-image ]
+    needs: [ check-permissions, regress-tests ]
+
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    strategy:
      fail-fast: false
@@ -700,7 +694,7 @@ jobs:
            }"

  neon-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
@@ -739,7 +733,6 @@ jobs:
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-                           --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
@@ -750,7 +743,7 @@ jobs:

  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
    container: gcr.io/kaniko-project/executor:v1.9.2-debug
    defaults:
      run:
@@ -785,7 +778,6 @@ jobs:
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-tools
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -796,7 +788,7 @@ jobs:
        run: rm -rf ~/.ecr

  compute-node-image:
-    needs: [ check-permissions, build-buildtools-image, tag ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: gcr.io/kaniko-project/executor:v1.9.2-debug
@@ -844,7 +836,6 @@ jobs:
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
                           --build-arg PG_VERSION=${{ matrix.version }}
                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
-                           --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --dockerfile Dockerfile.compute-node
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -218,7 +218,7 @@ jobs:

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
          export REMOTE_STORAGE_S3_REGION=eu-central-1
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
--- a/.github/workflows/update_build_tools_image.yml
+++ b/.github/workflows/update_build_tools_image.yml
@@ -1,130 +0,0 @@
-name: 'Update build tools image tag'
-
-# This workflow it used to update tag of build tools in ECR.
-# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image.
-
-on:
-  workflow_dispatch:
-    inputs:
-      from-tag:
-        description: 'Source tag'
-        required: true
-        type: string
-      to-tag:
-        description: 'Destination tag'
-        required: true
-        type: string
-        default: 'pinned'
-
-defaults:
-  run:
-    shell: bash -euo pipefail {0}
-
-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
-
-permissions: {}
-
-jobs:
-  tag-image:
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-    outputs:
-      next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }}
-      prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Get source image digest
-        id: next-digest
-        run: |
-          NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true)
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist"
-            exit 1
-          fi
-
-          echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}"
-          echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT
-
-      - name: Get destination image digest (if already exists)
-        id: prev-digest
-        run: |
-          PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true)
-          if [ -z "${PREV_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)"
-          else
-            echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}"
-
-            echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Tag image
-        run: |
-          crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}"
-
-  rollback-tag-image:
-    needs:  tag-image
-    if: ${{ !success() }}
-
-    runs-on: [ self-hosted, gen3, small ]
-    container: golang:1.19-bullseye
-
-    env:
-      IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools
-      FROM_TAG: ${{ inputs.from-tag }}
-      TO_TAG: ${{ inputs.to-tag }}
-
-    steps:
-      - name: Install Crane & ECR helper
-        run: |
-          go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1
-          go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1
-
-      - name: Configure ECR login
-        run: |
-          mkdir /github/home/.docker/
-          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
-
-      - name: Restore previous tag if needed
-        run: |
-          NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}"
-          PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}"
-
-          if [ -z "${NEXT_DIGEST}" ]; then
-            echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback"
-            exit 0
-          fi
-
-          if [ -z "${PREV_DIGEST}" ]; then
-            # I guess we should delete the tag here/untag the image, but crane does not support it
-            # - https://github.com/google/go-containerregistry/issues/999
-
-            echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback"
-
-            exit 0
-          fi
-
-          CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}")
-          if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then
-            crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}"
-
-            echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}"
-          else
-            echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored"
-          fi
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,6 @@ __pycache__/
 test_output/
 .vscode
 .idea
-neon.iml
 /.neon
 /integration_tests/.neon

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -70,17 +70,3 @@ We're using the following approach to make it work:
 - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review)

 For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml)
-
-## How do I add the "pinned" tag to an buildtools image?
-We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation.
-
-You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml,
-or using GitHub CLI:
-
-```bash
-gh workflow -R neondatabase/neon run update_build_tools_image.yml \
-            -f from-tag=6254913013 \
-            -f to-tag=pinned \
-
-# Default `-f to-tag` is `pinned`, so the parameter can be omitted.
-```
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1161,7 +1161,6 @@ dependencies = [
 "flate2",
 "futures",
 "hyper",
- "nix 0.26.2",
 "notify",
 "num_cpus",
 "opentelemetry",
@@ -1169,10 +1168,8 @@ dependencies = [
 "regex",
 "remote_storage",
 "reqwest",
- "rust-ini",
 "serde",
 "serde_json",
- "signal-hook",
 "tar",
 "tokio",
 "tokio-postgres",
@@ -1204,26 +1201,6 @@ version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f"

-[[package]]
-name = "const-random"
-version = "0.1.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a"
-dependencies = [
- "const-random-macro",
-]
-
-[[package]]
-name = "const-random-macro"
-version = "0.1.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
-dependencies = [
- "getrandom 0.2.11",
- "once_cell",
- "tiny-keccak",
-]
-
 [[package]]
 name = "const_fn"
 version = "0.4.9"
@@ -1456,12 +1433,6 @@ dependencies = [
 "winapi",
 ]

-[[package]]
-name = "crunchy"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
-
 [[package]]
 name = "crypto-bigint"
 version = "0.4.9"
@@ -1604,15 +1575,6 @@ dependencies = [
 "syn 2.0.32",
 ]

-[[package]]
-name = "dlv-list"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f"
-dependencies = [
- "const-random",
-]
-
 [[package]]
 name = "dyn-clone"
 version = "1.0.14"
@@ -2144,20 +2106,6 @@ dependencies = [
 "hashbrown 0.13.2",
 ]

-[[package]]
-name = "hdrhistogram"
-version = "7.5.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
-dependencies = [
- "base64 0.21.1",
- "byteorder",
- "crossbeam-channel",
- "flate2",
- "nom",
- "num-traits",
-]
-
 [[package]]
 name = "heapless"
 version = "0.8.0"
@@ -3081,16 +3029,6 @@ dependencies = [
 "tokio-stream",
 ]

-[[package]]
-name = "ordered-multimap"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f"
-dependencies = [
- "dlv-list",
- "hashbrown 0.14.0",
-]
-
 [[package]]
 name = "os_info"
 version = "3.7.0"
@@ -3119,28 +3057,6 @@ dependencies = [
 "sha2",
 ]

-[[package]]
-name = "pagebench"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "clap",
- "futures",
- "hdrhistogram",
- "humantime",
- "humantime-serde",
- "pageserver",
- "pageserver_api",
- "pageserver_client",
- "rand 0.8.5",
- "serde",
- "serde_json",
- "tokio",
- "tracing",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "pagectl"
 version = "0.1.0"
@@ -4264,16 +4180,6 @@ dependencies = [
 "unicode-ident",
 ]

-[[package]]
-name = "rust-ini"
-version = "0.20.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a"
-dependencies = [
- "cfg-if",
- "ordered-multimap",
-]
-
 [[package]]
 name = "rustc-demangle"
 version = "0.1.23"
@@ -4405,14 +4311,12 @@ dependencies = [
 "async-stream",
 "aws-config",
 "aws-sdk-s3",
- "aws-smithy-async",
 "bincode",
 "bytes",
 "chrono",
 "clap",
 "crc32c",
 "either",
- "futures",
 "futures-util",
 "hex",
 "histogram",
@@ -4451,7 +4355,6 @@ dependencies = [
 "clap",
 "const_format",
 "crc32c",
- "fail",
 "fs2",
 "futures",
 "git-version",
@@ -5231,15 +5134,6 @@ dependencies = [
 "time-core",
 ]

-[[package]]
-name = "tiny-keccak"
-version = "2.0.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
-dependencies = [
- "crunchy",
-]
-
 [[package]]
 name = "tinytemplate"
 version = "1.2.1"
@@ -5883,7 +5777,6 @@ dependencies = [
 "chrono",
 "const_format",
 "criterion",
- "fail",
 "futures",
 "heapless",
 "hex",
@@ -6408,7 +6301,6 @@ dependencies = [
 "futures-io",
 "futures-sink",
 "futures-util",
- "getrandom 0.2.11",
 "hex",
 "hmac",
 "hyper",
@@ -6420,7 +6312,6 @@ dependencies = [
 "num-bigint",
 "num-integer",
 "num-traits",
- "once_cell",
 "prost",
 "rand 0.8.5",
 "regex",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,6 @@ members = [
    "pageserver",
    "pageserver/ctl",
    "pageserver/client",
-    "pageserver/pagebench",
    "proxy",
    "safekeeper",
    "storage_broker",
@@ -80,7 +79,6 @@ futures-util = "0.3"
 git-version = "0.3"
 hashbrown = "0.13"
 hashlink = "0.8.1"
-hdrhistogram = "7.5.2"
 hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
--- a/2
+++ b/2
@@ -3,7 +3,7 @@
 ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
 ### inside this image in the real deployments.
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned

 # Build Postgres
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -1,166 +0,0 @@
-FROM debian:bullseye-slim
-
-# Add nonroot user
-RUN useradd -ms /bin/bash nonroot -b /home
-SHELL ["/bin/bash", "-c"]
-
-# System deps
-RUN set -e \
-    && apt update \
-    && apt install -y \
-        autoconf \
-        automake \
-        bison \
-        build-essential \
-        ca-certificates \
-        cmake \
-        curl \
-        flex \
-        git \
-        gnupg \
-        gzip \
-        jq \
-        libcurl4-openssl-dev \
-        libbz2-dev \
-        libffi-dev \
-        liblzma-dev \
-        libncurses5-dev \
-        libncursesw5-dev \
-        libpq-dev \
-        libreadline-dev \
-        libseccomp-dev \
-        libsqlite3-dev \
-        libssl-dev \
-        libstdc++-10-dev \
-        libtool \
-        libxml2-dev \
-        libxmlsec1-dev \
-        libxxhash-dev \
-        lsof \
-        make \
-        netcat \
-        net-tools \
-        openssh-client \
-        parallel \
-        pkg-config \
-        unzip \
-        wget \
-        xz-utils \
-        zlib1g-dev \
-        zstd \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# protobuf-compiler (protoc)
-ENV PROTOC_VERSION 25.1
-RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
-    && unzip -q protoc.zip -d protoc \
-    && mv protoc/bin/protoc /usr/local/bin/protoc \
-    && mv protoc/include/google /usr/local/include/google \
-    && rm -rf protoc.zip protoc
-
-# LLVM
-ENV LLVM_VERSION=17
-RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
-    && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
-    && apt update \
-    && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
-    && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# PostgreSQL 14
-RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
-    && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
-    && apt update \
-    && apt install -y postgresql-client-14 \
-    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# AWS CLI
-RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
-    && unzip -q awscliv2.zip \
-    && ./aws/install \
-    && rm awscliv2.zip
-
-# Mold: A Modern Linker
-ENV MOLD_VERSION v2.4.0
-RUN set -e \
-    && git clone https://github.com/rui314/mold.git \
-    && mkdir mold/build \
-    && cd mold/build \
-    && git checkout ${MOLD_VERSION} \
-    && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \
-    && cmake --build . -j $(nproc) \
-    && cmake --install . \
-    && cd .. \
-    && rm -rf mold
-
-# LCOV
-# Build lcov from a fork:
-# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master)
-# And patches from us:
-# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz)
-RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \
-    && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \
-    && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992  lcov.tar.gz" | sha256sum --check \
-    && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \
-    && cd lcov \
-    && make install \
-    && rm -rf ../lcov.tar.gz
-
-# Switch to nonroot user
-USER nonroot:nonroot
-WORKDIR /home/nonroot
-
-# Python
-ENV PYTHON_VERSION=3.9.2 \
-    PYENV_ROOT=/home/nonroot/.pyenv \
-    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
-RUN set -e \
-    && cd $HOME \
-    && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \
-    && chmod +x pyenv-installer \
-    && ./pyenv-installer \
-    && export PYENV_ROOT=/home/nonroot/.pyenv \
-    && export PATH="$PYENV_ROOT/bin:$PATH" \
-    && export PATH="$PYENV_ROOT/shims:$PATH" \
-    && pyenv install ${PYTHON_VERSION} \
-    && pyenv global ${PYTHON_VERSION} \
-    && python --version \
-    && pip install --upgrade pip \
-    && pip --version \
-    && pip install pipenv wheel poetry
-
-# Switch to nonroot user (again)
-USER nonroot:nonroot
-WORKDIR /home/nonroot
-
-# Rust
-# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.74.0
-ENV RUSTUP_HOME="/home/nonroot/.rustup"
-ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
-RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
-	chmod +x rustup-init && \
-	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
-	rm rustup-init && \
-    export PATH="$HOME/.cargo/bin:$PATH" && \
-    . "$HOME/.cargo/env" && \
-    cargo --version && rustup --version && \
-    rustup component add llvm-tools-preview rustfmt clippy && \
-    cargo install --git https://github.com/paritytech/cachepot && \
-    cargo install rustfilt && \
-    cargo install cargo-hakari && \
-    cargo install cargo-deny && \
-    cargo install cargo-hack && \
-    cargo install cargo-nextest && \
-    rm -rf /home/nonroot/.cargo/registry && \
-    rm -rf /home/nonroot/.cargo/git
-ENV RUSTC_WRAPPER=cachepot
-
-# Show versions
-RUN whoami \
-    && python --version \
-    && pip --version \
-    && cargo --version --verbose \
-    && rustup --version --verbose \
-    && rustc --version --verbose \
-    && clang --version
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -1,6 +1,6 @@
 ARG PG_VERSION
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG

@@ -48,29 +48,7 @@ RUN cd postgres && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
-    # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
-    # In vanilla postgres this function is limited to Postgres role superuser.
-    # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
-    # We could add the additional grant statements to the postgres repository but it would be hard to maintain, 
-    # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
-    # so we do it here.
-    old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \
-    # the first loop is for pg_stat_statement extension version <= 1.6
-    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-        filename=$(basename "$file"); \
-        if echo "$old_list" | grep -q -F "$filename"; then \
-            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
-        fi; \
-    done; \
-    # the second loop is for pg_stat_statement extension versions >= 1.7, 
-    # where pg_stat_statement_reset() got 3 additional arguments
-    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-        filename=$(basename "$file"); \
-        if ! echo "$old_list" | grep -q -F "$filename"; then \
-            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
-        fi; \
-    done      
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control

 #########################################################################################
 #
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -1,7 +1,7 @@
 # First transient image to build compute_tools binaries
 # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
 ARG REPOSITORY=neondatabase
-ARG IMAGE=build-tools
+ARG IMAGE=rust
 ARG TAG=pinned
 ARG BUILD_TAG

--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -13,7 +13,6 @@ clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
 hyper = { workspace = true, features = ["full"] }
-nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
 opentelemetry.workspace = true
@@ -21,7 +20,6 @@ postgres.workspace = true
 regex.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-signal-hook.workspace = true
 tar.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
@@ -41,4 +39,3 @@ remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
 bytes = "1.0"
-rust-ini = "0.20.0"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -31,31 +31,25 @@
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
 //!             -b /usr/local/bin/postgres \
-//!             -r http://pg-ext-s3-gateway \
-//!             --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable'
-//!             --pgbouncer-ini-path /etc/pgbouncer.ini \
+//!             -r http://pg-ext-s3-gateway
 //! ```
 //!
 use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
-use std::sync::atomic::Ordering;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
-use nix::sys::signal::{kill, Signal};
-use signal_hook::consts::{SIGQUIT, SIGTERM};
-use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info};
 use url::Url;

 use compute_api::responses::ComputeStatus;

-use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
+use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
@@ -71,13 +65,6 @@ const BUILD_TAG_DEFAULT: &str = "latest";
 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
-    thread::spawn(move || {
-        for sig in signals.forever() {
-            handle_exit_signal(sig);
-        }
-    });
-
    let build_tag = option_env!("BUILD_TAG")
        .unwrap_or(BUILD_TAG_DEFAULT)
        .to_string();
@@ -112,9 +99,6 @@ fn main() -> Result<()> {
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");

-    let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
-    let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
-
    // Extract OpenTelemetry context for the startup actions from the
    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
    // tracing context.
@@ -225,8 +209,6 @@ fn main() -> Result<()> {
        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
-        pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
-        pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
    };
    let compute = Arc::new(compute_node);

@@ -357,7 +339,6 @@ fn main() -> Result<()> {
        let ecode = pg
            .wait()
            .expect("failed to start waiting on Postgres process");
-        PG_PID.store(0, Ordering::SeqCst);
        info!("Postgres exited with code {}, shutting down", ecode);
        exit_code = ecode.code()
    }
@@ -512,41 +493,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            Arg::new("pgbouncer-connstr")
-                .long("pgbouncer-connstr")
-                .default_value(
-                    "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
-                )
-                .value_name("PGBOUNCER_CONNSTR"),
-        )
-        .arg(
-            Arg::new("pgbouncer-ini-path")
-                .long("pgbouncer-ini-path")
-                // Note: this doesn't match current path for pgbouncer.ini.
-                // Until we fix it, we need to pass the path explicitly
-                // or this will be effectively no-op.
-                .default_value("/etc/pgbouncer.ini")
-                .value_name("PGBOUNCER_INI_PATH"),
-        )
-}
-
-/// When compute_ctl is killed, send also termination signal to sync-safekeepers
-/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
-/// wait for termination which would be easy then.
-fn handle_exit_signal(sig: i32) {
-    info!("received {sig} termination signal");
-    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
-    if ss_pid != 0 {
-        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
-        kill(ss_pid, Signal::SIGTERM).ok();
-    }
-    let pg_pid = PG_PID.load(Ordering::SeqCst);
-    if pg_pid != 0 {
-        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        kill(pg_pid, Signal::SIGTERM).ok();
-    }
-    exit(1);
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -6,10 +6,7 @@ use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::atomic::AtomicU32;
-use std::sync::atomic::Ordering;
 use std::sync::{Condvar, Mutex, RwLock};
-use std::thread;
 use std::time::Instant;

 use anyhow::{Context, Result};
@@ -36,9 +33,6 @@ use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
 use crate::{config, extension_server};

-pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
-pub static PG_PID: AtomicU32 = AtomicU32::new(0);
-
 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
    // Url type maintains proper escaping
@@ -70,10 +64,6 @@ pub struct ComputeNode {
    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
    pub build_tag: String,
-    // connection string to pgbouncer to change settings
-    pub pgbouncer_connstr: Option<String>,
-    // path to pgbouncer.ini to change settings
-    pub pgbouncer_ini_path: Option<String>,
 }

 // store some metrics about download size that might impact startup time
@@ -506,7 +496,6 @@ impl ComputeNode {
            .stdout(Stdio::piped())
            .spawn()
            .expect("postgres --sync-safekeepers failed to start");
-        SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);

        // `postgres --sync-safekeepers` will print all log output to stderr and
        // final LSN to stdout. So we pipe only stdout, while stderr will be automatically
@@ -514,7 +503,6 @@ impl ComputeNode {
        let sync_output = sync_handle
            .wait_with_output()
            .expect("postgres --sync-safekeepers failed");
-        SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);

        if !sync_output.status.success() {
            anyhow::bail!(
@@ -669,7 +657,6 @@ impl ComputeNode {
            })
            .spawn()
            .expect("cannot start postgres process");
-        PG_PID.store(pg.id(), Ordering::SeqCst);

        wait_for_postgres(&mut pg, pgdata_path)?;

@@ -750,31 +737,6 @@ impl ComputeNode {
    pub fn reconfigure(&self) -> Result<()> {
        let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;

-        if let Some(connstr) = &self.pgbouncer_connstr {
-            info!("tuning pgbouncer with connstr: {:?}", connstr);
-
-            let rt = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .expect("failed to create rt");
-
-            // Spawn a thread to do the tuning,
-            // so that we don't block the main thread that starts Postgres.
-            let pgbouncer_settings = spec.pgbouncer_settings.clone();
-            let connstr_clone = connstr.clone();
-            let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
-            let _handle = thread::spawn(move || {
-                let res = rt.block_on(tune_pgbouncer(
-                    pgbouncer_settings,
-                    &connstr_clone,
-                    pgbouncer_ini_path,
-                ));
-                if let Err(err) = res {
-                    error!("error while tuning pgbouncer: {err:?}");
-                }
-            });
-        }
-
        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        let postgresql_conf_path = pgdata_path.join("postgresql.conf");
@@ -829,32 +791,6 @@ impl ComputeNode {
            pspec.timeline_id,
        );

-        // tune pgbouncer
-        if let Some(connstr) = &self.pgbouncer_connstr {
-            info!("tuning pgbouncer with connstr: {:?}", connstr);
-
-            let rt = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .expect("failed to create rt");
-
-            // Spawn a thread to do the tuning,
-            // so that we don't block the main thread that starts Postgres.
-            let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
-            let connstr_clone = connstr.clone();
-            let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
-            let _handle = thread::spawn(move || {
-                let res = rt.block_on(tune_pgbouncer(
-                    pgbouncer_settings,
-                    &connstr_clone,
-                    pgbouncer_ini_path,
-                ));
-                if let Err(err) = res {
-                    error!("error while tuning pgbouncer: {err:?}");
-                }
-            });
-        }
-
        info!(
            "start_compute spec.remote_extensions {:?}",
            pspec.spec.remote_extensions
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -9,11 +9,9 @@ use std::process::Child;
 use std::time::{Duration, Instant};

 use anyhow::{bail, Result};
-use ini::Ini;
 use notify::{RecursiveMode, Watcher};
 use postgres::{Client, Transaction};
-use tokio_postgres::NoTls;
-use tracing::{debug, error, info, instrument};
+use tracing::{debug, instrument};

 use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};

@@ -361,68 +359,3 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {

    Ok(())
 }
-
-/// Update pgbouncer.ini with provided options
-pub fn update_pgbouncer_ini(
-    pgbouncer_config: HashMap<String, String>,
-    pgbouncer_ini_path: &str,
-) -> Result<()> {
-    let mut conf = Ini::load_from_file(pgbouncer_ini_path)?;
-    let section = conf.section_mut(Some("pgbouncer")).unwrap();
-
-    for (option_name, value) in pgbouncer_config.iter() {
-        section.insert(option_name, value);
-    }
-
-    conf.write_to_file(pgbouncer_ini_path)?;
-    Ok(())
-}
-
-/// Tune pgbouncer.
-/// 1. Apply new config using pgbouncer admin console
-/// 2. Add new values to pgbouncer.ini to preserve them after restart
-pub async fn tune_pgbouncer(
-    pgbouncer_settings: Option<HashMap<String, String>>,
-    pgbouncer_connstr: &str,
-    pgbouncer_ini_path: Option<String>,
-) -> Result<()> {
-    if let Some(pgbouncer_config) = pgbouncer_settings {
-        // Apply new config
-        let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
-        let (client, connection) = connect_result.unwrap();
-        tokio::spawn(async move {
-            if let Err(e) = connection.await {
-                eprintln!("connection error: {}", e);
-            }
-        });
-
-        for (option_name, value) in pgbouncer_config.iter() {
-            info!(
-                "Applying pgbouncer setting change: {} = {}",
-                option_name, value
-            );
-            let query = format!("SET {} = {}", option_name, value);
-
-            let result = client.simple_query(&query).await;
-
-            info!("Applying pgbouncer setting change: {}", query);
-            info!("pgbouncer setting change result: {:?}", result);
-
-            if let Err(err) = result {
-                // Don't fail on error, just print it into log
-                error!(
-                    "Failed to apply pgbouncer setting change: {},  {}",
-                    query, err
-                );
-            };
-        }
-
-        // save values to pgbouncer.ini
-        // so that they are preserved after pgbouncer restart
-        if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
-            update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
-        }
-    }
-
-    Ok(())
-}
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -46,8 +46,6 @@ use std::time::Duration;

 use anyhow::{anyhow, bail, Context, Result};
 use compute_api::spec::RemoteExtSpec;
-use nix::sys::signal::kill;
-use nix::sys::signal::Signal;
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};

@@ -441,14 +439,11 @@ impl Endpoint {
        Ok(())
    }

-    fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
+    fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
        // TODO use background_process::stop_process instead
        let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
        let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
        let pid = nix::unistd::Pid::from_raw(pid as i32);
-        if send_sigterm {
-            kill(pid, Signal::SIGTERM).ok();
-        }
        crate::background_process::wait_until_stopped("compute_ctl", pid)?;
        Ok(())
    }
@@ -542,7 +537,6 @@ impl Endpoint {
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
            remote_extensions,
-            pgbouncer_settings: None,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -738,15 +732,10 @@ impl Endpoint {
            &None,
        )?;

-        // Also wait for the compute_ctl process to die. It might have some
-        // cleanup work to do after postgres stops, like syncing safekeepers,
-        // etc.
+        // Also wait for the compute_ctl process to die. It might have some cleanup
+        // work to do after postgres stops, like syncing safekeepers, etc.
        //
-        // If destroying, send it SIGTERM before waiting. Sometimes we do *not*
-        // want this cleanup: tests intentionally do stop when majority of
-        // safekeepers is down, so sync-safekeepers would hang otherwise. This
-        // could be a separate flag though.
-        self.wait_for_compute_ctl_to_exit(destroy)?;
+        self.wait_for_compute_ctl_to_exit()?;
        if destroy {
            println!(
                "Destroying postgres data directory '{}'",
--- a/deny.toml
+++ b/deny.toml
@@ -35,7 +35,6 @@ allow = [
    "Artistic-2.0",
    "BSD-2-Clause",
    "BSD-3-Clause",
-    "CC0-1.0",
    "ISC",
    "MIT",
    "MPL-2.0",
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -73,8 +73,6 @@ pub struct ComputeSpec {

    // information about available remote extensions
    pub remote_extensions: Option<RemoteExtSpec>,
-
-    pub pgbouncer_settings: Option<HashMap<String, String>>,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -243,9 +243,5 @@
        "public_extensions": [
          "postgis"
        ]
-      },
-      "pgbouncer_settings": {
-        "default_pool_size": "42",
-        "pool_mode": "session"
      }
 }
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -124,9 +124,6 @@ impl KeySpaceAccum {
                if range.start == accum.end {
                    accum.end = range.end;
                } else {
-                    // TODO: to efficiently support small sharding stripe sizes, we should avoid starting
-                    // a new range here if the skipped region was all keys that don't belong on this shard.
-                    // (https://github.com/neondatabase/neon/issues/6247)
                    assert!(range.start > accum.end);
                    self.ranges.push(accum.clone());
                    *accum = range;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -557,6 +557,19 @@ pub enum DownloadRemoteLayersTaskState {
    ShutDown,
 }

+pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
+
+/// Information for configuring a single fail point
+#[derive(Debug, Serialize, Deserialize)]
+pub struct FailpointConfig {
+    /// Name of the fail point
+    pub name: String,
+    /// List of actions to take, using the format described in `fail::cfg`
+    ///
+    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
+    pub actions: String,
+}
+
 #[derive(Debug, Serialize, Deserialize)]
 pub struct TimelineGcRequest {
    pub gc_horizon: Option<u64>,
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -81,10 +81,6 @@ impl TenantShardId {
    pub fn is_zero(&self) -> bool {
        self.shard_number == ShardNumber(0)
    }
-
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
-    }
 }

 /// Formatting helper
@@ -422,21 +418,6 @@ impl ShardIdentity {
        }
    }

-    /// Return true if the key should be discarded if found in this shard's
-    /// data store, e.g. during compaction after a split
-    pub fn is_key_disposable(&self, key: &Key) -> bool {
-        if key_is_shard0(key) {
-            // Q: Why can't we dispose of shard0 content if we're not shard 0?
-            // A: because the WAL ingestion logic currently ingests some shard 0
-            //    content on all shards, even though it's only read on shard 0.  If we
-            //    dropped it, then subsequent WAL ingest to these keys would encounter
-            //    an error.
-            false
-        } else {
-            !self.is_key_local(key)
-        }
-    }
-
    pub fn shard_slug(&self) -> String {
        if self.count > ShardCount(0) {
            format!("-{:02x}{:02x}", self.number.0, self.count.0)
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -35,12 +35,6 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
-    /// Query handler indicated that client should reconnect
-    #[error("Server requested reconnect")]
-    Reconnect,
-    /// Query named an entity that was not found
-    #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
    /// Authentication failure
    #[error("Unauthorized: {0}")]
    Unauthorized(std::borrow::Cow<'static, str>),
@@ -60,9 +54,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
-            Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
+            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -431,11 +425,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                info!("Stopped due to shutdown");
                Ok(())
            }
-            Err(QueryError::Reconnect) => {
-                // Dropping out of this loop implicitly disconnects
-                info!("Stopped due to handler reconnect request");
-                Ok(())
-            }
            Err(QueryError::Disconnected(e)) => {
                info!("Disconnected ({e:#})");
                // Disconnection is not an error: we just use it that way internally to drop
@@ -985,9 +974,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
 pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
-        QueryError::Reconnect => "reconnect".to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
-        QueryError::NotFound(_) => "not found".to_string(),
        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
@@ -1009,15 +996,9 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::SimulatedConnectionError => {
            error!("query handler for query '{query}' failed due to a simulated connection error")
        }
-        QueryError::Reconnect => {
-            info!("query handler for '{query}' requested client to reconnect")
-        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
-        QueryError::NotFound(reason) => {
-            info!("query handler for '{query}' entity not found: {reason}")
-        }
        QueryError::Unauthorized(e) => {
            warn!("query handler for '{query}' failed with authentication error: {e}");
        }
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -79,8 +79,6 @@ pub const XLOG_XACT_PREPARE: u8 = 0x10;
 pub const XLOG_XACT_ABORT: u8 = 0x20;
 pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
 pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
-pub const XLOG_XACT_ASSIGNMENT: u8 = 0x50;
-pub const XLOG_XACT_INVALIDATIONS: u8 = 0x60;

 // From srlu.h
 pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
@@ -105,6 +103,12 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
 // pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
 // pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;

+// From pg_control.h and rmgrlist.h
+pub const XLOG_NEXTOID: u8 = 0x30;
+pub const XLOG_SWITCH: u8 = 0x40;
+pub const XLOG_FPI_FOR_HINT: u8 = 0xA0;
+pub const XLOG_FPI: u8 = 0xB0;
+
 // From multixact.h
 pub const FIRST_MULTIXACT_ID: u32 = 1;
 pub const MAX_MULTIXACT_ID: u32 = 0xFFFFFFFF;
@@ -132,20 +136,12 @@ pub const MULTIXACT_MEMBERS_PER_PAGE: u16 =
 pub const XLOG_HEAP_INSERT: u8 = 0x00;
 pub const XLOG_HEAP_DELETE: u8 = 0x10;
 pub const XLOG_HEAP_UPDATE: u8 = 0x20;
-pub const XLOG_HEAP_TRUNCATE: u8 = 0x30;
 pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
-pub const XLOG_HEAP_CONFIRM: u8 = 0x50;
 pub const XLOG_HEAP_LOCK: u8 = 0x60;
-pub const XLOG_HEAP_INPLACE: u8 = 0x70;
 pub const XLOG_HEAP_INIT_PAGE: u8 = 0x80;
-pub const XLOG_HEAP2_REWRITE: u8 = 0x00;
-pub const XLOG_HEAP2_PRUNE: u8 = 0x10;
-pub const XLOG_HEAP2_VACUUM: u8 = 0x20;
-pub const XLOG_HEAP2_FREEZE_PAGE: u8 = 0x30;
 pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
 pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
 pub const XLOG_HEAP2_LOCK_UPDATED: u8 = 0x60;
-pub const XLOG_HEAP2_NEW_CID: u8 = 0x70;
 pub const XLH_LOCK_ALL_FROZEN_CLEARED: u8 = 0x01;
 pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
 pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
@@ -168,21 +164,8 @@ pub const RM_RELMAP_ID: u8 = 7;
 pub const RM_STANDBY_ID: u8 = 8;
 pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
-pub const RM_BTREE_ID: u8 = 11;
-pub const RM_HASH_ID: u8 = 12;
-pub const RM_GIN_ID: u8 = 13;
-pub const RM_GIST_ID: u8 = 14;
-pub const RM_SEQ_ID: u8 = 15;
-pub const RM_SPGIST_ID: u8 = 16;
-pub const RM_BRIN_ID: u8 = 17;
-pub const RM_COMMIT_TS_ID: u8 = 18;
-pub const RM_REPLORIGIN_ID: u8 = 19;
-pub const RM_GENERIC_ID: u8 = 20;
 pub const RM_LOGICALMSG_ID: u8 = 21;

-// from relmapper.h
-pub const XLOG_RELMAP_UPDATE: u8 = 0x0;
-
 // from neon_rmgr.h
 pub const RM_NEON_ID: u8 = 134;

@@ -232,22 +215,8 @@ pub const INVALID_TRANSACTION_ID: u32 = 0;
 pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
 pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;

-/* pg_control.h */
 pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
 pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
-pub const XLOG_NOOP: u8 = 0x20;
-pub const XLOG_NEXTOID: u8 = 0x30;
-pub const XLOG_SWITCH: u8 = 0x40;
-pub const XLOG_BACKUP_END: u8 = 0x50;
-pub const XLOG_PARAMETER_CHANGE: u8 = 0x60;
-pub const XLOG_RESTORE_POINT: u8 = 0x70;
-pub const XLOG_FPW_CHANGE: u8 = 0x80;
-pub const XLOG_END_OF_RECOVERY: u8 = 0x90;
-pub const XLOG_FPI_FOR_HINT: u8 = 0xA0;
-pub const XLOG_FPI: u8 = 0xB0;
-/* 0xC0 is used in Postgres 9.5-11 */
-pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0;
-
 pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
 pub const XLP_LONG_HEADER: u16 = 0x0002;

--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -117,8 +117,6 @@ impl AzureBlobStorage {
    ) -> Result<Download, DownloadError> {
        let mut response = builder.into_stream();

-        let mut etag = None;
-        let mut last_modified = None;
        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
@@ -126,13 +124,6 @@ impl AzureBlobStorage {
        let mut bufs = Vec::new();
        while let Some(part) = response.next().await {
            let part = part.map_err(to_download_error)?;
-            let etag_str: &str = part.blob.properties.etag.as_ref();
-            if etag.is_none() {
-                etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
-            }
-            if last_modified.is_none() {
-                last_modified = Some(part.blob.properties.last_modified.into());
-            }
            if let Some(blob_meta) = part.blob.metadata {
                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
            }
@@ -145,8 +136,6 @@ impl AzureBlobStorage {
        }
        Ok(Download {
            download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
-            etag,
-            last_modified,
            metadata: Some(StorageMetadata(metadata)),
        })
    }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -14,9 +14,7 @@ mod local_fs;
 mod s3_bucket;
 mod simulate_failures;

-use std::{
-    collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
-};
+use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};

 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -209,13 +207,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
 }

-pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
 pub struct Download {
-    pub download_stream: DownloadStream,
-    /// The last time the file was modified (`last-modified` HTTP header)
-    pub last_modified: Option<SystemTime>,
-    /// A way to identify this specific version of the resource (`etag` HTTP header)
-    pub etag: Option<String>,
+    pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
 }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream;
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

-use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath};
+use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};

 use super::{RemoteStorage, StorageMetadata};

@@ -331,8 +331,6 @@ impl RemoteStorage for LocalFs {
                .map_err(DownloadError::Other)?;
            Ok(Download {
                metadata,
-                last_modified: None,
-                etag: None,
                download_stream: Box::pin(source),
            })
        } else {
@@ -374,17 +372,17 @@ impl RemoteStorage for LocalFs {
                .await
                .map_err(DownloadError::Other)?;

-            let download_stream: DownloadStream = match end_exclusive {
-                Some(end_exclusive) => Box::pin(ReaderStream::new(
-                    source.take(end_exclusive - start_inclusive),
-                )),
-                None => Box::pin(ReaderStream::new(source)),
-            };
-            Ok(Download {
-                metadata,
-                last_modified: None,
-                etag: None,
-                download_stream,
+            Ok(match end_exclusive {
+                Some(end_exclusive) => Download {
+                    metadata,
+                    download_stream: Box::pin(ReaderStream::new(
+                        source.take(end_exclusive - start_inclusive),
+                    )),
+                },
+                None => Download {
+                    metadata,
+                    download_stream: Box::pin(ReaderStream::new(source)),
+                },
            })
        } else {
            Err(DownloadError::NotFound)
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -16,7 +16,6 @@ use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider,
    meta::credentials::CredentialsProviderChain,
-    profile::ProfileFileCredentialsProvider,
    provider_config::ProviderConfig,
    retry::{RetryConfigBuilder, RetryMode},
    web_identity_token::WebIdentityTokenCredentialsProvider,
@@ -75,29 +74,20 @@ impl S3Bucket {

        let region = Some(Region::new(aws_config.bucket_region.clone()));

-        let provider_conf = ProviderConfig::without_region().with_region(region.clone());
-
        let credentials_provider = {
            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
            CredentialsProviderChain::first_try(
                "env",
                EnvironmentVariableCredentialsProvider::new(),
            )
-            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
-            .or_else(
-                "profile-sso",
-                ProfileFileCredentialsProvider::builder()
-                    .configure(&provider_conf)
-                    .build(),
-            )
            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
            // needed to access remote extensions bucket
-            .or_else(
-                "token",
+            .or_else("token", {
+                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
                WebIdentityTokenCredentialsProvider::builder()
                    .configure(&provider_conf)
-                    .build(),
-            )
+                    .build()
+            })
            // uses imds v2
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };
@@ -231,8 +221,6 @@ impl S3Bucket {
        match get_object {
            Ok(object_output) => {
                let metadata = object_output.metadata().cloned().map(StorageMetadata);
-                let etag = object_output.e_tag.clone();
-                let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());

                let body = object_output.body;
                let body = ByteStreamAsStream::from(body);
@@ -241,8 +229,6 @@ impl S3Bucket {

                Ok(Download {
                    metadata,
-                    etag,
-                    last_modified,
                    download_stream: Box::pin(body),
                })
            }
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -4,12 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-default = []
-# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
-# which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints"]
-
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true
@@ -22,7 +16,6 @@ chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 hyper = { workspace = true, features = ["full"] }
-fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
 nix.workspace = true
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -31,9 +31,6 @@ pub enum ApiError {
    #[error("Shutting down")]
    ShuttingDown,

-    #[error("Timeout")]
-    Timeout(Cow<'static, str>),
-
    #[error(transparent)]
    InternalServerError(anyhow::Error),
 }
@@ -70,10 +67,6 @@ impl ApiError {
                err.to_string(),
                StatusCode::SERVICE_UNAVAILABLE,
            ),
-            ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
-                err.to_string(),
-                StatusCode::REQUEST_TIMEOUT,
-            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -83,8 +83,6 @@ pub mod timeout;

 pub mod sync;

-pub mod failpoint_support;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -366,49 +366,6 @@ impl MonotonicCounter<Lsn> for RecordLsn {
    }
 }

-/// Implements  [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s.
-///
-/// This is used by the `pagebench` pageserver benchmarking tool.
-pub struct LsnSampler(<u64 as rand::distributions::uniform::SampleUniform>::Sampler);
-
-impl rand::distributions::uniform::SampleUniform for Lsn {
-    type Sampler = LsnSampler;
-}
-
-impl rand::distributions::uniform::UniformSampler for LsnSampler {
-    type X = Lsn;
-
-    fn new<B1, B2>(low: B1, high: B2) -> Self
-    where
-        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
-        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
-    {
-        Self(
-            <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new(
-                low.borrow().0,
-                high.borrow().0,
-            ),
-        )
-    }
-
-    fn new_inclusive<B1, B2>(low: B1, high: B2) -> Self
-    where
-        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
-        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
-    {
-        Self(
-            <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new_inclusive(
-                low.borrow().0,
-                high.borrow().0,
-            ),
-        )
-    }
-
-    fn sample<R: rand::prelude::Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
-        Lsn(self.0.sample(rng))
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use crate::bin_ser::BeSer;
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -8,12 +8,12 @@ use std::ffi::CString;

 use crate::bindings::uint32;
 use crate::bindings::walproposer_api;
-use crate::bindings::NeonWALReadResult;
 use crate::bindings::PGAsyncReadResult;
 use crate::bindings::PGAsyncWriteResult;
 use crate::bindings::Safekeeper;
 use crate::bindings::Size;
 use crate::bindings::StringInfoData;
+use crate::bindings::TimeLineID;
 use crate::bindings::TimestampTz;
 use crate::bindings::WalProposer;
 use crate::bindings::WalProposerConnStatusType;
@@ -178,11 +178,31 @@ extern "C" fn conn_blocking_write(
    }
 }

-extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool {
+extern "C" fn recovery_download(
+    sk: *mut Safekeeper,
+    _timeline: TimeLineID,
+    startpos: XLogRecPtr,
+    endpos: XLogRecPtr,
+) -> bool {
    unsafe {
        let callback_data = (*(*(*sk).wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).recovery_download(&mut (*wp), &mut (*sk))
+        (*api).recovery_download(&mut (*sk), startpos, endpos)
+    }
+}
+
+#[allow(clippy::unnecessary_cast)]
+extern "C" fn wal_read(
+    sk: *mut Safekeeper,
+    buf: *mut ::std::os::raw::c_char,
+    startptr: XLogRecPtr,
+    count: Size,
+) {
+    unsafe {
+        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
+        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).wal_read(&mut (*sk), buf, startptr)
    }
 }

@@ -194,28 +214,11 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
    }
 }

-#[allow(clippy::unnecessary_cast)]
-extern "C" fn wal_read(
-    sk: *mut Safekeeper,
-    buf: *mut ::std::os::raw::c_char,
-    startptr: XLogRecPtr,
-    count: Size,
-    _errmsg: *mut *mut ::std::os::raw::c_char,
-) -> NeonWALReadResult {
+extern "C" fn free_event_set(wp: *mut WalProposer) {
    unsafe {
-        let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
-        let callback_data = (*(*(*sk).wp).config).callback_data;
+        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
-        // TODO: errmsg is not forwarded
-        (*api).wal_read(&mut (*sk), buf, startptr)
-    }
-}
-
-extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).wal_reader_events(&mut (*sk))
+        (*api).free_event_set(&mut (*wp));
    }
 }

@@ -235,14 +238,6 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
    }
 }

-extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).active_state_update_event_set(&mut (*sk));
-    }
-}
-
 extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
    unsafe {
        let callback_data = (*(*(*sk).wp).config).callback_data;
@@ -251,14 +246,6 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
    }
 }

-extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) {
-    unsafe {
-        let callback_data = (*(*(*sk).wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).rm_safekeeper_event_set(&mut (*sk));
-    }
-}
-
 extern "C" fn wait_event_set(
    wp: *mut WalProposer,
    timeout: ::std::os::raw::c_long,
@@ -326,6 +313,14 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog
    }
 }

+extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).confirm_wal_streamed(&mut (*wp), lsn)
+    }
+}
+
 extern "C" fn log_internal(
    wp: *mut WalProposer,
    level: ::std::os::raw::c_int,
@@ -340,6 +335,14 @@ extern "C" fn log_internal(
    }
 }

+extern "C" fn after_election(wp: *mut WalProposer) {
+    unsafe {
+        let callback_data = (*(*wp).config).callback_data;
+        let api = callback_data as *mut Box<dyn ApiImpl>;
+        (*api).after_election(&mut (*wp))
+    }
+}
+
 #[derive(Debug)]
 pub enum Level {
    Debug5,
@@ -398,20 +401,20 @@ pub(crate) fn create_api() -> walproposer_api {
        conn_async_write: Some(conn_async_write),
        conn_blocking_write: Some(conn_blocking_write),
        recovery_download: Some(recovery_download),
-        wal_reader_allocate: Some(wal_reader_allocate),
        wal_read: Some(wal_read),
-        wal_reader_events: Some(wal_reader_events),
+        wal_reader_allocate: Some(wal_reader_allocate),
+        free_event_set: Some(free_event_set),
        init_event_set: Some(init_event_set),
        update_event_set: Some(update_event_set),
-        active_state_update_event_set: Some(active_state_update_event_set),
        add_safekeeper_event_set: Some(add_safekeeper_event_set),
-        rm_safekeeper_event_set: Some(rm_safekeeper_event_set),
        wait_event_set: Some(wait_event_set),
        strong_random: Some(strong_random),
        get_redo_start_lsn: Some(get_redo_start_lsn),
        finish_sync_safekeepers: Some(finish_sync_safekeepers),
        process_safekeeper_feedback: Some(process_safekeeper_feedback),
+        confirm_wal_streamed: Some(confirm_wal_streamed),
        log_internal: Some(log_internal),
+        after_election: Some(after_election),
    }
 }

--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -6,8 +6,8 @@ use utils::id::TenantTimelineId;
 use crate::{
    api_bindings::{create_api, take_vec_u8, Level},
    bindings::{
-        NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate,
-        WalProposerFree, WalProposerStart,
+        Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
+        WalProposerStart,
    },
 };

@@ -86,19 +86,19 @@ pub trait ApiImpl {
        todo!()
    }

-    fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool {
+    fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
        todo!()
    }

-    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult {
+    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
        todo!()
    }

-    fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult {
+    fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
        todo!()
    }

-    fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 {
+    fn free_event_set(&self, _wp: &mut WalProposer) {
        todo!()
    }

@@ -110,18 +110,10 @@ pub trait ApiImpl {
        todo!()
    }

-    fn active_state_update_event_set(&self, _sk: &mut Safekeeper) {
-        todo!()
-    }
-
    fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
        todo!()
    }

-    fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) {
-        todo!()
-    }
-
    fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
        todo!()
    }
@@ -142,6 +134,10 @@ pub trait ApiImpl {
        todo!()
    }

+    fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
+        todo!()
+    }
+
    fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
        todo!()
    }
@@ -244,7 +240,6 @@ impl Drop for Wrapper {

 #[cfg(test)]
 mod tests {
-    use core::panic;
    use std::{
        cell::Cell,
        sync::{atomic::AtomicUsize, mpsc::sync_channel},
@@ -252,7 +247,7 @@ mod tests {

    use utils::id::TenantTimelineId;

-    use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
+    use crate::{api_bindings::Level, walproposer::Wrapper};

    use super::ApiImpl;

@@ -360,17 +355,12 @@ mod tests {
            true
        }

-        fn recovery_download(
-            &self,
-            _wp: &mut crate::bindings::WalProposer,
-            _sk: &mut crate::bindings::Safekeeper,
-        ) -> bool {
-            true
+        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
+            println!("wal_reader_allocate")
        }

-        fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult {
-            println!("wal_reader_allocate");
-            crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
+        fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
+            println!("free_event_set")
        }

        fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
@@ -393,13 +383,6 @@ mod tests {
            self.wait_events.set(WaitEventsData { sk, event_mask });
        }

-        fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) {
-            println!(
-                "rm_safekeeper_event_set, sk={:?}",
-                sk as *mut crate::bindings::Safekeeper
-            );
-        }
-
        fn wait_event_set(
            &self,
            _: &mut crate::bindings::WalProposer,
@@ -425,7 +408,7 @@ mod tests {
        }

        fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
-            println!("wp_log[{}] {}", level, msg);
+            println!("walprop_log[{}] {}", level, msg);
        }

        fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -13,7 +13,6 @@ use bytes::{Buf, Bytes};
 use pageserver::{
    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
-use pageserver_api::shard::TenantShardId;
 use utils::{id::TenantId, lsn::Lsn};

 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -27,9 +26,9 @@ fn redo_scenarios(c: &mut Criterion) {

    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
    let conf = Box::leak(Box::new(conf));
-    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
+    let tenant_id = TenantId::generate();

-    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+    let manager = PostgresRedoManager::new(conf, tenant_id);

    let manager = Arc::new(manager);

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -5,8 +5,6 @@ use utils::{
    id::{TenantId, TimelineId},
 };

-pub mod util;
-
 #[derive(Debug)]
 pub struct Client {
    mgmt_api_endpoint: String,
--- a/pageserver/client/src/mgmt_api/util.rs
+++ b/pageserver/client/src/mgmt_api/util.rs
@@ -1,49 +0,0 @@
-//! Helpers to do common higher-level tasks with the [`Client`].
-
-use std::sync::Arc;
-
-use tokio::task::JoinSet;
-use utils::id::{TenantId, TenantTimelineId};
-
-use super::Client;
-
-/// Retrieve a list of all of the pageserver's timelines.
-///
-/// Fails if there are sharded tenants present on the pageserver.
-pub async fn get_pageserver_tenant_timelines_unsharded(
-    api_client: &Arc<Client>,
-) -> anyhow::Result<Vec<TenantTimelineId>> {
-    let mut timelines: Vec<TenantTimelineId> = Vec::new();
-    let mut tenants: Vec<TenantId> = Vec::new();
-    for ti in api_client.list_tenants().await? {
-        if !ti.id.is_unsharded() {
-            anyhow::bail!(
-                "only unsharded tenants are supported at this time: {}",
-                ti.id
-            );
-        }
-        tenants.push(ti.id.tenant_id)
-    }
-    let mut js = JoinSet::new();
-    for tenant_id in tenants {
-        js.spawn({
-            let mgmt_api_client = Arc::clone(api_client);
-            async move {
-                (
-                    tenant_id,
-                    mgmt_api_client.tenant_details(tenant_id).await.unwrap(),
-                )
-            }
-        });
-    }
-    while let Some(res) = js.join_next().await {
-        let (tenant_id, details) = res.unwrap();
-        for timeline_id in details.timelines {
-            timelines.push(TenantTimelineId {
-                tenant_id,
-                timeline_id,
-            });
-        }
-    }
-    Ok(timelines)
-}
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -115,8 +115,15 @@ impl PagestreamClient {

    pub async fn getpage(
        &mut self,
-        req: PagestreamGetPageRequest,
+        key: RelTagBlockNo,
+        lsn: Lsn,
    ) -> anyhow::Result<PagestreamGetPageResponse> {
+        let req = PagestreamGetPageRequest {
+            latest: false,
+            rel: key.rel_tag,
+            blkno: key.block_no,
+            lsn,
+        };
        let req = PagestreamFeMessage::GetPage(req);
        let req: bytes::Bytes = req.serialize();
        // let mut req = tokio_util::io::ReaderStream::new(&req);
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -1,26 +0,0 @@
-[package]
-name = "pagebench"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-[dependencies]
-anyhow.workspace = true
-clap.workspace = true
-futures.workspace = true
-hdrhistogram.workspace = true
-humantime.workspace = true
-humantime-serde.workspace = true
-rand.workspace = true
-serde.workspace = true
-serde_json.workspace = true
-tracing.workspace = true
-tokio.workspace = true
-
-pageserver = { path = ".." }
-pageserver_client.workspace = true
-pageserver_api.workspace = true
-utils = { path = "../../libs/utils/" }
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -1,272 +0,0 @@
-use anyhow::Context;
-use pageserver_client::page_service::BasebackupRequest;
-
-use utils::id::TenantTimelineId;
-use utils::lsn::Lsn;
-
-use rand::prelude::*;
-use tokio::sync::Barrier;
-use tokio::task::JoinSet;
-use tracing::{debug, info, instrument};
-
-use std::collections::HashMap;
-use std::num::NonZeroUsize;
-use std::ops::Range;
-use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
-use std::sync::{Arc, Mutex};
-use std::time::Instant;
-
-use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
-use crate::util::{request_stats, tokio_thread_local_stats};
-
-/// basebackup@LatestLSN
-#[derive(clap::Parser)]
-pub(crate) struct Args {
-    #[clap(long, default_value = "http://localhost:9898")]
-    mgmt_api_endpoint: String,
-    #[clap(long, default_value = "localhost:64000")]
-    page_service_host_port: String,
-    #[clap(long)]
-    pageserver_jwt: Option<String>,
-    #[clap(long, default_value = "1")]
-    num_clients: NonZeroUsize,
-    #[clap(long, default_value = "1.0")]
-    gzip_probability: f64,
-    #[clap(long)]
-    runtime: Option<humantime::Duration>,
-    #[clap(long)]
-    limit_to_first_n_targets: Option<usize>,
-    targets: Option<Vec<TenantTimelineId>>,
-}
-
-#[derive(Debug, Default)]
-struct LiveStats {
-    completed_requests: AtomicU64,
-}
-
-impl LiveStats {
-    fn inc(&self) {
-        self.completed_requests.fetch_add(1, Ordering::Relaxed);
-    }
-}
-
-struct Target {
-    timeline: TenantTimelineId,
-    lsn_range: Option<Range<Lsn>>,
-}
-
-#[derive(serde::Serialize)]
-struct Output {
-    total: request_stats::Output,
-}
-
-tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
-
-pub(crate) fn main(args: Args) -> anyhow::Result<()> {
-    tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
-        main_impl(args, thread_local_stats)
-    })
-}
-
-async fn main_impl(
-    args: Args,
-    all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
-) -> anyhow::Result<()> {
-    let args: &'static Args = Box::leak(Box::new(args));
-
-    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
-        args.mgmt_api_endpoint.clone(),
-        args.pageserver_jwt.as_deref(),
-    ));
-
-    // discover targets
-    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
-        &mgmt_api_client,
-        crate::util::cli::targets::Spec {
-            limit_to_first_n_targets: args.limit_to_first_n_targets,
-            targets: args.targets.clone(),
-        },
-    )
-    .await?;
-    let mut js = JoinSet::new();
-    for timeline in &timelines {
-        js.spawn({
-            let timeline = *timeline;
-            // FIXME: this triggers initial logical size calculation
-            // https://github.com/neondatabase/neon/issues/6168
-            let info = mgmt_api_client
-                .timeline_info(timeline.tenant_id, timeline.timeline_id)
-                .await
-                .unwrap();
-            async move {
-                anyhow::Ok(Target {
-                    timeline,
-                    // TODO: support lsn_range != latest LSN
-                    lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
-                })
-            }
-        });
-    }
-    let mut all_targets: Vec<Target> = Vec::new();
-    while let Some(res) = js.join_next().await {
-        all_targets.push(res.unwrap().unwrap());
-    }
-
-    let live_stats = Arc::new(LiveStats::default());
-
-    let num_client_tasks = timelines.len();
-    let num_live_stats_dump = 1;
-    let num_work_sender_tasks = 1;
-
-    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
-    ));
-    let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
-
-    tokio::spawn({
-        let stats = Arc::clone(&live_stats);
-        let start_work_barrier = Arc::clone(&start_work_barrier);
-        async move {
-            start_work_barrier.wait().await;
-            loop {
-                let start = std::time::Instant::now();
-                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
-                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
-                let elapsed = start.elapsed();
-                info!(
-                    "RPS: {:.0}",
-                    completed_requests as f64 / elapsed.as_secs_f64()
-                );
-            }
-        }
-    });
-
-    let mut work_senders = HashMap::new();
-    let mut tasks = Vec::new();
-    for tl in &timelines {
-        let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
-        work_senders.insert(tl, sender);
-        tasks.push(tokio::spawn(client(
-            args,
-            *tl,
-            Arc::clone(&start_work_barrier),
-            receiver,
-            Arc::clone(&all_work_done_barrier),
-            Arc::clone(&live_stats),
-        )));
-    }
-
-    let work_sender = async move {
-        start_work_barrier.wait().await;
-        loop {
-            let (timeline, work) = {
-                let mut rng = rand::thread_rng();
-                let target = all_targets.choose(&mut rng).unwrap();
-                let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
-                (
-                    target.timeline,
-                    Work {
-                        lsn,
-                        gzip: rng.gen_bool(args.gzip_probability),
-                    },
-                )
-            };
-            let sender = work_senders.get(&timeline).unwrap();
-            // TODO: what if this blocks?
-            sender.send(work).await.ok().unwrap();
-        }
-    };
-
-    if let Some(runtime) = args.runtime {
-        match tokio::time::timeout(runtime.into(), work_sender).await {
-            Ok(()) => unreachable!("work sender never terminates"),
-            Err(_timeout) => {
-                // this implicitly drops the work_senders, making all the clients exit
-            }
-        }
-    } else {
-        work_sender.await;
-        unreachable!("work sender never terminates");
-    }
-
-    for t in tasks {
-        t.await.unwrap();
-    }
-
-    let output = Output {
-        total: {
-            let mut agg_stats = request_stats::Stats::new();
-            for stats in all_thread_local_stats.lock().unwrap().iter() {
-                let stats = stats.lock().unwrap();
-                agg_stats.add(&stats);
-            }
-            agg_stats.output()
-        },
-    };
-
-    let output = serde_json::to_string_pretty(&output).unwrap();
-    println!("{output}");
-
-    anyhow::Ok(())
-}
-
-#[derive(Copy, Clone)]
-struct Work {
-    lsn: Option<Lsn>,
-    gzip: bool,
-}
-
-#[instrument(skip_all)]
-async fn client(
-    args: &'static Args,
-    timeline: TenantTimelineId,
-    start_work_barrier: Arc<Barrier>,
-    mut work: tokio::sync::mpsc::Receiver<Work>,
-    all_work_done_barrier: Arc<Barrier>,
-    live_stats: Arc<LiveStats>,
-) {
-    start_work_barrier.wait().await;
-
-    let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
-        &args.page_service_host_port,
-        args.pageserver_jwt.as_deref(),
-    ))
-    .await
-    .unwrap();
-
-    while let Some(Work { lsn, gzip }) = work.recv().await {
-        let start = Instant::now();
-        let copy_out_stream = client
-            .basebackup(&BasebackupRequest {
-                tenant_id: timeline.tenant_id,
-                timeline_id: timeline.timeline_id,
-                lsn,
-                gzip,
-            })
-            .await
-            .with_context(|| format!("start basebackup for {timeline}"))
-            .unwrap();
-
-        use futures::StreamExt;
-        let size = Arc::new(AtomicUsize::new(0));
-        copy_out_stream
-            .for_each({
-                |r| {
-                    let size = Arc::clone(&size);
-                    async move {
-                        let size = Arc::clone(&size);
-                        size.fetch_add(r.unwrap().len(), Ordering::Relaxed);
-                    }
-                }
-            })
-            .await;
-        debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
-        let elapsed = start.elapsed();
-        live_stats.inc();
-        STATS.with(|stats| {
-            stats.borrow().lock().unwrap().observe(elapsed).unwrap();
-        });
-    }
-
-    all_work_done_barrier.wait().await;
-}
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,351 +0,0 @@
-use anyhow::Context;
-use futures::future::join_all;
-use pageserver::pgdatadir_mapping::key_to_rel_block;
-use pageserver::repository;
-use pageserver_api::key::is_rel_block_key;
-use pageserver_api::models::PagestreamGetPageRequest;
-
-use utils::id::TenantTimelineId;
-use utils::lsn::Lsn;
-
-use rand::prelude::*;
-use tokio::sync::Barrier;
-use tokio::task::JoinSet;
-use tracing::{info, instrument};
-
-use std::collections::HashMap;
-use std::future::Future;
-use std::num::NonZeroUsize;
-use std::pin::Pin;
-use std::sync::atomic::{AtomicU64, Ordering};
-use std::sync::{Arc, Mutex};
-use std::time::{Duration, Instant};
-
-use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
-use crate::util::{request_stats, tokio_thread_local_stats};
-
-/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
-#[derive(clap::Parser)]
-pub(crate) struct Args {
-    #[clap(long, default_value = "http://localhost:9898")]
-    mgmt_api_endpoint: String,
-    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
-    page_service_connstring: String,
-    #[clap(long)]
-    pageserver_jwt: Option<String>,
-    #[clap(long, default_value = "1")]
-    num_clients: NonZeroUsize,
-    #[clap(long)]
-    runtime: Option<humantime::Duration>,
-    #[clap(long)]
-    per_target_rate_limit: Option<usize>,
-    /// Probability for sending `latest=true` in the request (uniform distribution).
-    #[clap(long, default_value = "1")]
-    req_latest_probability: f64,
-    #[clap(long)]
-    limit_to_first_n_targets: Option<usize>,
-    targets: Option<Vec<TenantTimelineId>>,
-}
-
-#[derive(Debug, Default)]
-struct LiveStats {
-    completed_requests: AtomicU64,
-}
-
-impl LiveStats {
-    fn inc(&self) {
-        self.completed_requests.fetch_add(1, Ordering::Relaxed);
-    }
-}
-
-#[derive(Clone)]
-struct KeyRange {
-    timeline: TenantTimelineId,
-    timeline_lsn: Lsn,
-    start: i128,
-    end: i128,
-}
-
-impl KeyRange {
-    fn len(&self) -> i128 {
-        self.end - self.start
-    }
-}
-
-#[derive(serde::Serialize)]
-struct Output {
-    total: request_stats::Output,
-}
-
-tokio_thread_local_stats::declare!(STATS: request_stats::Stats);
-
-pub(crate) fn main(args: Args) -> anyhow::Result<()> {
-    tokio_thread_local_stats::main!(STATS, move |thread_local_stats| {
-        main_impl(args, thread_local_stats)
-    })
-}
-
-async fn main_impl(
-    args: Args,
-    all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
-) -> anyhow::Result<()> {
-    let args: &'static Args = Box::leak(Box::new(args));
-
-    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
-        args.mgmt_api_endpoint.clone(),
-        args.pageserver_jwt.as_deref(),
-    ));
-
-    // discover targets
-    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
-        &mgmt_api_client,
-        crate::util::cli::targets::Spec {
-            limit_to_first_n_targets: args.limit_to_first_n_targets,
-            targets: args.targets.clone(),
-        },
-    )
-    .await?;
-
-    let mut js = JoinSet::new();
-    for timeline in &timelines {
-        js.spawn({
-            let mgmt_api_client = Arc::clone(&mgmt_api_client);
-            let timeline = *timeline;
-            async move {
-                let partitioning = mgmt_api_client
-                    .keyspace(timeline.tenant_id, timeline.timeline_id)
-                    .await?;
-                let lsn = partitioning.at_lsn;
-
-                let ranges = partitioning
-                    .keys
-                    .ranges
-                    .iter()
-                    .filter_map(|r| {
-                        let start = r.start;
-                        let end = r.end;
-                        // filter out non-relblock keys
-                        match (is_rel_block_key(&start), is_rel_block_key(&end)) {
-                            (true, true) => Some(KeyRange {
-                                timeline,
-                                timeline_lsn: lsn,
-                                start: start.to_i128(),
-                                end: end.to_i128(),
-                            }),
-                            (true, false) | (false, true) => {
-                                unimplemented!("split up range")
-                            }
-                            (false, false) => None,
-                        }
-                    })
-                    .collect::<Vec<_>>();
-
-                anyhow::Ok(ranges)
-            }
-        });
-    }
-    let mut all_ranges: Vec<KeyRange> = Vec::new();
-    while let Some(res) = js.join_next().await {
-        all_ranges.extend(res.unwrap().unwrap());
-    }
-
-    let live_stats = Arc::new(LiveStats::default());
-
-    let num_client_tasks = timelines.len();
-    let num_live_stats_dump = 1;
-    let num_work_sender_tasks = 1;
-
-    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
-    ));
-    let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
-
-    tokio::spawn({
-        let stats = Arc::clone(&live_stats);
-        let start_work_barrier = Arc::clone(&start_work_barrier);
-        async move {
-            start_work_barrier.wait().await;
-            loop {
-                let start = std::time::Instant::now();
-                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
-                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
-                let elapsed = start.elapsed();
-                info!(
-                    "RPS: {:.0}",
-                    completed_requests as f64 / elapsed.as_secs_f64()
-                );
-            }
-        }
-    });
-
-    let mut work_senders = HashMap::new();
-    let mut tasks = Vec::new();
-    for tl in &timelines {
-        let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
-        work_senders.insert(tl, sender);
-        tasks.push(tokio::spawn(client(
-            args,
-            *tl,
-            Arc::clone(&start_work_barrier),
-            receiver,
-            Arc::clone(&all_work_done_barrier),
-            Arc::clone(&live_stats),
-        )));
-    }
-
-    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = match args.per_target_rate_limit {
-        None => Box::pin(async move {
-            let weights = rand::distributions::weighted::WeightedIndex::new(
-                all_ranges.iter().map(|v| v.len()),
-            )
-            .unwrap();
-
-            start_work_barrier.wait().await;
-
-            loop {
-                let (timeline, req) = {
-                    let mut rng = rand::thread_rng();
-                    let r = &all_ranges[weights.sample(&mut rng)];
-                    let key: i128 = rng.gen_range(r.start..r.end);
-                    let key = repository::Key::from_i128(key);
-                    let (rel_tag, block_no) =
-                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
-                    (
-                        r.timeline,
-                        PagestreamGetPageRequest {
-                            latest: rng.gen_bool(args.req_latest_probability),
-                            lsn: r.timeline_lsn,
-                            rel: rel_tag,
-                            blkno: block_no,
-                        },
-                    )
-                };
-                let sender = work_senders.get(&timeline).unwrap();
-                // TODO: what if this blocks?
-                sender.send(req).await.ok().unwrap();
-            }
-        }),
-        Some(rps_limit) => Box::pin(async move {
-            let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
-
-            let make_timeline_task: &dyn Fn(
-                TenantTimelineId,
-            )
-                -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
-                let sender = work_senders.get(&timeline).unwrap();
-                let ranges: Vec<KeyRange> = all_ranges
-                    .iter()
-                    .filter(|r| r.timeline == timeline)
-                    .cloned()
-                    .collect();
-                let weights = rand::distributions::weighted::WeightedIndex::new(
-                    ranges.iter().map(|v| v.len()),
-                )
-                .unwrap();
-
-                Box::pin(async move {
-                    let mut ticker = tokio::time::interval(period);
-                    ticker.set_missed_tick_behavior(
-                        /* TODO review this choice */
-                        tokio::time::MissedTickBehavior::Burst,
-                    );
-                    loop {
-                        ticker.tick().await;
-                        let req = {
-                            let mut rng = rand::thread_rng();
-                            let r = &ranges[weights.sample(&mut rng)];
-                            let key: i128 = rng.gen_range(r.start..r.end);
-                            let key = repository::Key::from_i128(key);
-                            let (rel_tag, block_no) = key_to_rel_block(key)
-                                .expect("we filter non-rel-block keys out above");
-                            PagestreamGetPageRequest {
-                                latest: rng.gen_bool(args.req_latest_probability),
-                                lsn: r.timeline_lsn,
-                                rel: rel_tag,
-                                blkno: block_no,
-                            }
-                        };
-                        sender.send(req).await.ok().unwrap();
-                    }
-                })
-            };
-
-            let tasks: Vec<_> = work_senders
-                .keys()
-                .map(|tl| make_timeline_task(**tl))
-                .collect();
-
-            start_work_barrier.wait().await;
-
-            join_all(tasks).await;
-        }),
-    };
-
-    if let Some(runtime) = args.runtime {
-        match tokio::time::timeout(runtime.into(), work_sender).await {
-            Ok(()) => unreachable!("work sender never terminates"),
-            Err(_timeout) => {
-                // this implicitly drops the work_senders, making all the clients exit
-            }
-        }
-    } else {
-        work_sender.await;
-        unreachable!("work sender never terminates");
-    }
-
-    for t in tasks {
-        t.await.unwrap();
-    }
-
-    let output = Output {
-        total: {
-            let mut agg_stats = request_stats::Stats::new();
-            for stats in all_thread_local_stats.lock().unwrap().iter() {
-                let stats = stats.lock().unwrap();
-                agg_stats.add(&stats);
-            }
-            agg_stats.output()
-        },
-    };
-
-    let output = serde_json::to_string_pretty(&output).unwrap();
-    println!("{output}");
-
-    anyhow::Ok(())
-}
-
-#[instrument(skip_all)]
-async fn client(
-    args: &'static Args,
-    timeline: TenantTimelineId,
-    start_work_barrier: Arc<Barrier>,
-    mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
-    all_work_done_barrier: Arc<Barrier>,
-    live_stats: Arc<LiveStats>,
-) {
-    start_work_barrier.wait().await;
-
-    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
-        .await
-        .unwrap();
-    let mut client = client
-        .pagestream(timeline.tenant_id, timeline.timeline_id)
-        .await
-        .unwrap();
-
-    while let Some(req) = work.recv().await {
-        let start = Instant::now();
-        client
-            .getpage(req)
-            .await
-            .with_context(|| format!("getpage for {timeline}"))
-            .unwrap();
-        let elapsed = start.elapsed();
-        live_stats.inc();
-        STATS.with(|stats| {
-            stats.borrow().lock().unwrap().observe(elapsed).unwrap();
-        });
-    }
-
-    all_work_done_barrier.wait().await;
-}
--- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
+++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
@@ -1,85 +0,0 @@
-use std::sync::Arc;
-
-use humantime::Duration;
-use tokio::task::JoinSet;
-use utils::id::TenantTimelineId;
-
-#[derive(clap::Parser)]
-pub(crate) struct Args {
-    #[clap(long, default_value = "http://localhost:9898")]
-    mgmt_api_endpoint: String,
-    #[clap(long, default_value = "localhost:64000")]
-    page_service_host_port: String,
-    #[clap(long)]
-    pageserver_jwt: Option<String>,
-    #[clap(
-        long,
-        help = "if specified, poll mgmt api to check whether init logical size calculation has completed"
-    )]
-    poll_for_completion: Option<Duration>,
-    #[clap(long)]
-    limit_to_first_n_targets: Option<usize>,
-    targets: Option<Vec<TenantTimelineId>>,
-}
-
-pub(crate) fn main(args: Args) -> anyhow::Result<()> {
-    let rt = tokio::runtime::Builder::new_multi_thread()
-        .enable_all()
-        .build()
-        .unwrap();
-
-    let main_task = rt.spawn(main_impl(args));
-    rt.block_on(main_task).unwrap()
-}
-
-async fn main_impl(args: Args) -> anyhow::Result<()> {
-    let args: &'static Args = Box::leak(Box::new(args));
-
-    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
-        args.mgmt_api_endpoint.clone(),
-        args.pageserver_jwt.as_deref(),
-    ));
-
-    // discover targets
-    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
-        &mgmt_api_client,
-        crate::util::cli::targets::Spec {
-            limit_to_first_n_targets: args.limit_to_first_n_targets,
-            targets: args.targets.clone(),
-        },
-    )
-    .await?;
-
-    // kick it off
-
-    let mut js = JoinSet::new();
-    for tl in timelines {
-        let mgmt_api_client = Arc::clone(&mgmt_api_client);
-        js.spawn(async move {
-            // TODO: API to explicitly trigger initial logical size computation.
-            // Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation.
-            // => https://github.com/neondatabase/neon/issues/6168
-            let info = mgmt_api_client
-                .timeline_info(tl.tenant_id, tl.timeline_id)
-                .await
-                .unwrap();
-
-            if let Some(period) = args.poll_for_completion {
-                let mut ticker = tokio::time::interval(period.into());
-                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
-                let mut info = info;
-                while !info.current_logical_size_is_accurate {
-                    ticker.tick().await;
-                    info = mgmt_api_client
-                        .timeline_info(tl.tenant_id, tl.timeline_id)
-                        .await
-                        .unwrap();
-                }
-            }
-        });
-    }
-    while let Some(res) = js.join_next().await {
-        let _: () = res.unwrap();
-    }
-    Ok(())
-}
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -1,48 +0,0 @@
-use clap::Parser;
-use utils::logging;
-
-/// Re-usable pieces of code that aren't CLI-specific.
-mod util {
-    pub(crate) mod connstring;
-    pub(crate) mod request_stats;
-    #[macro_use]
-    pub(crate) mod tokio_thread_local_stats;
-    /// Re-usable pieces of CLI-specific code.
-    pub(crate) mod cli {
-        pub(crate) mod targets;
-    }
-}
-
-/// The pagebench CLI sub-commands, dispatched in [`main`] below.
-mod cmd {
-    pub(super) mod basebackup;
-    pub(super) mod getpage_latest_lsn;
-    pub(super) mod trigger_initial_size_calculation;
-}
-
-/// Component-level performance test for pageserver.
-#[derive(clap::Parser)]
-enum Args {
-    Basebackup(cmd::basebackup::Args),
-    GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
-    TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
-}
-
-fn main() {
-    logging::init(
-        logging::LogFormat::Plain,
-        logging::TracingErrorLayerEnablement::Disabled,
-        logging::Output::Stderr,
-    )
-    .unwrap();
-
-    let args = Args::parse();
-    match args {
-        Args::Basebackup(args) => cmd::basebackup::main(args),
-        Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
-        Args::TriggerInitialSizeCalculation(args) => {
-            cmd::trigger_initial_size_calculation::main(args)
-        }
-    }
-    .unwrap()
-}
--- a/pageserver/pagebench/src/util/cli/targets.rs
+++ b/pageserver/pagebench/src/util/cli/targets.rs
@@ -1,34 +0,0 @@
-use std::sync::Arc;
-
-use pageserver_client::mgmt_api;
-use tracing::info;
-use utils::id::TenantTimelineId;
-
-pub(crate) struct Spec {
-    pub(crate) limit_to_first_n_targets: Option<usize>,
-    pub(crate) targets: Option<Vec<TenantTimelineId>>,
-}
-
-pub(crate) async fn discover(
-    api_client: &Arc<mgmt_api::Client>,
-    spec: Spec,
-) -> anyhow::Result<Vec<TenantTimelineId>> {
-    let mut timelines = if let Some(targets) = spec.targets {
-        targets
-    } else {
-        mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await?
-    };
-
-    if let Some(limit) = spec.limit_to_first_n_targets {
-        timelines.sort(); // for determinism
-        timelines.truncate(limit);
-        if timelines.len() < limit {
-            anyhow::bail!("pageserver has less than limit_to_first_n_targets={limit} tenants");
-        }
-    }
-
-    info!("timelines:\n{:?}", timelines);
-    info!("number of timelines:\n{:?}", timelines.len());
-
-    Ok(timelines)
-}
--- a/pageserver/pagebench/src/util/connstring.rs
+++ b/pageserver/pagebench/src/util/connstring.rs
@@ -1,8 +0,0 @@
-pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
-    let colon_and_jwt = if let Some(jwt) = jwt {
-        format!(":{jwt}") // TODO: urlescape
-    } else {
-        String::new()
-    };
-    format!("postgres://postgres{colon_and_jwt}@{host_port}")
-}
--- a/pageserver/pagebench/src/util/request_stats.rs
+++ b/pageserver/pagebench/src/util/request_stats.rs
@@ -1,88 +0,0 @@
-use std::time::Duration;
-
-use anyhow::Context;
-
-pub(crate) struct Stats {
-    latency_histo: hdrhistogram::Histogram<u64>,
-}
-
-impl Stats {
-    pub(crate) fn new() -> Self {
-        Self {
-            // Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
-            // which would skew the benchmark results.
-            latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(),
-        }
-    }
-    pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
-        let micros: u64 = latency
-            .as_micros()
-            .try_into()
-            .context("latency greater than u64")?;
-        self.latency_histo
-            .record(micros)
-            .context("add to histogram")?;
-        Ok(())
-    }
-    pub(crate) fn output(&self) -> Output {
-        let latency_percentiles = std::array::from_fn(|idx| {
-            let micros = self
-                .latency_histo
-                .value_at_percentile(LATENCY_PERCENTILES[idx]);
-            Duration::from_micros(micros)
-        });
-        Output {
-            request_count: self.latency_histo.len(),
-            latency_mean: Duration::from_micros(self.latency_histo.mean() as u64),
-            latency_percentiles: LatencyPercentiles {
-                latency_percentiles,
-            },
-        }
-    }
-    pub(crate) fn add(&mut self, other: &Self) {
-        let Self {
-            ref mut latency_histo,
-        } = self;
-        latency_histo.add(&other.latency_histo).unwrap();
-    }
-}
-
-impl Default for Stats {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];
-
-struct LatencyPercentiles {
-    latency_percentiles: [Duration; 4],
-}
-
-impl serde::Serialize for LatencyPercentiles {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        use serde::ser::SerializeMap;
-        let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
-        for p in LATENCY_PERCENTILES {
-            ser.serialize_entry(
-                &format!("p{p}"),
-                &format!(
-                    "{}",
-                    &humantime::format_duration(self.latency_percentiles[0])
-                ),
-            )?;
-        }
-        ser.end()
-    }
-}
-
-#[derive(serde::Serialize)]
-pub(crate) struct Output {
-    request_count: u64,
-    #[serde(with = "humantime_serde")]
-    latency_mean: Duration,
-    latency_percentiles: LatencyPercentiles,
-}
--- a/pageserver/pagebench/src/util/tokio_thread_local_stats.rs
+++ b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs
@@ -1,45 +0,0 @@
-pub(crate) type ThreadLocalStats<T> = Arc<Mutex<T>>;
-pub(crate) type AllThreadLocalStats<T> = Arc<Mutex<Vec<ThreadLocalStats<T>>>>;
-
-macro_rules! declare {
-    ($THREAD_LOCAL_NAME:ident: $T:ty) => {
-        thread_local! {
-            pub static $THREAD_LOCAL_NAME: std::cell::RefCell<crate::util::tokio_thread_local_stats::ThreadLocalStats<$T>> = std::cell::RefCell::new(
-                std::sync::Arc::new(std::sync::Mutex::new(Default::default()))
-            );
-        }
-    };
-}
-
-use std::sync::{Arc, Mutex};
-
-pub(crate) use declare;
-
-macro_rules! main {
-    ($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{
-        let main_impl = $main_impl;
-        let all = Arc::new(Mutex::new(Vec::new()));
-
-        let rt = tokio::runtime::Builder::new_multi_thread()
-            .on_thread_start({
-                let all = Arc::clone(&all);
-                move || {
-                    // pre-initialize the thread local stats by accessesing them
-                    // (some stats like requests_stats::Stats are quite costly to initialize,
-                    //  we don't want to pay that cost during the measurement period)
-                    $THREAD_LOCAL_NAME.with(|stats| {
-                        let stats: Arc<_> = Arc::clone(&*stats.borrow());
-                        all.lock().unwrap().push(stats);
-                    });
-                }
-            })
-            .enable_all()
-            .build()
-            .unwrap();
-
-        let main_task = rt.spawn(main_impl(all));
-        rt.block_on(main_task).unwrap()
-    }};
-}
-
-pub(crate) use main;
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -23,7 +23,6 @@ use tracing::*;
 use tokio_tar::{Builder, EntryType, Header};

 use crate::context::RequestContext;
-use crate::pgdatadir_mapping::Version;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};

@@ -175,7 +174,7 @@ where
        ] {
            for segno in self
                .timeline
-                .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
+                .list_slru_segments(kind, self.lsn, self.ctx)
                .await?
            {
                self.add_slru_segment(kind, segno).await?;
@@ -193,7 +192,7 @@ where
            // Otherwise only include init forks of unlogged relations.
            let rels = self
                .timeline
-                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .list_rels(spcnode, dbnode, self.lsn, self.ctx)
                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
@@ -268,7 +267,7 @@ where
    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
        let nblocks = self
            .timeline
-            .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
+            .get_rel_size(src, self.lsn, false, self.ctx)
            .await?;

        // If the relation is empty, create an empty file
@@ -289,7 +288,7 @@ where
            for blknum in startblk..endblk {
                let img = self
                    .timeline
-                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
+                    .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
                    .await?;
                segment_data.extend_from_slice(&img[..]);
            }
@@ -311,7 +310,7 @@ where
    async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
        let nblocks = self
            .timeline
-            .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
+            .get_slru_segment_size(slru, segno, self.lsn, self.ctx)
            .await?;

        let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
@@ -353,7 +352,7 @@ where
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
-                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
                .await?;

            ensure!(
@@ -400,7 +399,7 @@ where
            if !has_relmap_file
                && self
                    .timeline
-                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                    .list_rels(spcnode, dbnode, self.lsn, self.ctx)
                    .await?
                    .is_empty()
            {
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -31,7 +31,6 @@ use pageserver::{
    virtual_file,
 };
 use postgres_backend::AuthType;
-use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
@@ -127,7 +126,7 @@ fn main() -> anyhow::Result<()> {
    }

    // Initialize up failpoints support
-    let scenario = failpoint_support::init();
+    let scenario = pageserver::failpoint_support::init();

    // Basic initialization of things that don't change after startup
    virtual_file::init(conf.max_file_descriptors);
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -76,8 +76,6 @@ pub mod defaults {

    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;

-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
-
    ///
    /// Default built-in configuration file.
    ///
@@ -90,7 +88,6 @@ pub mod defaults {
 #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
 #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'

-#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
 #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}

 # initial superuser role name to use when creating a new tenant
@@ -111,8 +108,6 @@ pub mod defaults {

 #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'

-#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
-
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -238,9 +233,6 @@ pub struct PageServerConf {
    /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
    /// heatmap uploads vs. other remote storage operations.
    pub heatmap_upload_concurrency: usize,
-
-    /// Maximum number of WAL records to be ingested and committed at the same time
-    pub ingest_batch_size: u64,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -322,8 +314,6 @@ struct PageServerConfigBuilder {
    control_plane_emergency_mode: BuilderValue<bool>,

    heatmap_upload_concurrency: BuilderValue<usize>,
-
-    ingest_batch_size: BuilderValue<u64>,
 }

 impl Default for PageServerConfigBuilder {
@@ -396,8 +386,6 @@ impl Default for PageServerConfigBuilder {
            control_plane_emergency_mode: Set(false),

            heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
-
-            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
        }
    }
 }
@@ -546,10 +534,6 @@ impl PageServerConfigBuilder {
        self.heatmap_upload_concurrency = BuilderValue::Set(value)
    }

-    pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
-        self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
-    }
-
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_warmup = self
            .concurrent_tenant_warmup
@@ -648,12 +632,10 @@ impl PageServerConfigBuilder {
            control_plane_emergency_mode: self
                .control_plane_emergency_mode
                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
+
            heatmap_upload_concurrency: self
                .heatmap_upload_concurrency
                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
-            ingest_batch_size: self
-                .ingest_batch_size
-                .ok_or(anyhow!("missing ingest_batch_size"))?,
        })
    }
 }
@@ -896,7 +878,6 @@ impl PageServerConf {
                "heatmap_upload_concurrency" => {
                    builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
                },
-                "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -968,7 +949,6 @@ impl PageServerConf {
            control_plane_api_token: None,
            control_plane_emergency_mode: false,
            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
        }
    }
 }
@@ -1197,8 +1177,7 @@ background_task_maximum_delay = '334 s'
                control_plane_api: None,
                control_plane_api_token: None,
                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1259,8 +1238,7 @@ background_task_maximum_delay = '334 s'
                control_plane_api: None,
                control_plane_api_token: None,
                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
-                ingest_batch_size: 100,
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1490,7 +1468,6 @@ threshold = "20m"
                period: Duration::from_secs(10),
                #[cfg(feature = "testing")]
                mock_statvfs: None,
-                eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
            })
        );
        match &conf.default_tenant_conf.eviction_policy {
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -74,45 +74,6 @@ pub struct DiskUsageEvictionTaskConfig {
    pub period: Duration,
    #[cfg(feature = "testing")]
    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
-    /// Select sorting for evicted layers
-    #[serde(default)]
-    pub eviction_order: EvictionOrder,
-}
-
-/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
-/// partitioning.
-#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "type", content = "args")]
-pub enum EvictionOrder {
-    /// Order the layers to be evicted by how recently they have been accessed in absolute
-    /// time.
-    ///
-    /// This strategy is unfair when some tenants grow faster than others towards the slower
-    /// growing.
-    #[default]
-    AbsoluteAccessed,
-
-    /// Order the layers to be evicted by how recently they have been accessed relatively within
-    /// the set of resident layers of a tenant.
-    ///
-    /// This strategy will evict layers more fairly but is untested.
-    RelativeAccessed {
-        #[serde(default)]
-        highest_layer_count_loses_first: bool,
-    },
-}
-
-impl EvictionOrder {
-    /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
-    /// counts should be the first ones to have their layers evicted.
-    fn highest_layer_count_loses_first(&self) -> bool {
-        match self {
-            EvictionOrder::AbsoluteAccessed => false,
-            EvictionOrder::RelativeAccessed {
-                highest_layer_count_loses_first,
-            } => *highest_layer_count_loses_first,
-        }
-    }
 }

 #[derive(Default)]
@@ -231,14 +192,7 @@ async fn disk_usage_eviction_task_iteration(
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(
-        state,
-        storage,
-        usage_pre,
-        task_config.eviction_order,
-        cancel,
-    )
-    .await;
+    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -324,7 +278,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
    _storage: &GenericRemoteStorage,
    usage_pre: U,
-    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
    // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
@@ -344,7 +297,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        "running disk usage based eviction due to pressure"
    );

-    let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
+    let candidates = match collect_eviction_candidates(cancel).await? {
        EvictionCandidates::Cancelled => {
            return Ok(IterationOutcome::Cancelled);
        }
@@ -354,16 +307,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // Debug-log the list of candidates
    let now = SystemTime::now();
    for (i, (partition, candidate)) in candidates.iter().enumerate() {
-        let nth = i + 1;
        let desc = candidate.layer.layer_desc();
-        let total_candidates = candidates.len();
-        let size = desc.file_size;
-        let rel = candidate.relative_last_activity;
        debug!(
-            "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
+            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
+            i + 1,
+            candidates.len(),
+            desc.file_size,
            now.duration_since(candidate.last_activity_ts)
                .unwrap()
                .as_micros(),
+            partition,
            desc.tenant_shard_id,
            desc.timeline_id,
            candidate.layer,
@@ -506,7 +459,6 @@ struct EvictionCandidate {
    timeline: Arc<Timeline>,
    layer: Layer,
    last_activity_ts: SystemTime,
-    relative_last_activity: finite_f32::FiniteF32,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
@@ -526,24 +478,24 @@ enum EvictionCandidates {
 /// order. A caller that evicts in that order, until pressure is relieved, implements
 /// the eviction policy outlined in the module comment.
 ///
-/// # Example with EvictionOrder::AbsoluteAccessed
+/// # Example
 ///
 /// Imagine that there are two tenants, A and B, with five layers each, a-e.
 /// Each layer has size 100, and both tenant's min_resident_size is 150.
 /// The eviction order would be
 ///
 /// ```text
-/// partition last_activity_ts tenant/layer
-/// Above     18:30            A/c
-/// Above     19:00            A/b
-/// Above     18:29            B/c
-/// Above     19:05            B/b
-/// Above     20:00            B/a
-/// Above     20:03            A/a
-/// Below     20:30            A/d
-/// Below     20:40            B/d
-/// Below     20:45            B/e
-/// Below     20:58            A/e
+/// partition last_activity_ts    tenant/layer
+/// Above     18:30               A/c
+/// Above     19:00               A/b
+/// Above     18:29               B/c
+/// Above     19:05               B/b
+/// Above     20:00               B/a
+/// Above     20:03               A/a
+/// Below     20:30               A/d
+/// Below     20:40               B/d
+/// Below     20:45               B/e
+/// Below     20:58               A/e
 /// ```
 ///
 /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
@@ -553,77 +505,7 @@ enum EvictionCandidates {
 /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
 /// after exhauting the `Above` partition.
 /// So, we did not respect each tenant's min_resident_size.
-///
-/// # Example with EvictionOrder::RelativeAccessed
-///
-/// ```text
-/// partition relative_age last_activity_ts tenant/layer
-/// Above     0/4          18:30            A/c
-/// Above     0/4          18:29            B/c
-/// Above     1/4          19:00            A/b
-/// Above     1/4          19:05            B/b
-/// Above     2/4          20:00            B/a
-/// Above     2/4          20:03            A/a
-/// Below     3/4          20:30            A/d
-/// Below     3/4          20:40            B/d
-/// Below     4/4          20:45            B/e
-/// Below     4/4          20:58            A/e
-/// ```
-///
-/// With tenants having the same number of layers the picture does not change much. The same with
-/// A having many more layers **resident** (not all of them listed):
-///
-/// ```text
-/// Above       0/100      18:30            A/c
-/// Above       0/4        18:29            B/c
-/// Above       1/100      19:00            A/b
-/// Above       2/100      20:03            A/a
-/// Above       3/100      20:03            A/nth_3
-/// Above       4/100      20:03            A/nth_4
-///             ...
-/// Above       1/4        19:05            B/b
-/// Above      25/100      20:04            A/nth_25
-///             ...
-/// Above       2/4        20:00            B/a
-/// Above      50/100      20:10            A/nth_50
-///             ...
-/// Below       3/4        20:40            B/d
-/// Below      99/100      20:30            A/nth_99
-/// Below       4/4        20:45            B/e
-/// Below     100/100      20:58            A/nth_100
-/// ```
-///
-/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is
-/// difficult to see is what happens on the next round assuming the evicting 23 from the above list
-/// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has
-/// appeared:
-///
-/// ```text
-/// Above       0/87       20:04            A/nth_23
-/// Above       0/3        19:05            B/b
-/// Above       0/50       20:59            C/nth_0
-/// Above       1/87       20:04            A/nth_24
-/// Above       1/50       21:00            C/nth_1
-/// Above       2/87       20:04            A/nth_25
-///             ...
-/// Above      16/50       21:02            C/nth_16
-/// Above       1/3        20:00            B/a
-/// Above      27/87       20:10            A/nth_50
-///             ...
-/// Below       2/3        20:40            B/d
-/// Below      49/50       21:05            C/nth_49
-/// Below      86/87       20:30            A/nth_99
-/// Below       3/3        20:45            B/e
-/// Below      50/50       21:05            C/nth_50
-/// Below      87/87       20:58            A/nth_100
-/// ```
-///
-/// Now relieving pressure with 23 layers would cost:
-/// - tenant A 14 layers
-/// - tenant B 1 layer
-/// - tenant C 8 layers
 async fn collect_eviction_candidates(
-    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
    // get a snapshot of the list of tenants
@@ -709,63 +591,12 @@ async fn collect_eviction_candidates(
        tenant_candidates
            .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
        let mut cumsum: i128 = 0;
-
-        // keeping the -1 or not decides if every tenant should lose their least recently accessed
-        // layer OR if this should happen in the order of having highest layer count:
-        let fudge = if eviction_order.highest_layer_count_loses_first() {
-            // relative_age vs. tenant layer count:
-            // - 0.1..=1.0 (10 layers)
-            // - 0.01..=1.0 (100 layers)
-            // - 0.001..=1.0 (1000 layers)
-            //
-            // leading to evicting less of the smallest tenants.
-            0
-        } else {
-            // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
-            // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
-            // be that less than 10k layer evictions is enough, so we would not need to evict from
-            // all tenants.
-            //
-            // as the tenant ordering is now deterministic this could hit the same tenants
-            // disproportionetly on multiple invocations. alternative could be to remember how many
-            // layers did we evict last time from this tenant, and inject that as an additional
-            // fudge here.
-            1
-        };
-
-        let total = tenant_candidates
-            .len()
-            .checked_sub(fudge)
-            .filter(|&x| x > 0)
-            // support 0 or 1 resident layer tenants as well
-            .unwrap_or(1);
-        let divider = total as f32;
-
-        for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
+        for (timeline, layer_info) in tenant_candidates.into_iter() {
            let file_size = layer_info.file_size();
-
-            // as we iterate this reverse sorted list, the most recently accessed layer will always
-            // be 1.0; this is for us to evict it last.
-            let relative_last_activity = if matches!(
-                eviction_order,
-                EvictionOrder::RelativeAccessed { .. }
-            ) {
-                // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
-                // similarly for u16. unsure how it would help.
-                finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
-                    .unwrap_or_else(|val| {
-                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
-                        finite_f32::FiniteF32::ZERO
-                    })
-            } else {
-                finite_f32::FiniteF32::ZERO
-            };
-
            let candidate = EvictionCandidate {
                timeline,
                last_activity_ts: layer_info.last_activity_ts,
                layer: layer_info.layer,
-                relative_last_activity,
            };
            let partition = if cumsum > min_resident_size as i128 {
                MinResidentSizePartition::Above
@@ -779,19 +610,8 @@ async fn collect_eviction_candidates(

    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
-
-    match eviction_order {
-        EvictionOrder::AbsoluteAccessed => {
-            candidates.sort_unstable_by_key(|(partition, candidate)| {
-                (*partition, candidate.last_activity_ts)
-            });
-        }
-        EvictionOrder::RelativeAccessed { .. } => {
-            candidates.sort_unstable_by_key(|(partition, candidate)| {
-                (*partition, candidate.relative_last_activity)
-            });
-        }
-    }
+    candidates
+        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));

    Ok(EvictionCandidates::Finished(candidates))
 }
@@ -820,66 +640,6 @@ impl std::ops::Deref for TimelineKey {
    }
 }

-/// A totally ordered f32 subset we can use with sorting functions.
-mod finite_f32 {
-
-    /// A totally ordered f32 subset we can use with sorting functions.
-    #[derive(Clone, Copy, PartialEq)]
-    pub struct FiniteF32(f32);
-
-    impl std::fmt::Debug for FiniteF32 {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            std::fmt::Debug::fmt(&self.0, f)
-        }
-    }
-
-    impl std::fmt::Display for FiniteF32 {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            std::fmt::Display::fmt(&self.0, f)
-        }
-    }
-
-    impl std::cmp::Eq for FiniteF32 {}
-
-    impl std::cmp::PartialOrd for FiniteF32 {
-        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
-            Some(self.cmp(other))
-        }
-    }
-
-    impl std::cmp::Ord for FiniteF32 {
-        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-            self.0.total_cmp(&other.0)
-        }
-    }
-
-    impl TryFrom<f32> for FiniteF32 {
-        type Error = f32;
-
-        fn try_from(value: f32) -> Result<Self, Self::Error> {
-            if value.is_finite() {
-                Ok(FiniteF32(value))
-            } else {
-                Err(value)
-            }
-        }
-    }
-
-    impl FiniteF32 {
-        pub const ZERO: FiniteF32 = FiniteF32(0.0);
-
-        pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
-            if (0.0..=1.0).contains(&value) {
-                // -0.0 is within the range, make sure it is assumed 0.0..=1.0
-                let value = value.abs();
-                Ok(FiniteF32(value))
-            } else {
-                Err(value)
-            }
-        }
-    }
-}
-
 mod filesystem_level_usage {
    use anyhow::Context;
    use camino::Utf8Path;
@@ -961,7 +721,6 @@ mod filesystem_level_usage {

    #[test]
    fn max_usage_pct_pressure() {
-        use super::EvictionOrder;
        use super::Usage as _;
        use std::time::Duration;
        use utils::serde_percent::Percent;
@@ -973,7 +732,6 @@ mod filesystem_level_usage {
                period: Duration::MAX,
                #[cfg(feature = "testing")]
                mock_statvfs: None,
-                eviction_order: EvictionOrder::default(),
            },
            total_bytes: 100_000,
            avail_bytes: 0,
--- a/pageserver/src/failpoint_support.rs
+++ b/pageserver/src/failpoint_support.rs
@@ -1,14 +1,3 @@
-//! Failpoint support code shared between pageserver and safekeepers.
-
-use crate::http::{
-    error::ApiError,
-    json::{json_request, json_response},
-};
-use hyper::{Body, Request, Response, StatusCode};
-use serde::{Deserialize, Serialize};
-use tokio_util::sync::CancellationToken;
-use tracing::*;
-
 /// use with fail::cfg("$name", "return(2000)")
 ///
 /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
@@ -36,7 +25,7 @@ pub use __failpoint_sleep_millis_async as sleep_millis_async;
 // Helper function used by the macro. (A function has nicer scoping so we
 // don't need to decorate everything with "::")
 #[doc(hidden)]
-pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
    let millis = duration_str.parse::<u64>().unwrap();
    let d = std::time::Duration::from_millis(millis);

@@ -82,7 +71,7 @@ pub fn init() -> fail::FailScenario<'static> {
    scenario
 }

-pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
+pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
    if actions == "exit" {
        fail::cfg_callback(name, exit_failpoint)
    } else {
@@ -95,45 +84,3 @@ fn exit_failpoint() {
    tracing::info!("Exit requested by failpoint");
    std::process::exit(1);
 }
-
-pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
-
-/// Information for configuring a single fail point
-#[derive(Debug, Serialize, Deserialize)]
-pub struct FailpointConfig {
-    /// Name of the fail point
-    pub name: String,
-    /// List of actions to take, using the format described in `fail::cfg`
-    ///
-    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
-    pub actions: String,
-}
-
-/// Configure failpoints through http.
-pub async fn failpoints_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    if !fail::has_failpoints() {
-        return Err(ApiError::BadRequest(anyhow::anyhow!(
-            "Cannot manage failpoints because storage was compiled without failpoints support"
-        )));
-    }
-
-    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
-    for fp in failpoints {
-        info!("cfg failpoint: {} {}", fp.name, fp.actions);
-
-        // We recognize one extra "action" that's not natively recognized
-        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = apply_failpoint(&fp.name, &fp.actions);
-
-        if let Err(err_msg) = cfg_result {
-            return Err(ApiError::BadRequest(anyhow::anyhow!(
-                "Failed to configure failpoints: {err_msg}"
-            )));
-        }
-    }
-
-    json_response(StatusCode::OK, ())
-}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -159,12 +159,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-        "412":
-          description: Deletion may not proceed, tenant is not in Active state
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/PreconditionFailedError"
        "500":
          description: Generic operation error
          content:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -25,7 +25,6 @@ use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
-use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -67,6 +66,9 @@ use utils::{
    lsn::Lsn,
 };

+// Imports only used for testing APIs
+use pageserver_api::models::ConfigureFailpointsRequest;
+
 // For APIs that require an Active tenant, how long should we block waiting for that state?
 // This is not functionally necessary (clients will retry), but avoids generating a lot of
 // failed API calls while tenants are activating.
@@ -152,7 +154,6 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::AncestorStopping(_) => {
                ApiError::ResourceUnavailable(format!("{pre}").into())
            }
-            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
    }
@@ -307,7 +308,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
-            Cancelled => ApiError::ShuttingDown,
        }
    }
 }
@@ -886,9 +886,7 @@ async fn tenant_delete_handler(

    let state = get_state(&request);

-    state
-        .tenant_manager
-        .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
+    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
        .instrument(info_span!("tenant_delete_handler",
            tenant_id = %tenant_shard_id.tenant_id,
            shard = %tenant_shard_id.shard_slug()
@@ -1292,6 +1290,34 @@ async fn handle_tenant_break(
    json_response(StatusCode::OK, ())
 }

+async fn failpoints_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    if !fail::has_failpoints() {
+        return Err(ApiError::BadRequest(anyhow!(
+            "Cannot manage failpoints because pageserver was compiled without failpoints support"
+        )));
+    }
+
+    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
+    for fp in failpoints {
+        info!("cfg failpoint: {} {}", fp.name, fp.actions);
+
+        // We recognize one extra "action" that's not natively recognized
+        // by the failpoints crate: exit, to immediately kill the process
+        let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
+
+        if let Err(err_msg) = cfg_result {
+            return Err(ApiError::BadRequest(anyhow!(
+                "Failed to configure failpoints: {err_msg}"
+            )));
+        }
+    }
+
+    json_response(StatusCode::OK, ())
+}
+
 // Run GC immediately on given timeline.
 async fn timeline_gc_handler(
    mut request: Request<Body>,
@@ -1540,22 +1566,19 @@ async fn disk_usage_eviction_run(
    struct Config {
        /// How many bytes to evict before reporting that pressure is relieved.
        evict_bytes: u64,
-
-        #[serde(default)]
-        eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
    }

    #[derive(Debug, Clone, Copy, serde::Serialize)]
    struct Usage {
        // remains unchanged after instantiation of the struct
-        evict_bytes: u64,
+        config: Config,
        // updated by `add_available_bytes`
        freed_bytes: u64,
    }

    impl crate::disk_usage_eviction_task::Usage for Usage {
        fn has_pressure(&self) -> bool {
-            self.evict_bytes > self.freed_bytes
+            self.config.evict_bytes > self.freed_bytes
        }

        fn add_available_bytes(&mut self, bytes: u64) {
@@ -1566,7 +1589,7 @@ async fn disk_usage_eviction_run(
    let config = json_request::<Config>(&mut r).await?;

    let usage = Usage {
-        evict_bytes: config.evict_bytes,
+        config,
        freed_bytes: 0,
    };

@@ -1581,11 +1604,7 @@ async fn disk_usage_eviction_run(
    let state = state.disk_usage_eviction_state.clone();

    let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-        &state,
-        storage,
-        usage,
-        config.eviction_order,
-        &cancel,
+        &state, storage, usage, &cancel,
    )
    .await;

--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -21,7 +21,6 @@ use tracing::*;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
-use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
@@ -313,16 +312,13 @@ async fn import_wal(
        waldecoder.feed_bytes(&buf);

        let mut nrecords = 0;
-        let mut modification = tline.begin_modification(last_lsn);
+        let mut modification = tline.begin_modification(endpoint);
        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= endpoint {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                walingest
                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                    .await?;
-                WAL_INGEST.records_committed.inc();
-
-                modification.commit(ctx).await?;
                last_lsn = lsn;

                nrecords += 1;
@@ -452,14 +448,13 @@ pub async fn import_wal_from_tar(

        waldecoder.feed_bytes(&bytes[offset..]);

-        let mut modification = tline.begin_modification(last_lsn);
+        let mut modification = tline.begin_modification(end_lsn);
        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= end_lsn {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                walingest
                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                    .await?;
-                modification.commit(ctx).await?;
                last_lsn = lsn;

                debug!("imported record at {} (end {})", lsn, end_lsn);
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -25,6 +25,8 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+pub mod failpoint_support;
+
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -25,7 +25,6 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
-use std::borrow::Cow;
 use std::io;
 use std::net::TcpListener;
 use std::pin::pin;
@@ -54,7 +53,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::{rel_block_to_key, Version};
+use crate::pgdatadir_mapping::rel_block_to_key;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -62,9 +61,6 @@ use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::ShardSelector;
-use crate::tenant::timeline::WaitLsnError;
-use crate::tenant::GetTimelineError;
-use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

@@ -287,64 +283,6 @@ struct PageServerHandler {
    connection_ctx: RequestContext,
 }

-#[derive(thiserror::Error, Debug)]
-enum PageStreamError {
-    /// We encountered an error that should prompt the client to reconnect:
-    /// in practice this means we drop the connection without sending a response.
-    #[error("Reconnect required: {0}")]
-    Reconnect(Cow<'static, str>),
-
-    /// We were instructed to shutdown while processing the query
-    #[error("Shutting down")]
-    Shutdown,
-
-    /// Something went wrong reading a page: this likely indicates a pageserver bug
-    #[error("Read error: {0}")]
-    Read(PageReconstructError),
-
-    /// Ran out of time waiting for an LSN
-    #[error("LSN timeout: {0}")]
-    LsnTimeout(WaitLsnError),
-
-    /// The entity required to serve the request (tenant or timeline) is not found,
-    /// or is not found in a suitable state to serve a request.
-    #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
-
-    /// Request asked for something that doesn't make sense, like an invalid LSN
-    #[error("Bad request: {0}")]
-    BadRequest(std::borrow::Cow<'static, str>),
-}
-
-impl From<PageReconstructError> for PageStreamError {
-    fn from(value: PageReconstructError) -> Self {
-        match value {
-            PageReconstructError::Cancelled => Self::Shutdown,
-            e => Self::Read(e),
-        }
-    }
-}
-
-impl From<GetActiveTimelineError> for PageStreamError {
-    fn from(value: GetActiveTimelineError) -> Self {
-        match value {
-            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown,
-            GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()),
-            GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()),
-        }
-    }
-}
-
-impl From<WaitLsnError> for PageStreamError {
-    fn from(value: WaitLsnError) -> Self {
-        match value {
-            e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
-            WaitLsnError::Shutdown => Self::Shutdown,
-            WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
-        }
-    }
-}
-
 impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
@@ -490,7 +428,7 @@ impl PageServerHandler {
        // Check that the timeline exists
        let timeline = tenant
            .get_timeline(timeline_id, true)
-            .map_err(|e| QueryError::NotFound(format!("{e}").into()))?;
+            .map_err(|e| anyhow::anyhow!(e))?;

        // Avoid starting new requests if the timeline has already started shutting down,
        // and block timeline shutdown until this request is complete, or drops out due
@@ -582,44 +520,32 @@ impl PageServerHandler {
                }
            };

-            match response {
-                Err(PageStreamError::Shutdown) => {
+            if let Err(e) = &response {
+                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
+                // because wait_lsn etc will drop out
+                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
+                // is_canceled(): [`Timeline::shutdown`]` has entered
+                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
                    // If we fail to fulfil a request during shutdown, which may be _because_ of
                    // shutdown, then do not send the error to the client.  Instead just drop the
                    // connection.
-                    span.in_scope(|| info!("dropping connection due to shutdown"));
+                    span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
                    return Err(QueryError::Shutdown);
                }
-                Err(PageStreamError::Reconnect(reason)) => {
-                    span.in_scope(|| info!("handler requested reconnect: {reason}"));
-                    return Err(QueryError::Reconnect);
-                }
-                Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => {
-                    // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
-                    // shutdown error, this may be buried inside a PageReconstructError::Other for example.
-                    //
-                    // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
-                    // because wait_lsn etc will drop out
-                    // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
-                    // is_canceled(): [`Timeline::shutdown`]` has entered
-                    span.in_scope(|| info!("dropped error response during shutdown: {e:#}"));
-                    return Err(QueryError::Shutdown);
-                }
-                r => {
-                    let response_msg = r.unwrap_or_else(|e| {
-                        // print the all details to the log with {:#}, but for the client the
-                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                        // here includes cancellation which is not an error.
-                        span.in_scope(|| error!("error reading relation or page version: {:#}", e));
-                        PagestreamBeMessage::Error(PagestreamErrorResponse {
-                            message: e.to_string(),
-                        })
-                    });
-
-                    pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
-                    self.flush_cancellable(pgb, &timeline.cancel).await?;
-                }
            }
+
+            let response = response.unwrap_or_else(|e| {
+                // print the all details to the log with {:#}, but for the client the
+                // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                // here includes cancellation which is not an error.
+                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
+                PagestreamBeMessage::Error(PagestreamErrorResponse {
+                    message: e.to_string(),
+                })
+            });
+
+            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
+            self.flush_cancellable(pgb, &timeline.cancel).await?;
        }
        Ok(())
    }
@@ -766,7 +692,7 @@ impl PageServerHandler {
        latest: bool,
        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
        ctx: &RequestContext,
-    ) -> Result<Lsn, PageStreamError> {
+    ) -> anyhow::Result<Lsn> {
        if latest {
            // Latest page version was requested. If LSN is given, it is a hint
            // to the page server that there have been no modifications to the
@@ -797,19 +723,15 @@ impl PageServerHandler {
            }
        } else {
            if lsn == Lsn(0) {
-                return Err(PageStreamError::BadRequest(
-                    "invalid LSN(0) in request".into(),
-                ));
+                anyhow::bail!("invalid LSN(0) in request");
            }
            timeline.wait_lsn(lsn, ctx).await?;
        }
-
-        if lsn < **latest_gc_cutoff_lsn {
-            return Err(PageStreamError::BadRequest(format!(
-                "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
-                lsn, **latest_gc_cutoff_lsn
-            ).into()));
-        }
+        anyhow::ensure!(
+            lsn >= **latest_gc_cutoff_lsn,
+            "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
+            lsn, **latest_gc_cutoff_lsn
+        );
        Ok(lsn)
    }

@@ -818,14 +740,14 @@ impl PageServerHandler {
        timeline: &Timeline,
        req: &PagestreamExistsRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

        let exists = timeline
-            .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
+            .get_rel_exists(req.rel, lsn, req.latest, ctx)
            .await?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -838,15 +760,13 @@ impl PageServerHandler {
        timeline: &Timeline,
        req: &PagestreamNblocksRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

-        let n_blocks = timeline
-            .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
-            .await?;
+        let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
            n_blocks,
@@ -858,20 +778,14 @@ impl PageServerHandler {
        timeline: &Timeline,
        req: &PagestreamDbSizeRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

        let total_blocks = timeline
-            .get_db_size(
-                DEFAULTTABLESPACE_OID,
-                req.dbnode,
-                Version::Lsn(lsn),
-                req.latest,
-                ctx,
-            )
+            .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
            .await?;
        let db_size = total_blocks as i64 * BLCKSZ as i64;

@@ -880,35 +794,30 @@ impl PageServerHandler {
        }))
    }

-    async fn do_handle_get_page_at_lsn_request(
-        &self,
-        timeline: &Timeline,
-        req: &PagestreamGetPageRequest,
-        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn =
-            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
-                .await?;
-        let page = timeline
-            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
-            .await?;
-
-        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
-            page,
-        }))
-    }
-
    async fn handle_get_page_at_lsn_request(
        &self,
        timeline: &Timeline,
        req: &PagestreamGetPageRequest,
        ctx: &RequestContext,
-    ) -> Result<PagestreamBeMessage, PageStreamError> {
+    ) -> anyhow::Result<PagestreamBeMessage> {
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;
+        /*
+        // Add a 1s delay to some requests. The delay helps the requests to
+        // hit the race condition from github issue #1047 more easily.
+        use rand::Rng;
+        if rand::thread_rng().gen::<u8>() < 5 {
+            std::thread::sleep(std::time::Duration::from_millis(1000));
+        }
+        */
+
        let key = rel_block_to_key(req.rel, req.blkno);
-        if timeline.get_shard_identity().is_key_local(&key) {
-            self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
-                .await
+        let page = if timeline.get_shard_identity().is_key_local(&key) {
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
        } else {
            // The Tenant shard we looked up at connection start does not hold this particular
            // key: look for other shards in this tenant.  This scenario occurs if a pageserver
@@ -927,30 +836,30 @@ impl PageServerHandler {
                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
                    // We already know this tenant exists in general, because we resolved it at
                    // start of connection.  Getting a NotFound here indicates that the shard containing
-                    // the requested page is not present on this node: the client's knowledge of shard->pageserver
-                    // mapping is out of date.
-                    tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
-                        timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
-                    // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
-                    // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
-                    // and talk to a different pageserver.
-                    return Err(PageStreamError::Reconnect(
-                        "getpage@lsn request routed to wrong shard".into(),
-                    ));
+                    // the requested page is not present on this node.
+
+                    // TODO: this should be some kind of structured error that the client will understand,
+                    // so that it can block until its config is updated: this error is expected in the case
+                    // that the Tenant's shards' placements are being updated and the client hasn't been
+                    // informed yet.
+                    //
+                    // https://github.com/neondatabase/neon/issues/6038
+                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
                }
                Err(e) => return Err(e.into()),
            };

            // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
            // the GateGuard was already held over the whole connection.
-            let _timeline_guard = timeline
-                .gate
-                .enter()
-                .map_err(|_| PageStreamError::Shutdown)?;
+            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
+        };

-            self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
-                .await
-        }
+        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
+            page,
+        }))
    }

    #[allow(clippy::too_many_arguments)]
@@ -1091,7 +1000,9 @@ impl PageServerHandler {
        )
        .await
        .map_err(GetActiveTimelineError::Tenant)?;
-        let timeline = tenant.get_timeline(timeline_id, true)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, true)
+            .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
        Ok(timeline)
    }
 }
@@ -1513,15 +1424,14 @@ enum GetActiveTimelineError {
    #[error(transparent)]
    Tenant(GetActiveTenantError),
    #[error(transparent)]
-    Timeline(#[from] GetTimelineError),
+    Timeline(anyhow::Error),
 }

 impl From<GetActiveTimelineError> for QueryError {
    fn from(e: GetActiveTimelineError) -> Self {
        match e {
-            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
            GetActiveTimelineError::Tenant(e) => e.into(),
-            GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
+            GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
        }
    }
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -11,7 +11,7 @@ use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
-use anyhow::{ensure, Context};
+use anyhow::Context;
 use bytes::{Buf, Bytes};
 use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
@@ -147,7 +147,6 @@ impl Timeline {
    {
        DatadirModification {
            tline: self,
-            pending_lsns: Vec::new(),
            pending_updates: HashMap::new(),
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
@@ -160,11 +159,11 @@ impl Timeline {
    //------------------------------------------------------------------------------

    /// Look up given page version.
-    pub(crate) async fn get_rel_page_at_lsn(
+    pub async fn get_rel_page_at_lsn(
        &self,
        tag: RelTag,
        blknum: BlockNumber,
-        version: Version<'_>,
+        lsn: Lsn,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
@@ -174,47 +173,44 @@ impl Timeline {
            ));
        }

-        let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
+        let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
        if blknum >= nblocks {
            debug!(
                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                tag,
-                blknum,
-                version.get_lsn(),
-                nblocks
+                tag, blknum, lsn, nblocks
            );
            return Ok(ZERO_PAGE.clone());
        }

        let key = rel_block_to_key(tag, blknum);
-        version.get(self, key, ctx).await
+        self.get(key, lsn, ctx).await
    }

    // Get size of a database in blocks
-    pub(crate) async fn get_db_size(
+    pub async fn get_db_size(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        version: Version<'_>,
+        lsn: Lsn,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<usize, PageReconstructError> {
        let mut total_blocks = 0;

-        let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
+        let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;

        for rel in rels {
-            let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
+            let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
            total_blocks += n_blocks as usize;
        }
        Ok(total_blocks)
    }

    /// Get size of a relation file
-    pub(crate) async fn get_rel_size(
+    pub async fn get_rel_size(
        &self,
        tag: RelTag,
-        version: Version<'_>,
+        lsn: Lsn,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
@@ -224,12 +220,12 @@ impl Timeline {
            ));
        }

-        if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
+        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
            return Ok(nblocks);
        }

        if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, version, latest, ctx).await?
+            && !self.get_rel_exists(tag, lsn, latest, ctx).await?
        {
            // FIXME: Postgres sometimes calls smgrcreate() to create
            // FSM, and smgrnblocks() on it immediately afterwards,
@@ -239,7 +235,7 @@ impl Timeline {
        }

        let key = rel_size_to_key(tag);
-        let mut buf = version.get(self, key, ctx).await?;
+        let mut buf = self.get(key, lsn, ctx).await?;
        let nblocks = buf.get_u32_le();

        if latest {
@@ -250,16 +246,16 @@ impl Timeline {
            // latest=true, then it can not cause cache corruption, because with latest=true
            // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
            // associated with most recent value of LSN.
-            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+            self.update_cached_rel_size(tag, lsn, nblocks);
        }
        Ok(nblocks)
    }

    /// Does relation exist?
-    pub(crate) async fn get_rel_exists(
+    pub async fn get_rel_exists(
        &self,
        tag: RelTag,
-        version: Version<'_>,
+        lsn: Lsn,
        _latest: bool,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
@@ -270,12 +266,12 @@ impl Timeline {
        }

        // first try to lookup relation in cache
-        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
+        if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
            return Ok(true);
        }
        // fetch directory listing
        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;

        match RelDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -291,16 +287,16 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub(crate) async fn list_rels(
+    pub async fn list_rels(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<HashSet<RelTag>, PageReconstructError> {
        // fetch directory listing
        let key = rel_dir_to_key(spcnode, dbnode);
-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;

        match RelDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -319,7 +315,7 @@ impl Timeline {
    }

    /// Look up given SLRU page version.
-    pub(crate) async fn get_slru_page_at_lsn(
+    pub async fn get_slru_page_at_lsn(
        &self,
        kind: SlruKind,
        segno: u32,
@@ -332,29 +328,29 @@ impl Timeline {
    }

    /// Get size of an SLRU segment
-    pub(crate) async fn get_slru_segment_size(
+    pub async fn get_slru_segment_size(
        &self,
        kind: SlruKind,
        segno: u32,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        let key = slru_segment_size_to_key(kind, segno);
-        let mut buf = version.get(self, key, ctx).await?;
+        let mut buf = self.get(key, lsn, ctx).await?;
        Ok(buf.get_u32_le())
    }

    /// Get size of an SLRU segment
-    pub(crate) async fn get_slru_segment_exists(
+    pub async fn get_slru_segment_exists(
        &self,
        kind: SlruKind,
        segno: u32,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        // fetch directory listing
        let key = slru_dir_to_key(kind);
-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;

        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -372,7 +368,7 @@ impl Timeline {
    /// so it's not well defined which LSN you get if there were multiple commits
    /// "in flight" at that point in time.
    ///
-    pub(crate) async fn find_lsn_for_timestamp(
+    pub async fn find_lsn_for_timestamp(
        &self,
        search_timestamp: TimestampTz,
        cancel: &CancellationToken,
@@ -452,7 +448,7 @@ impl Timeline {
    /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
    /// with a smaller/larger timestamp.
    ///
-    pub(crate) async fn is_latest_commit_timestamp_ge_than(
+    pub async fn is_latest_commit_timestamp_ge_than(
        &self,
        search_timestamp: TimestampTz,
        probe_lsn: Lsn,
@@ -475,7 +471,7 @@ impl Timeline {
    /// Obtain the possible timestamp range for the given lsn.
    ///
    /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
-    pub(crate) async fn get_timestamp_for_lsn(
+    pub async fn get_timestamp_for_lsn(
        &self,
        probe_lsn: Lsn,
        ctx: &RequestContext,
@@ -505,11 +501,11 @@ impl Timeline {
        mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
    ) -> Result<T, PageReconstructError> {
        for segno in self
-            .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
+            .list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
            .await?
        {
            let nblocks = self
-                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
+                .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
                .await?;
            for blknum in (0..nblocks).rev() {
                let clog_page = self
@@ -532,36 +528,36 @@ impl Timeline {
    }

    /// Get a list of SLRU segments
-    pub(crate) async fn list_slru_segments(
+    pub async fn list_slru_segments(
        &self,
        kind: SlruKind,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<HashSet<u32>, PageReconstructError> {
        // fetch directory entry
        let key = slru_dir_to_key(kind);

-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;
        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => Ok(dir.segments),
            Err(e) => Err(PageReconstructError::from(e)),
        }
    }

-    pub(crate) async fn get_relmap_file(
+    pub async fn get_relmap_file(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        version: Version<'_>,
+        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        let key = relmap_file_key(spcnode, dbnode);

-        let buf = version.get(self, key, ctx).await?;
+        let buf = self.get(key, lsn, ctx).await?;
        Ok(buf)
    }

-    pub(crate) async fn list_dbdirs(
+    pub async fn list_dbdirs(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -575,7 +571,7 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_twophase_file(
+    pub async fn get_twophase_file(
        &self,
        xid: TransactionId,
        lsn: Lsn,
@@ -586,7 +582,7 @@ impl Timeline {
        Ok(buf)
    }

-    pub(crate) async fn list_twophase_files(
+    pub async fn list_twophase_files(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -600,7 +596,7 @@ impl Timeline {
        }
    }

-    pub(crate) async fn get_control_file(
+    pub async fn get_control_file(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -608,7 +604,7 @@ impl Timeline {
        self.get(CONTROLFILE_KEY, lsn, ctx).await
    }

-    pub(crate) async fn get_checkpoint(
+    pub async fn get_checkpoint(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -616,7 +612,7 @@ impl Timeline {
        self.get(CHECKPOINT_KEY, lsn, ctx).await
    }

-    pub(crate) async fn list_aux_files(
+    pub async fn list_aux_files(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -656,10 +652,7 @@ impl Timeline {

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self
-                .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
-                .await?
-            {
+            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
                if self.cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
@@ -699,7 +692,7 @@ impl Timeline {
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
+                .list_rels(spcnode, dbnode, lsn, ctx)
                .await?
                .into_iter()
                .collect();
@@ -806,39 +799,18 @@ pub struct DatadirModification<'a> {
    /// in the state in 'tline' yet.
    pub tline: &'a Timeline,

-    /// Current LSN of the modification
-    lsn: Lsn,
+    /// Lsn assigned by begin_modification
+    pub lsn: Lsn,

    // The modifications are not applied directly to the underlying key-value store.
    // The put-functions add the modifications here, and they are flushed to the
    // underlying key-value store by the 'finish' function.
-    pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
-    pending_deletions: Vec<(Range<Key>, Lsn)>,
+    pending_updates: HashMap<Key, Value>,
+    pending_deletions: Vec<Range<Key>>,
    pending_nblocks: i64,
 }

 impl<'a> DatadirModification<'a> {
-    /// Get the current lsn
-    pub(crate) fn get_lsn(&self) -> Lsn {
-        self.lsn
-    }
-
-    /// Set the current lsn
-    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
-        ensure!(
-            lsn >= self.lsn,
-            "setting an older lsn {} than {} is not allowed",
-            lsn,
-            self.lsn
-        );
-        if lsn > self.lsn {
-            self.pending_lsns.push(self.lsn);
-            self.lsn = lsn;
-        }
-        Ok(())
-    }
-
    /// Initialize a completely new repository.
    ///
    /// This inserts the directory metadata entries that are assumed to
@@ -1012,9 +984,11 @@ impl<'a> DatadirModification<'a> {
        dbnode: Oid,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        let req_lsn = self.tline.get_last_record_lsn();
+
        let total_blocks = self
            .tline
-            .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
+            .get_db_size(spcnode, dbnode, req_lsn, true, ctx)
            .await?;

        // Remove entry from dbdir
@@ -1103,11 +1077,8 @@ impl<'a> DatadirModification<'a> {
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
-        if self
-            .tline
-            .get_rel_exists(rel, Version::Modified(self), true, ctx)
-            .await?
-        {
+        let last_lsn = self.tline.get_last_record_lsn();
+        if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
            let size_key = rel_size_to_key(rel);
            // Fetch the old size first
            let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1352,23 +1323,17 @@ impl<'a> DatadirModification<'a> {
        let writer = self.tline.writer().await;

        // Flush relation and  SLRU data blocks, keep metadata.
-        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
-        for (key, values) in self.pending_updates.drain() {
-            for (lsn, value) in values {
-                if is_rel_block_key(&key) || is_slru_block_key(key) {
-                    // This bails out on first error without modifying pending_updates.
-                    // That's Ok, cf this function's doc comment.
-                    writer.put(key, lsn, &value, ctx).await?;
-                } else {
-                    retained_pending_updates
-                        .entry(key)
-                        .or_default()
-                        .push((lsn, value));
-                }
+        let mut retained_pending_updates = HashMap::new();
+        for (key, value) in self.pending_updates.drain() {
+            if is_rel_block_key(&key) || is_slru_block_key(key) {
+                // This bails out on first error without modifying pending_updates.
+                // That's Ok, cf this function's doc comment.
+                writer.put(key, self.lsn, &value, ctx).await?;
+            } else {
+                retained_pending_updates.insert(key, value);
            }
        }
-
-        self.pending_updates = retained_pending_updates;
+        self.pending_updates.extend(retained_pending_updates);

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1385,28 +1350,18 @@ impl<'a> DatadirModification<'a> {
    ///
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        let writer = self.tline.writer().await;
-
+        let lsn = self.lsn;
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

-        if !self.pending_updates.is_empty() {
-            writer.put_batch(&self.pending_updates, ctx).await?;
-            self.pending_updates.clear();
+        for (key, value) in self.pending_updates.drain() {
+            writer.put(key, lsn, &value, ctx).await?;
+        }
+        for key_range in self.pending_deletions.drain(..) {
+            writer.delete(key_range, lsn).await?;
        }

-        if !self.pending_deletions.is_empty() {
-            writer.delete_batch(&self.pending_deletions).await?;
-            self.pending_deletions.clear();
-        }
-
-        self.pending_lsns.push(self.lsn);
-        for pending_lsn in self.pending_lsns.drain(..) {
-            // Ideally, we should be able to call writer.finish_write() only once
-            // with the highest LSN. However, the last_record_lsn variable in the
-            // timeline keeps track of the latest LSN and the immediate previous LSN
-            // so we need to record every LSN to not leave a gap between them.
-            writer.finish_write(pending_lsn);
-        }
+        writer.finish_write(lsn);

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1415,86 +1370,44 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub(crate) fn len(&self) -> usize {
-        self.pending_updates.len() + self.pending_deletions.len()
+    pub(crate) fn is_empty(&self) -> bool {
+        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
    }

    // Internal helper functions to batch the modifications

    async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
-        // Have we already updated the same key? Read the latest pending updated
+        // Have we already updated the same key? Read the pending updated
        // version in that case.
        //
        // Note: we don't check pending_deletions. It is an error to request a
        // value that has been removed, deletion only avoids leaking storage.
-        if let Some(values) = self.pending_updates.get(&key) {
-            if let Some((_, value)) = values.last() {
-                return if let Value::Image(img) = value {
-                    Ok(img.clone())
-                } else {
-                    // Currently, we never need to read back a WAL record that we
-                    // inserted in the same "transaction". All the metadata updates
-                    // work directly with Images, and we never need to read actual
-                    // data pages. We could handle this if we had to, by calling
-                    // the walredo manager, but let's keep it simple for now.
-                    Err(PageReconstructError::from(anyhow::anyhow!(
-                        "unexpected pending WAL record"
-                    )))
-                };
+        if let Some(value) = self.pending_updates.get(&key) {
+            if let Value::Image(img) = value {
+                Ok(img.clone())
+            } else {
+                // Currently, we never need to read back a WAL record that we
+                // inserted in the same "transaction". All the metadata updates
+                // work directly with Images, and we never need to read actual
+                // data pages. We could handle this if we had to, by calling
+                // the walredo manager, but let's keep it simple for now.
+                Err(PageReconstructError::from(anyhow::anyhow!(
+                    "unexpected pending WAL record"
+                )))
            }
+        } else {
+            let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+            self.tline.get(key, lsn, ctx).await
        }
-        let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
-        self.tline.get(key, lsn, ctx).await
    }

    fn put(&mut self, key: Key, val: Value) {
-        let values = self.pending_updates.entry(key).or_default();
-        // Replace the previous value if it exists at the same lsn
-        if let Some((last_lsn, last_value)) = values.last_mut() {
-            if *last_lsn == self.lsn {
-                *last_value = val;
-                return;
-            }
-        }
-        values.push((self.lsn, val));
+        self.pending_updates.insert(key, val);
    }

    fn delete(&mut self, key_range: Range<Key>) {
        trace!("DELETE {}-{}", key_range.start, key_range.end);
-        self.pending_deletions.push((key_range, self.lsn));
-    }
-}
-
-/// This struct facilitates accessing either a committed key from the timeline at a
-/// specific LSN, or the latest uncommitted key from a pending modification.
-/// During WAL ingestion, the records from multiple LSNs may be batched in the same
-/// modification before being flushed to the timeline. Hence, the routines in WalIngest
-/// need to look up the keys in the modification first before looking them up in the
-/// timeline to not miss the latest updates.
-#[derive(Clone, Copy)]
-pub enum Version<'a> {
-    Lsn(Lsn),
-    Modified(&'a DatadirModification<'a>),
-}
-
-impl<'a> Version<'a> {
-    async fn get(
-        &self,
-        timeline: &Timeline,
-        key: Key,
-        ctx: &RequestContext,
-    ) -> Result<Bytes, PageReconstructError> {
-        match self {
-            Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
-            Version::Modified(modification) => modification.get(key, ctx).await,
-        }
-    }
-
-    fn get_lsn(&self) -> Lsn {
-        match self {
-            Version::Lsn(lsn) => *lsn,
-            Version::Modified(modification) => modification.lsn,
-        }
+        self.pending_deletions.push(key_range);
    }
 }

@@ -1863,7 +1776,6 @@ pub fn is_inherited_key(key: Key) -> bool {
    key != AUX_FILES_KEY
 }

-/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    Ok(match key.field1 {
        0x00 => (
@@ -1878,6 +1790,7 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
    })
 }
+
 pub fn is_rel_fsm_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
 }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
    // else, but that has not been needed in a long time.
    std::env::var("TOKIO_WORKER_THREADS")
        .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
+        .unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
 });

 #[derive(Debug, Clone, Copy)]
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -33,7 +33,6 @@ use tracing::*;
 use utils::backoff;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
-use utils::failpoint_support;
 use utils::fs_ext;
 use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
@@ -56,7 +55,6 @@ use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
-use self::timeline::WaitLsnError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
@@ -596,9 +594,10 @@ impl Tenant {
        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
+        // TODO(sharding): make WalRedoManager shard-aware
        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
            conf,
-            tenant_shard_id,
+            tenant_shard_id.tenant_id,
        )));

        let TenantSharedResources {
@@ -891,7 +890,7 @@ impl Tenant {
    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

-        failpoint_support::sleep_millis_async!("before-attaching-tenant");
+        crate::failpoint_support::sleep_millis_async!("before-attaching-tenant");

        let preload = match preload {
            Some(p) => p,
@@ -1003,7 +1002,7 @@ impl Tenant {
        // IndexPart is the source of truth.
        self.clean_up_timelines(&existent_timelines)?;

-        failpoint_support::sleep_millis_async!("attach-before-activate");
+        crate::failpoint_support::sleep_millis_async!("attach-before-activate");

        info!("Done");

@@ -1145,9 +1144,10 @@ impl Tenant {
        tenant_shard_id: TenantShardId,
        reason: String,
    ) -> Arc<Tenant> {
+        // TODO(sharding): make WalRedoManager shard-aware
        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
            conf,
-            tenant_shard_id,
+            tenant_shard_id.tenant_id,
        )));
        Arc::new(Tenant::new(
            TenantState::Broken {
@@ -1759,15 +1759,7 @@ impl Tenant {
                    // decoding the new WAL might need to look up previous pages, relation
                    // sizes etc. and that would get confused if the previous page versions
                    // are not in the repository yet.
-                    ancestor_timeline
-                        .wait_lsn(*lsn, ctx)
-                        .await
-                        .map_err(|e| match e {
-                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
-                                CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
-                            }
-                            WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
-                        })?;
+                    ancestor_timeline.wait_lsn(*lsn, ctx).await?;
                }

                self.branch_timeline(
@@ -2847,7 +2839,9 @@ impl Tenant {
            }
        };

-        failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
+        crate::failpoint_support::sleep_millis_async!(
+            "gc_iteration_internal_after_getting_gc_timelines"
+        );

        // If there is nothing to GC, we don't want any messages in the INFO log.
        if !gc_timelines.is_empty() {
@@ -3140,7 +3134,6 @@ impl Tenant {

    /// For unit tests, make this visible so that other modules can directly create timelines
    #[cfg(test)]
-    #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
    pub(crate) async fn bootstrap_timeline_test(
        &self,
        timeline_id: TimelineId,
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -46,8 +46,6 @@ pub mod defaults {
    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
-
-    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -48,9 +48,6 @@ pub(crate) enum DeleteTenantError {
    #[error("Timeline {0}")]
    Timeline(#[from] DeleteTimelineError),

-    #[error("Cancelled")]
-    Cancelled,
-
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -130,7 +130,7 @@ impl TenantsMap {

    /// A page service client sends a TenantId, and to look up the correct Tenant we must
    /// resolve this to a fully qualified TenantShardId.
-    fn resolve_attached_shard(
+    fn resolve_shard(
        &self,
        tenant_id: &TenantId,
        selector: ShardSelector,
@@ -140,27 +140,25 @@ impl TenantsMap {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
-                    // Ignore all slots that don't contain an attached tenant
-                    let tenant = match &slot.1 {
-                        TenantSlot::Attached(t) => t,
-                        _ => continue,
-                    };
-
                    match selector {
                        ShardSelector::First => return Some(*slot.0),
                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                            return Some(*slot.0)
                        }
                        ShardSelector::Page(key) => {
-                            // First slot we see for this tenant, calculate the expected shard number
-                            // for the key: we will use this for checking if this and subsequent
-                            // slots contain the key, rather than recalculating the hash each time.
-                            if want_shard.is_none() {
-                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                            }
+                            if let Some(tenant) = slot.1.get_attached() {
+                                // First slot we see for this tenant, calculate the expected shard number
+                                // for the key: we will use this for checking if this and subsequent
+                                // slots contain the key, rather than recalculating the hash each time.
+                                if want_shard.is_none() {
+                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                                }

-                            if Some(tenant.shard_identity.number) == want_shard {
-                                return Some(*slot.0);
+                                if Some(tenant.shard_identity.number) == want_shard {
+                                    return Some(*slot.0);
+                                }
+                            } else {
+                                continue;
                            }
                        }
                        _ => continue,
@@ -1093,71 +1091,6 @@ impl TenantManager {
                .collect(),
        }
    }
-
-    pub(crate) async fn delete_tenant(
-        &self,
-        tenant_shard_id: TenantShardId,
-        activation_timeout: Duration,
-    ) -> Result<(), DeleteTenantError> {
-        // We acquire a SlotGuard during this function to protect against concurrent
-        // changes while the ::prepare phase of DeleteTenantFlow executes, but then
-        // have to return the Tenant to the map while the background deletion runs.
-        //
-        // TODO: refactor deletion to happen outside the lifetime of a Tenant.
-        // Currently, deletion requires a reference to the tenants map in order to
-        // keep the Tenant in the map until deletion is complete, and then remove
-        // it at the end.
-        //
-        // See https://github.com/neondatabase/neon/issues/5080
-
-        let slot_guard =
-            tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
-
-        // unwrap is safe because we used MustExist mode when acquiring
-        let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
-            TenantSlot::Attached(tenant) => tenant.clone(),
-            _ => {
-                // Express "not attached" as equivalent to "not found"
-                return Err(DeleteTenantError::NotAttached);
-            }
-        };
-
-        match tenant.current_state() {
-            TenantState::Broken { .. } | TenantState::Stopping { .. } => {
-                // If a tenant is broken or stopping, DeleteTenantFlow can
-                // handle it: broken tenants proceed to delete, stopping tenants
-                // are checked for deletion already in progress.
-            }
-            _ => {
-                tenant
-                    .wait_to_become_active(activation_timeout)
-                    .await
-                    .map_err(|e| match e {
-                        GetActiveTenantError::WillNotBecomeActive(_) => {
-                            DeleteTenantError::InvalidState(tenant.current_state())
-                        }
-                        GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
-                        GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
-                        GetActiveTenantError::WaitForActiveTimeout {
-                            latest_state: _latest_state,
-                            wait_time: _wait_time,
-                        } => DeleteTenantError::InvalidState(tenant.current_state()),
-                    })?;
-            }
-        }
-
-        let result = DeleteTenantFlow::run(
-            self.conf,
-            self.resources.remote_storage.clone(),
-            &TENANTS,
-            tenant,
-        )
-        .await;
-
-        // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
-        slot_guard.revert();
-        result
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1259,11 +1192,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
        let locked = TENANTS.read().unwrap();

        // Resolve TenantId to TenantShardId
-        let tenant_shard_id = locked
-            .resolve_attached_shard(&tenant_id, shard_selector)
-            .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
-                tenant_id,
-            )))?;
+        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
+            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
+        )?;

        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
            .map_err(GetTenantError::MapState)?;
@@ -1337,6 +1268,41 @@ pub(crate) async fn get_active_tenant_with_timeout(
    Ok(tenant)
 }

+pub(crate) async fn delete_tenant(
+    conf: &'static PageServerConf,
+    remote_storage: Option<GenericRemoteStorage>,
+    tenant_shard_id: TenantShardId,
+) -> Result<(), DeleteTenantError> {
+    // We acquire a SlotGuard during this function to protect against concurrent
+    // changes while the ::prepare phase of DeleteTenantFlow executes, but then
+    // have to return the Tenant to the map while the background deletion runs.
+    //
+    // TODO: refactor deletion to happen outside the lifetime of a Tenant.
+    // Currently, deletion requires a reference to the tenants map in order to
+    // keep the Tenant in the map until deletion is complete, and then remove
+    // it at the end.
+    //
+    // See https://github.com/neondatabase/neon/issues/5080
+
+    // TODO(sharding): make delete API sharding-aware
+    let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+
+    // unwrap is safe because we used MustExist mode when acquiring
+    let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
+        TenantSlot::Attached(tenant) => tenant.clone(),
+        _ => {
+            // Express "not attached" as equivalent to "not found"
+            return Err(DeleteTenantError::NotAttached);
+        }
+    };
+
+    let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await;
+
+    // The Tenant goes back into the map in Stopping state; it will eventually be removed by DeleteTenantFlow
+    slot_guard.revert();
+    result
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum DeleteTimelineError {
    #[error("Tenant {0}")]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -818,25 +818,8 @@ impl RemoteTimelineClient {
    fn schedule_deletion_of_unlinked0(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
+        with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
    ) {
-        // Filter out any layers which were not created by this tenant shard.  These are
-        // layers that originate from some ancestor shard after a split, and may still
-        // be referenced by other shards. We are free to delete them locally and remove
-        // them from our index (and would have already done so when we reach this point
-        // in the code), but we may not delete them remotely.
-        with_metadata.retain(|(name, meta)| {
-            let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
-                && meta.shard.shard_count == self.tenant_shard_id.shard_count;
-            if !retain {
-                tracing::debug!(
-                    "Skipping deletion of ancestor-shard layer {name}, from shard {}",
-                    meta.shard
-                );
-            }
-            retain
-        });
-
        for (name, meta) in &with_metadata {
            info!(
                "scheduling deletion of layer {}{} (shard {})",
@@ -2209,6 +2192,15 @@ mod tests {

        let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();

+        let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID);
+        let remote_timeline_dir = test_state.harness.remote_fs_dir.join(
+            timeline_path
+                .strip_prefix(&test_state.harness.conf.workdir)
+                .unwrap(),
+        );
+
+        std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
+
        let index_path = test_state.harness.remote_fs_dir.join(
            remote_index_path(
                &test_state.harness.tenant_shard_id,
@@ -2217,10 +2209,6 @@ mod tests {
            )
            .get_path(),
        );
-
-        std::fs::create_dir_all(index_path.parent().unwrap())
-            .expect("creating test dir should work");
-
        eprintln!("Writing {index_path}");
        std::fs::write(&index_path, index_part_bytes).unwrap();
        example_index_part
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use tokio::sync::{RwLock, RwLockWriteGuard};
+use tokio::sync::RwLock;

 use super::{DeltaLayerWriter, ResidentLayer};

@@ -246,43 +246,16 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub(crate) async fn put_value(
+    pub async fn put_value(
        &self,
        key: Key,
        lsn: Lsn,
        val: &Value,
        ctx: &RequestContext,
-    ) -> Result<()> {
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, val, ctx).await
-    }
-
-    pub(crate) async fn put_values(
-        &self,
-        values: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> Result<()> {
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
-        for (key, vals) in values {
-            for (lsn, val) in vals {
-                self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
-                    .await?;
-            }
-        }
-        Ok(())
-    }
-
-    async fn put_value_locked(
-        &self,
-        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
-        key: Key,
-        lsn: Lsn,
-        val: &Value,
-        ctx: &RequestContext,
    ) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
+        let inner: &mut _ = &mut *self.inner.write().await;
+        self.assert_writable();

        let off = {
            // Avoid doing allocations for "small" values.
@@ -291,7 +264,7 @@ impl InMemoryLayer {
            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
            buf.clear();
            val.ser_into(&mut buf)?;
-            locked_inner
+            inner
                .file
                .write_blob(
                    &buf,
@@ -302,7 +275,7 @@ impl InMemoryLayer {
                .await?
        };

-        let vec_map = locked_inner.index.entry(key).or_default();
+        let vec_map = inner.index.entry(key).or_default();
        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
@@ -312,11 +285,13 @@ impl InMemoryLayer {
        Ok(())
    }

-    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
+    pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
        // TODO: Currently, we just leak the storage for any deleted keys
+
        Ok(())
    }

+    /// Make the layer non-writeable. Only call once.
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
    pub async fn freeze(&self, end_lsn: Lsn) {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -373,20 +373,15 @@ pub struct GcInfo {
 }

 /// An error happened in a get() operation.
-#[derive(thiserror::Error, Debug)]
-pub(crate) enum PageReconstructError {
+#[derive(thiserror::Error)]
+pub enum PageReconstructError {
    #[error(transparent)]
    Other(#[from] anyhow::Error),

-    #[error("Ancestor LSN wait error: {0}")]
-    AncestorLsnTimeout(#[from] WaitLsnError),
-
    /// The operation was cancelled
-    #[error("Cancelled")]
    Cancelled,

    /// The ancestor of this is being stopped
-    #[error("ancestor timeline {0} is being stopped")]
    AncestorStopping(TimelineId),

    /// An error happened replaying WAL records
@@ -407,6 +402,32 @@ enum FlushLayerError {
    Other(#[from] anyhow::Error),
 }

+impl std::fmt::Debug for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            Self::Other(err) => err.fmt(f),
+            Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
+            Self::WalRedo(err) => err.fmt(f),
+        }
+    }
+}
+
+impl std::fmt::Display for PageReconstructError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            Self::Other(err) => err.fmt(f),
+            Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
+            Self::WalRedo(err) => err.fmt(f),
+        }
+    }
+}
+
 #[derive(Clone, Copy)]
 pub enum LogicalSizeCalculationCause {
    Initial,
@@ -431,21 +452,6 @@ impl std::fmt::Debug for Timeline {
    }
 }

-#[derive(thiserror::Error, Debug)]
-pub(crate) enum WaitLsnError {
-    // Called on a timeline which is shutting down
-    #[error("Shutdown")]
-    Shutdown,
-
-    // Called on an timeline not in active state or shutting down
-    #[error("Bad state (not active)")]
-    BadState,
-
-    // Timeout expired while waiting for LSN to catch up with goal.
-    #[error("{0}")]
-    Timeout(String),
-}
-
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -480,7 +486,7 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub(crate) async fn get(
+    pub async fn get(
        &self,
        key: Key,
        lsn: Lsn,
@@ -490,11 +496,6 @@ impl Timeline {
            return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
        }

-        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
-        // already checked the key against the shard_identity when looking up the Timeline from
-        // page_service.
-        debug_assert!(!self.shard_identity.is_key_disposable(&key));
-
        // XXX: structured stats collection for layer eviction here.
        trace!(
            "get page request for {}@{} from task kind {:?}",
@@ -628,28 +629,24 @@ impl Timeline {
    /// You should call this before any of the other get_* or list_* functions. Calling
    /// those functions with an LSN that has been processed yet is an error.
    ///
-    pub(crate) async fn wait_lsn(
+    pub async fn wait_lsn(
        &self,
        lsn: Lsn,
        _ctx: &RequestContext, /* Prepare for use by cancellation */
-    ) -> Result<(), WaitLsnError> {
-        if self.cancel.is_cancelled() {
-            return Err(WaitLsnError::Shutdown);
-        } else if !self.is_active() {
-            return Err(WaitLsnError::BadState);
-        }
+    ) -> anyhow::Result<()> {
+        anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");

        // This should never be called from the WAL receiver, because that could lead
        // to a deadlock.
-        debug_assert!(
+        anyhow::ensure!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
            "wait_lsn cannot be called in WAL receiver"
        );
-        debug_assert!(
+        anyhow::ensure!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
            "wait_lsn cannot be called in WAL receiver"
        );
-        debug_assert!(
+        anyhow::ensure!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
            "wait_lsn cannot be called in WAL receiver"
        );
@@ -663,22 +660,18 @@ impl Timeline {
        {
            Ok(()) => Ok(()),
            Err(e) => {
-                use utils::seqwait::SeqWaitError::*;
-                match e {
-                    Shutdown => Err(WaitLsnError::Shutdown),
-                    Timeout => {
-                        // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
-                        drop(_timer);
-                        let walreceiver_status = self.walreceiver_status();
-                        Err(WaitLsnError::Timeout(format!(
+                // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
+                drop(_timer);
+                let walreceiver_status = self.walreceiver_status();
+                Err(anyhow::Error::new(e).context({
+                    format!(
                        "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
                        lsn,
                        self.get_last_record_lsn(),
                        self.get_disk_consistent_lsn(),
                        walreceiver_status,
-                    )))
-                    }
-                }
+                    )
+                }))
            }
        }
    }
@@ -1466,7 +1459,6 @@ impl Timeline {
                max_lsn_wal_lag,
                auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
                availability_zone: self.conf.availability_zone.clone(),
-                ingest_batch_size: self.conf.ingest_batch_size,
            },
            broker_client,
            ctx,
@@ -2231,13 +2223,13 @@ impl Timeline {
                    return Err(layer_traversal_error(
                        if cfg!(test) {
                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
+                                "could not find data for key {} at LSN {}, for request at LSN {}\n{}",
+                                key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
                            )
                        } else {
                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
+                                "could not find data for key {} at LSN {}, for request at LSN {}",
+                                key, cont_lsn, request_lsn
                            )
                        },
                        traversal_path,
@@ -2297,12 +2289,11 @@ impl Timeline {
                ancestor
                    .wait_lsn(timeline.ancestor_lsn, ctx)
                    .await
-                    .map_err(|e| match e {
-                        e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
-                        WaitLsnError::Shutdown => PageReconstructError::Cancelled,
-                        e @ WaitLsnError::BadState => {
-                            PageReconstructError::Other(anyhow::anyhow!(e))
-                        }
+                    .with_context(|| {
+                        format!(
+                            "wait for lsn {} on ancestor timeline_id={}",
+                            timeline.ancestor_lsn, ancestor.timeline_id
+                        )
                    })?;

                timeline_owned = ancestor;
@@ -2480,27 +2471,9 @@ impl Timeline {
        Ok(())
    }

-    async fn put_values(
-        &self,
-        values: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        // Pick the first LSN in the batch to get the layer to write to.
-        for lsns in values.values() {
-            if let Some((lsn, _)) = lsns.first() {
-                let layer = self.get_layer_for_write(*lsn).await?;
-                layer.put_values(values, ctx).await?;
-                break;
-            }
-        }
-        Ok(())
-    }
-
-    async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        if let Some((_, lsn)) = tombstones.first() {
-            let layer = self.get_layer_for_write(*lsn).await?;
-            layer.put_tombstones(tombstones).await?;
-        }
+    async fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
+        let layer = self.get_layer_for_write(lsn).await?;
+        layer.put_tombstone(key_range, lsn).await?;
        Ok(())
    }

@@ -3062,15 +3035,6 @@ impl Timeline {
                for range in &partition.ranges {
                    let mut key = range.start;
                    while key < range.end {
-                        if self.shard_identity.is_key_disposable(&key) {
-                            debug!(
-                                "Dropping key {} during compaction (it belongs on shard {:?})",
-                                key,
-                                self.shard_identity.get_shard_number(&key)
-                            );
-                            key = key.next();
-                            continue;
-                        }
                        let img = match self.get(key, lsn, ctx).await {
                            Ok(img) => img,
                            Err(err) => {
@@ -3097,7 +3061,6 @@ impl Timeline {
                                }
                            }
                        };
-
                        image_layer_writer.put_image(key, &img).await?;
                        key = key.next();
                    }
@@ -3668,15 +3631,7 @@ impl Timeline {
                )))
            });

-            if !self.shard_identity.is_key_disposable(&key) {
-                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
-            } else {
-                debug!(
-                    "Dropping key {} during compaction (it belongs on shard {:?})",
-                    key,
-                    self.shard_identity.get_shard_number(&key)
-                );
-            }
+            writer.as_mut().unwrap().put_value(key, lsn, value).await?;

            if !new_layers.is_empty() {
                fail_point!("after-timeline-compacted-first-L1");
@@ -4231,7 +4186,7 @@ impl Timeline {
                    .context("Failed to reconstruct a page image:")
                {
                    Ok(img) => img,
-                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
+                    Err(e) => return Err(PageReconstructError::from(e)),
                };

                if img.len() == page_cache::PAGE_SZ {
@@ -4574,16 +4529,8 @@ impl<'a> TimelineWriter<'a> {
        self.tl.put_value(key, lsn, value, ctx).await
    }

-    pub(crate) async fn put_batch(
-        &self,
-        batch: &HashMap<Key, Vec<(Lsn, Value)>>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        self.tl.put_values(batch, ctx).await
-    }
-
-    pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
-        self.tl.put_tombstones(batch).await
+    pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
+        self.tl.put_tombstone(key_range, lsn).await
    }

    /// Track the end of the latest digested WAL record.
@@ -4594,11 +4541,11 @@ impl<'a> TimelineWriter<'a> {
    /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
    /// the 'lsn' or anything older. The previous last record LSN is stored alongside
    /// the latest and can be read.
-    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
+    pub fn finish_write(&self, new_lsn: Lsn) {
        self.tl.finish_write(new_lsn);
    }

-    pub(crate) fn update_current_logical_size(&self, delta: i64) {
+    pub fn update_current_logical_size(&self, delta: i64) {
        self.tl.update_current_logical_size(delta)
    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -58,7 +58,6 @@ pub struct WalReceiverConf {
    pub max_lsn_wal_lag: NonZeroU64,
    pub auth_token: Option<Arc<String>>,
    pub availability_zone: Option<String>,
-    pub ingest_batch_size: u64,
 }

 pub struct WalReceiver {
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -411,7 +411,6 @@ impl ConnectionManagerState {

        let node_id = new_sk.safekeeper_id;
        let connect_timeout = self.conf.wal_connect_timeout;
-        let ingest_batch_size = self.conf.ingest_batch_size;
        let timeline = Arc::clone(&self.timeline);
        let ctx = ctx.detached_child(
            TaskKind::WalReceiverConnectionHandler,
@@ -431,7 +430,6 @@ impl ConnectionManagerState {
                    connect_timeout,
                    ctx,
                    node_id,
-                    ingest_batch_size,
                )
                .await;

@@ -1347,7 +1345,6 @@ mod tests {
                max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
                auth_token: None,
                availability_zone: None,
-                ingest_batch_size: 1,
            },
            wal_connection: None,
            wal_stream_candidates: HashMap::new(),
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
 use super::TaskStateUpdate;
 use crate::{
    context::RequestContext,
-    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
+    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
    task_mgr,
    task_mgr::TaskKind,
    task_mgr::WALRECEIVER_RUNTIME,
@@ -106,7 +106,6 @@ impl From<WalDecodeError> for WalReceiverError {

 /// Open a connection to the given safekeeper and receive WAL, sending back progress
 /// messages as we go.
-#[allow(clippy::too_many_arguments)]
 pub(super) async fn handle_walreceiver_connection(
    timeline: Arc<Timeline>,
    wal_source_connconf: PgConnectionConfig,
@@ -115,7 +114,6 @@ pub(super) async fn handle_walreceiver_connection(
    connect_timeout: Duration,
    ctx: RequestContext,
    node: NodeId,
-    ingest_batch_size: u64,
 ) -> Result<(), WalReceiverError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

@@ -307,9 +305,7 @@ pub(super) async fn handle_walreceiver_connection(

                {
                    let mut decoded = DecodedWALRecord::default();
-                    let mut modification = timeline.begin_modification(startlsn);
-                    let mut uncommitted_records = 0;
-                    let mut filtered_records = 0;
+                    let mut modification = timeline.begin_modification(endlsn);
                    while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
@@ -318,40 +314,14 @@ pub(super) async fn handle_walreceiver_connection(
                            return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
                        }

-                        // Ingest the records without immediately committing them.
-                        let ingested = walingest
+                        walingest
                            .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
                            .await
                            .with_context(|| format!("could not ingest record at {lsn}"))?;
-                        if !ingested {
-                            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
-                            WAL_INGEST.records_filtered.inc();
-                            filtered_records += 1;
-                        }

                        fail_point!("walreceiver-after-ingest");

                        last_rec_lsn = lsn;
-
-                        // Commit every ingest_batch_size records. Even if we filtered out
-                        // all records, we still need to call commit to advance the LSN.
-                        uncommitted_records += 1;
-                        if uncommitted_records >= ingest_batch_size {
-                            WAL_INGEST
-                                .records_committed
-                                .inc_by(uncommitted_records - filtered_records);
-                            modification.commit(&ctx).await?;
-                            uncommitted_records = 0;
-                            filtered_records = 0;
-                        }
-                    }
-
-                    // Commit the remaining records.
-                    if uncommitted_records > 0 {
-                        WAL_INGEST
-                            .records_committed
-                            .inc_by(uncommitted_records - filtered_records);
-                        modification.commit(&ctx).await?;
                    }
                }

--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,7 +22,6 @@ use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
-use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use std::collections::VecDeque;
 use std::io;
@@ -36,11 +35,14 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
-use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};
+use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

 #[cfg(feature = "testing")]
 use std::sync::atomic::{AtomicUsize, Ordering};

+#[cfg(feature = "testing")]
+use pageserver_api::shard::TenantShardId;
+
 use crate::config::PageServerConf;
 use crate::metrics::{
    WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
@@ -90,7 +92,7 @@ struct ProcessOutput {
 /// records.
 ///
 pub struct PostgresRedoManager {
-    tenant_shard_id: TenantShardId,
+    tenant_id: TenantId,
    conf: &'static PageServerConf,
    last_redo_at: std::sync::Mutex<Option<Instant>>,
    redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
@@ -184,13 +186,10 @@ impl PostgresRedoManager {
    ///
    /// Create a new PostgresRedoManager.
    ///
-    pub fn new(
-        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
-    ) -> PostgresRedoManager {
+    pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager {
        // The actual process is launched lazily, on first request.
        PostgresRedoManager {
-            tenant_shard_id,
+            tenant_id,
            conf,
            last_redo_at: std::sync::Mutex::default(),
            redo_process: RwLock::new(None),
@@ -245,12 +244,8 @@ impl PostgresRedoManager {
                                let timer =
                                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
                                let proc = Arc::new(
-                                    WalRedoProcess::launch(
-                                        self.conf,
-                                        self.tenant_shard_id,
-                                        pg_version,
-                                    )
-                                    .context("launch walredo process")?,
+                                    WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
+                                        .context("launch walredo process")?,
                                );
                                timer.observe_duration();
                                *proc_guard = Some(Arc::clone(&proc));
@@ -414,11 +409,7 @@ impl PostgresRedoManager {
                    "ClearVisibilityMapFlags record on unexpected rel {}",
                    rel
                );
-
-                // Helper function to clear the VM bit corresponding to 'heap_blkno'.
-                // (The logic is similar to the guts of the visibilitymap_clear() function
-                // in PostgreSQL, after it has locked the right VM page.)
-                let mut visibilitymap_clear = |heap_blkno| {
+                if let Some(heap_blkno) = *new_heap_blkno {
                    // Calculate the VM block and offset that corresponds to the heap block.
                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
@@ -431,12 +422,19 @@ impl PostgresRedoManager {
                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                    map[map_byte as usize] &= !(flags << map_offset);
-                };
-                if let Some(heap_blkno) = *new_heap_blkno {
-                    visibilitymap_clear(heap_blkno);
                }
+
+                // Repeat for 'old_heap_blkno', if any
                if let Some(heap_blkno) = *old_heap_blkno {
-                    visibilitymap_clear(heap_blkno);
+                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
+                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
+                    let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
+
+                    assert!(map_block == blknum);
+
+                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
+
+                    map[map_byte as usize] &= !(flags << map_offset);
                }
            }
            // Non-relational WAL records are handled here, with custom code that has the
@@ -640,7 +638,7 @@ impl<C: CommandExt> CloseFileDescriptors for C {
 struct WalRedoProcess {
    #[allow(dead_code)]
    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
+    tenant_id: TenantId,
    // Some() on construction, only becomes None on Drop.
    child: Option<NoLeakChild>,
    stdout: Mutex<ProcessOutput>,
@@ -654,10 +652,10 @@ impl WalRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
-    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
+    #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
    fn launch(
        conf: &'static PageServerConf,
-        tenant_shard_id: TenantShardId,
+        tenant_id: TenantId,
        pg_version: u32,
    ) -> anyhow::Result<Self> {
        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
@@ -682,7 +680,7 @@ impl WalRedoProcess {
            // as close-on-exec by default, but that's not enough, since we use
            // libraries that directly call libc open without setting that flag.
            .close_fds()
-            .spawn_no_leak_child(tenant_shard_id)
+            .spawn_no_leak_child(tenant_id)
            .context("spawn process")?;
        WAL_REDO_PROCESS_COUNTERS.started.inc();
        let mut child = scopeguard::guard(child, |child| {
@@ -743,12 +741,12 @@ impl WalRedoProcess {
                        error!(error=?e, "failed to read from walredo stderr");
                    }
                }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
        );

        Ok(Self {
            conf,
-            tenant_shard_id,
+            tenant_id,
            child: Some(child),
            stdin: Mutex::new(ProcessInput {
                stdin,
@@ -774,7 +772,7 @@ impl WalRedoProcess {
    // Apply given WAL records ('records') over an old page image. Returns
    // new page image.
    //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))]
    fn apply_wal_records(
        &self,
        tag: BufferTag,
@@ -968,7 +966,11 @@ impl WalRedoProcess {
        // these files will be collected to an allure report
        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());

-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+        // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
+        let path = self
+            .conf
+            .tenant_path(&TenantShardId::unsharded(self.tenant_id))
+            .join(&filename);

        let res = std::fs::OpenOptions::new()
            .write(true)
@@ -1002,7 +1004,7 @@ impl Drop for WalRedoProcess {
 /// Wrapper type around `std::process::Child` which guarantees that the child
 /// will be killed and waited-for by this process before being dropped.
 struct NoLeakChild {
-    tenant_id: TenantShardId,
+    tenant_id: TenantId,
    child: Option<Child>,
 }

@@ -1021,7 +1023,7 @@ impl DerefMut for NoLeakChild {
 }

 impl NoLeakChild {
-    fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
+    fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result<Self> {
        let child = command.spawn()?;
        Ok(NoLeakChild {
            tenant_id,
@@ -1076,7 +1078,7 @@ impl Drop for NoLeakChild {
            Some(child) => child,
            None => return,
        };
-        let tenant_shard_id = self.tenant_id;
+        let tenant_id = self.tenant_id;
        // Offload the kill+wait of the child process into the background.
        // If someone stops the runtime, we'll leak the child process.
        // We can ignore that case because we only stop the runtime on pageserver exit.
@@ -1084,11 +1086,7 @@ impl Drop for NoLeakChild {
            tokio::task::spawn_blocking(move || {
                // Intentionally don't inherit the tracing context from whoever is dropping us.
                // This thread here is going to outlive of our dropper.
-                let span = tracing::info_span!(
-                    "walredo",
-                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard_id = %tenant_shard_id.shard_slug()
-                );
+                let span = tracing::info_span!("walredo", %tenant_id);
                let _entered = span.enter();
                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
            })
@@ -1098,11 +1096,11 @@ impl Drop for NoLeakChild {
 }

 trait NoLeakChildCommandExt {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild>;
 }

 impl NoLeakChildCommandExt for Command {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild> {
        NoLeakChild::spawn(tenant_id, self)
    }
 }
@@ -1157,7 +1155,6 @@ mod tests {
    use crate::repository::Key;
    use crate::{config::PageServerConf, walrecord::NeonWalRecord};
    use bytes::Bytes;
-    use pageserver_api::shard::TenantShardId;
    use std::str::FromStr;
    use utils::{id::TenantId, lsn::Lsn};

@@ -1267,9 +1264,9 @@ mod tests {
            let repo_dir = camino_tempfile::tempdir()?;
            let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
            let conf = Box::leak(Box::new(conf));
-            let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
+            let tenant_id = TenantId::generate();

-            let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+            let manager = PostgresRedoManager::new(conf, tenant_id);

            Ok(RedoHarness {
                _repo_dir: repo_dir,
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -9,7 +9,6 @@ OBJS = \
 	libpagestore.o \
 	neon.o \
 	neon_utils.o \
-	neon_walreader.o \
 	pagestore_smgr.o \
 	relsize_cache.o \
 	walproposer.o \
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -35,8 +35,7 @@

 #define PageStoreTrace DEBUG5

-#define MIN_RECONNECT_INTERVAL_USEC 1000
-#define MAX_RECONNECT_INTERVAL_USEC 1000000
+#define RECONNECT_INTERVAL_USEC 1000000

 bool		connected = false;
 PGconn	   *pageserver_conn = NULL;
@@ -134,11 +133,6 @@ pageserver_connect(int elevel)
 	const char *values[3];
 	int			n;

-	static TimestampTz last_connect_time = 0;
-	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
-	TimestampTz now;
-        uint64_t us_since_last_connect;
-
 	Assert(!connected);

 	if (CheckConnstringUpdated())
@@ -146,22 +140,6 @@ pageserver_connect(int elevel)
 		ReloadConnstring();
 	}

-	now = GetCurrentTimestamp();
-        us_since_last_connect = now - last_connect_time;
-	if (us_since_last_connect < delay_us)
-	{
-		pg_usleep(delay_us - us_since_last_connect);
-		delay_us *= 2;
-		if (delay_us > MAX_RECONNECT_INTERVAL_USEC)
-			delay_us = MAX_RECONNECT_INTERVAL_USEC;
-		last_connect_time = GetCurrentTimestamp();
-	}
-	else
-	{
-		delay_us = MIN_RECONNECT_INTERVAL_USEC;
-		last_connect_time = now;
-	}
-
 	/*
 	 * Connect using the connection string we got from the
 	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -355,6 +333,7 @@ pageserver_send(NeonRequest *request)
 		{
 			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
+			pg_usleep(RECONNECT_INTERVAL_USEC);
 		}
 		n_reconnect_attempts = 0;
 	}
--- a/pgxn/neon/libpqwalproposer.h
+++ b/pgxn/neon/libpqwalproposer.h
@@ -1,96 +0,0 @@
-/*
- * Interface to set of libpq wrappers walproposer and neon_walreader need.
- * Similar to libpqwalreceiver, but it has blocking connection establishment and
- * pqexec which don't fit us. Implementation is at walproposer_pg.c.
- */
-#ifndef ___LIBPQWALPROPOSER_H__
-#define ___LIBPQWALPROPOSER_H__
-
-/* Re-exported and modified ExecStatusType */
-typedef enum
-{
-	/* We received a single CopyBoth result */
-	WP_EXEC_SUCCESS_COPYBOTH,
-
-	/*
-	 * Any success result other than a single CopyBoth was received. The
-	 * specifics of the result were already logged, but it may be useful to
-	 * provide an error message indicating which safekeeper messed up.
-	 *
-	 * Do not expect PQerrorMessage to be appropriately set.
-	 */
-	WP_EXEC_UNEXPECTED_SUCCESS,
-
-	/*
-	 * No result available at this time. Wait until read-ready, then call
-	 * again. Internally, this is returned when PQisBusy indicates that
-	 * PQgetResult would block.
-	 */
-	WP_EXEC_NEEDS_INPUT,
-	/* Catch-all failure. Check PQerrorMessage. */
-	WP_EXEC_FAILED,
-} WalProposerExecStatusType;
-
-/* Possible return values from walprop_async_read */
-typedef enum
-{
-	/* The full read was successful. buf now points to the data */
-	PG_ASYNC_READ_SUCCESS,
-
-	/*
-	 * The read is ongoing. Wait until the connection is read-ready, then try
-	 * again.
-	 */
-	PG_ASYNC_READ_TRY_AGAIN,
-	/* Reading failed. Check PQerrorMessage(conn) */
-	PG_ASYNC_READ_FAIL,
-} PGAsyncReadResult;
-
-/* Possible return values from walprop_async_write */
-typedef enum
-{
-	/* The write fully completed */
-	PG_ASYNC_WRITE_SUCCESS,
-
-	/*
-	 * The write started, but you'll need to call PQflush some more times to
-	 * finish it off. We just tried, so it's best to wait until the connection
-	 * is read- or write-ready to try again.
-	 *
-	 * If it becomes read-ready, call PQconsumeInput and flush again. If it
-	 * becomes write-ready, just call PQflush.
-	 */
-	PG_ASYNC_WRITE_TRY_FLUSH,
-	/* Writing failed. Check PQerrorMessage(conn) */
-	PG_ASYNC_WRITE_FAIL,
-} PGAsyncWriteResult;
-
-/*
- * This header is included by walproposer.h to define walproposer_api; if we're
- * building walproposer without pg, ignore libpq part, leaving only interface
- * types.
- */
-#ifndef WALPROPOSER_LIB
-
-#include "libpq-fe.h"
-
-/*
- * Sometimes working directly with underlying PGconn is simpler, export the
- * whole thing for simplicity.
- */
-typedef struct WalProposerConn
-{
-	PGconn	   *pg_conn;
-	bool		is_nonblocking; /* whether the connection is non-blocking */
-	char	   *recvbuf;		/* last received CopyData message from
-								 * walprop_async_read */
-} WalProposerConn;
-
-extern WalProposerConn *libpqwp_connect_start(char *conninfo);
-extern bool libpqwp_send_query(WalProposerConn *conn, char *query);
-extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn);
-extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount);
-extern void libpqwp_disconnect(WalProposerConn *conn);
-
-#endif							/* WALPROPOSER_LIB */
-#endif							/* ___LIBPQWALPROPOSER_H__ */
--- a/pgxn/neon/neon_walreader.c
+++ b/pgxn/neon/neon_walreader.c
@@ -1,742 +0,0 @@
-/*
- * Like WALRead, but when WAL segment doesn't exist locally instead of throwing
- * ERROR asynchronously tries to fetch it from the most advanced safekeeper.
- *
- * We can't use libpqwalreceiver as it blocks during connection establishment
- * (and waiting for PQExec result), so use libpqwalproposer instead.
- *
- * TODO: keepalives are currently never sent, so the other side can close the
- * connection prematurely.
- *
- * TODO: close conn if reading takes too long to prevent stuck connections.
- */
-#include "postgres.h"
-
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include "access/xlog_internal.h"
-#include "access/xlogdefs.h"
-#include "access/xlogreader.h"
-#include "libpq/pqformat.h"
-#include "storage/fd.h"
-#include "utils/wait_event.h"
-
-#include "libpq-fe.h"
-
-#include "neon_walreader.h"
-#include "walproposer.h"
-
-#define NEON_WALREADER_ERR_MSG_LEN 512
-
-/*
- * Can be called where NeonWALReader *state is available in the context, adds log_prefix.
- */
-#define nwr_log(elevel, fmt, ...) elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__)
-
-static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
-static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state);
-static void NeonWALReaderResetRemote(NeonWALReader *state);
-static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
-static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
-static void neon_wal_segment_close(NeonWALReader *state);
-static bool is_wal_segment_exists(XLogSegNo segno, int segsize,
-								  TimeLineID tli);
-
-/*
- * State of connection to donor safekeeper.
- */
-typedef enum
-{
-	/* no remote connection */
-	RS_NONE,
-	/* doing PQconnectPoll, need readable socket */
-	RS_CONNECTING_READ,
-	/* doing PQconnectPoll, need writable socket */
-	RS_CONNECTING_WRITE,
-	/* Waiting for START_REPLICATION result */
-	RS_WAIT_EXEC_RESULT,
-	/* replication stream established */
-	RS_ESTABLISHED,
-} NeonWALReaderRemoteState;
-
-struct NeonWALReader
-{
-	/*
-	 * LSN before which we assume WAL is not available locally. Exists because
-	 * though first segment after startup always exists, part before
-	 * basebackup LSN is filled with zeros.
-	 */
-	XLogRecPtr	available_lsn;
-	WALSegmentContext segcxt;
-	WALOpenSegment seg;
-	int			wre_errno;
-	/* Explains failure to read, static for simplicity. */
-	char		err_msg[NEON_WALREADER_ERR_MSG_LEN];
-
-	/*
-	 * Saved info about request in progress, used to check validity of
-	 * arguments after resume and remember how far we accomplished it. req_lsn
-	 * is 0 if there is no request in progress.
-	 */
-	XLogRecPtr	req_lsn;
-	Size		req_len;
-	Size		req_progress;
-	WalProposer *wp;			/* we learn donor through walproposer */
-	char		donor_name[64]; /* saved donor safekeeper name for logging */
-	/* state of connection to safekeeper */
-	NeonWALReaderRemoteState rem_state;
-	WalProposerConn *wp_conn;
-
-	/*
-	 * position in wp_conn recvbuf from which we'll copy WAL next time, or
-	 * NULL if there is no unprocessed message
-	 */
-	char	   *wal_ptr;
-	Size		wal_rem_len;	/* how many unprocessed bytes left in recvbuf */
-
-	/*
-	 * LSN of wal_ptr position according to walsender to cross check against
-	 * read request
-	 */
-	XLogRecPtr	rem_lsn;
-
-	/* prepended to lines logged by neon_walreader, if provided */
-	char		log_prefix[64];
-};
-
-/* palloc and initialize NeonWALReader */
-NeonWALReader *
-NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix)
-{
-	NeonWALReader *reader;
-
-	reader = (NeonWALReader *)
-		palloc_extended(sizeof(NeonWALReader),
-						MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
-	if (!reader)
-		return NULL;
-
-	reader->available_lsn = available_lsn;
-	reader->seg.ws_file = -1;
-	reader->seg.ws_segno = 0;
-	reader->seg.ws_tli = 0;
-	reader->segcxt.ws_segsize = wal_segment_size;
-
-	reader->wp = wp;
-
-	reader->rem_state = RS_NONE;
-
-	if (log_prefix)
-		strlcpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix));
-
-	return reader;
-}
-
-void
-NeonWALReaderFree(NeonWALReader *state)
-{
-	if (state->seg.ws_file != -1)
-		neon_wal_segment_close(state);
-	if (state->wp_conn)
-		libpqwp_disconnect(state->wp_conn);
-	pfree(state);
-}
-
-/*
- * Like vanilla WALRead, but if requested position is before available_lsn or
- * WAL segment doesn't exist on disk, it tries to fetch needed segment from the
- * advanced safekeeper.
- *
- * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL
- * fetched from timeline 'tli'.
- *
- * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error
- * occurs, in which case 'err' has the desciption. Error always closes remote
- * connection, if there was any, so socket subscription should be removed.
- *
- * NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with
- * NeonWALReaderSocket and call NeonWALRead again with exactly the same
- * arguments when NeonWALReaderEvents happen on the socket. Note that per libpq
- * docs during connection establishment (before first successful read) socket
- * underneath might change.
- *
- * Also, eventually walreader should switch from remote to local read; caller
- * should remove subscription to socket then by checking NeonWALReaderEvents
- * after successful read (otherwise next read might reopen the connection with
- * different socket).
- *
- * Reading not monotonically is not supported and will result in error.
- *
- * Caller should be sure that WAL up to requested LSN exists, otherwise
- * NEON_WALREAD_WOULDBLOCK might be always returned.
- */
-NeonWALReadResult
-NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
-{
-	/*
-	 * If requested data is before known available basebackup lsn or there is
-	 * already active remote state, do remote read.
-	 */
-	if (startptr < state->available_lsn || state->rem_state != RS_NONE)
-	{
-		return NeonWALReadRemote(state, buf, startptr, count, tli);
-	}
-	if (NeonWALReadLocal(state, buf, startptr, count, tli))
-	{
-		return NEON_WALREAD_SUCCESS;
-	}
-	else if (state->wre_errno == ENOENT)
-	{
-		nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
-				LSN_FORMAT_ARGS(startptr));
-		return NeonWALReadRemote(state, buf, startptr, count, tli);
-	}
-	else
-	{
-		return NEON_WALREAD_ERROR;
-	}
-}
-
-/* Do the read from remote safekeeper. */
-static NeonWALReadResult
-NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
-{
-	if (state->rem_state == RS_NONE)
-	{
-		XLogRecPtr	donor_lsn;
-
-		/* no connection yet; start one */
-		Safekeeper *donor = GetDonor(state->wp, &donor_lsn);
-
-		if (donor == NULL)
-		{
-			snprintf(state->err_msg, sizeof(state->err_msg),
-					 "failed to establish remote connection to fetch WAL: no donor available");
-			return NEON_WALREAD_ERROR;
-		}
-		snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port);
-		nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL",
-				state->donor_name, LSN_FORMAT_ARGS(donor_lsn));
-		state->wp_conn = libpqwp_connect_start(donor->conninfo);
-		if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
-		{
-			snprintf(state->err_msg, sizeof(state->err_msg),
-					 "failed to connect to %s to fetch WAL: immediately failed with %s",
-					 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
-			NeonWALReaderResetRemote(state);
-			return NEON_WALREAD_ERROR;
-		}
-		/* we'll poll immediately */
-		state->rem_state = RS_CONNECTING_READ;
-	}
-
-	if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE)
-	{
-		switch (PQconnectPoll(state->wp_conn->pg_conn))
-		{
-			case PGRES_POLLING_FAILED:
-				snprintf(state->err_msg, sizeof(state->err_msg),
-						 "failed to connect to %s to fetch WAL: poll error: %s",
-						 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
-				NeonWALReaderResetRemote(state);
-				return NEON_WALREAD_ERROR;
-			case PGRES_POLLING_READING:
-				state->rem_state = RS_CONNECTING_READ;
-				return NEON_WALREAD_WOULDBLOCK;
-			case PGRES_POLLING_WRITING:
-				state->rem_state = RS_CONNECTING_WRITE;
-				return NEON_WALREAD_WOULDBLOCK;
-			case PGRES_POLLING_OK:
-				{
-					/* connection successfully established */
-					char		start_repl_query[128];
-
-					snprintf(start_repl_query, sizeof(start_repl_query),
-							 "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
-							 LSN_FORMAT_ARGS(startptr), state->wp->propTerm);
-					nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
-							state->donor_name, start_repl_query);
-					if (!libpqwp_send_query(state->wp_conn, start_repl_query))
-					{
-						snprintf(state->err_msg, sizeof(state->err_msg),
-								 "failed to send %s query to %s: %s",
-								 start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
-						NeonWALReaderResetRemote(state);
-						return NEON_WALREAD_ERROR;
-					}
-					state->rem_state = RS_WAIT_EXEC_RESULT;
-					break;
-				}
-
-			default:			/* there is unused PGRES_POLLING_ACTIVE */
-				Assert(false);
-				return NEON_WALREAD_ERROR;	/* keep the compiler quiet */
-		}
-	}
-
-	if (state->rem_state == RS_WAIT_EXEC_RESULT)
-	{
-		switch (libpqwp_get_query_result(state->wp_conn))
-		{
-			case WP_EXEC_SUCCESS_COPYBOTH:
-				state->rem_state = RS_ESTABLISHED;
-				break;
-			case WP_EXEC_NEEDS_INPUT:
-				return NEON_WALREAD_WOULDBLOCK;
-			case WP_EXEC_FAILED:
-				snprintf(state->err_msg, sizeof(state->err_msg),
-						 "get START_REPLICATION result from %s failed: %s",
-						 state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
-				NeonWALReaderResetRemote(state);
-				return NEON_WALREAD_ERROR;
-			default:			/* can't happen */
-				snprintf(state->err_msg, sizeof(state->err_msg),
-						 "get START_REPLICATION result from %s: unexpected result",
-						 state->donor_name);
-				NeonWALReaderResetRemote(state);
-				return NEON_WALREAD_ERROR;
-		}
-	}
-
-	Assert(state->rem_state == RS_ESTABLISHED);
-
-	/*
-	 * If we had the request before, verify args are the same and advance the
-	 * result ptr according to the progress; otherwise register the request.
-	 */
-	if (state->req_lsn != InvalidXLogRecPtr)
-	{
-		if (state->req_lsn != startptr || state->req_len != count)
-		{
-			snprintf(state->err_msg, sizeof(state->err_msg),
-					 "args changed during request, was %X/%X %zu, now %X/%X %zu",
-					 LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count);
-			NeonWALReaderResetRemote(state);
-			return NEON_WALREAD_ERROR;
-		}
-		nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu",
-				LSN_FORMAT_ARGS(startptr),
-				count,
-				state->req_progress);
-		buf += state->req_progress;
-	}
-	else
-	{
-		state->req_lsn = startptr;
-		state->req_len = count;
-		state->req_progress = 0;
-		nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu",
-				LSN_FORMAT_ARGS(startptr),
-				count);
-	}
-
-	while (true)
-	{
-		Size		to_copy;
-
-		/*
-		 * If we have no ready data, receive new message.
-		 */
-		if (state->wal_rem_len == 0 &&
-
-		/*
-		 * check for the sake of 0 length reads; walproposer does these for
-		 * heartbeats, though generally they shouldn't hit remote source.
-		 */
-			state->req_len - state->req_progress > 0)
-		{
-			NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state);
-
-			if (read_msg_res != NEON_WALREAD_SUCCESS)
-				return read_msg_res;
-		}
-
-		if (state->req_lsn + state->req_progress != state->rem_lsn)
-		{
-			snprintf(state->err_msg, sizeof(state->err_msg),
-					 "expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu",
-					 LSN_FORMAT_ARGS(state->req_lsn + state->req_progress),
-					 LSN_FORMAT_ARGS(state->rem_lsn),
-					 LSN_FORMAT_ARGS(state->req_lsn),
-					 state->req_len);
-			NeonWALReaderResetRemote(state);
-			return NEON_WALREAD_ERROR;
-		}
-
-		/* We can copy min of (available, requested) bytes. */
-		to_copy =
-			Min(state->req_len - state->req_progress, state->wal_rem_len);
-		memcpy(buf, state->wal_ptr, to_copy);
-		state->wal_ptr += to_copy;
-		state->wal_rem_len -= to_copy;
-		state->rem_lsn += to_copy;
-		if (state->wal_rem_len == 0)
-			state->wal_ptr = NULL;	/* freed by libpqwalproposer */
-		buf += to_copy;
-		state->req_progress += to_copy;
-		if (state->req_progress == state->req_len)
-		{
-			XLogSegNo	next_segno;
-			XLogSegNo	req_segno;
-
-			XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize);
-			XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize);
-
-			/*
-			 * Request completed. If there is a chance of serving next one
-			 * locally, close the connection.
-			 */
-			if (state->req_lsn < state->available_lsn &&
-				state->rem_lsn >= state->available_lsn)
-			{
-				nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally",
-						LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn));
-				NeonWALReaderResetRemote(state);
-			}
-			else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno &&
-					 is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli))
-			{
-				nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists",
-						LSN_FORMAT_ARGS(state->rem_lsn));
-				NeonWALReaderResetRemote(state);
-			}
-			state->req_lsn = InvalidXLogRecPtr;
-			state->req_len = 0;
-			state->req_progress = 0;
-			return NEON_WALREAD_SUCCESS;
-		}
-	}
-}
-
-/*
- * Read one WAL message from the stream, sets state->wal_ptr in case of success.
- * Resets remote state in case of failure.
- */
-static NeonWALReadResult
-NeonWALReaderReadMsg(NeonWALReader *state)
-{
-	while (true)				/* loop until we get 'w' */
-	{
-		char	   *copydata_ptr;
-		int			copydata_size;
-		StringInfoData s;
-		char		msg_type;
-		int			hdrlen;
-
-		Assert(state->rem_state == RS_ESTABLISHED);
-		Assert(state->wal_ptr == NULL && state->wal_rem_len == 0);
-
-		switch (libpqwp_async_read(state->wp_conn,
-								   &copydata_ptr,
-								   &copydata_size))
-		{
-			case PG_ASYNC_READ_SUCCESS:
-				break;
-			case PG_ASYNC_READ_TRY_AGAIN:
-				return NEON_WALREAD_WOULDBLOCK;
-			case PG_ASYNC_READ_FAIL:
-				snprintf(state->err_msg,
-						 sizeof(state->err_msg),
-						 "req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s",
-						 LSN_FORMAT_ARGS(state->req_lsn),
-						 state->req_len,
-						 state->req_progress,
-						 PQerrorMessage(state->wp_conn->pg_conn));
-				goto err;
-		}
-
-		/* put data on StringInfo to parse */
-		s.data = copydata_ptr;
-		s.len = copydata_size;
-		s.cursor = 0;
-		s.maxlen = -1;
-
-		if (copydata_size == 0)
-		{
-			snprintf(state->err_msg,
-					 sizeof(state->err_msg),
-					 "zero length copydata received");
-			goto err;
-		}
-		msg_type = pq_getmsgbyte(&s);
-		switch (msg_type)
-		{
-			case 'w':
-				{
-					XLogRecPtr	start_lsn;
-
-					hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64);
-					if (s.len - s.cursor < hdrlen)
-					{
-						snprintf(state->err_msg,
-								 sizeof(state->err_msg),
-								 "invalid WAL message received from primary");
-						goto err;
-					}
-
-					start_lsn = pq_getmsgint64(&s);
-					pq_getmsgint64(&s); /* XLogRecPtr	end_lsn; */
-					pq_getmsgint64(&s); /* TimestampTz send_time */
-
-					state->rem_lsn = start_lsn;
-					state->wal_rem_len = (Size) (s.len - s.cursor);
-					state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor);
-					nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu",
-							LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len);
-
-					return NEON_WALREAD_SUCCESS;
-				}
-			case 'k':
-				{
-					XLogRecPtr	end_lsn;
-					bool		reply_requested;
-
-					hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char);
-					if (s.len - s.cursor < hdrlen)
-					{
-						snprintf(state->err_msg, sizeof(state->err_msg),
-								 "invalid keepalive message received from primary");
-						goto err;
-					}
-
-					end_lsn = pq_getmsgint64(&s);
-					pq_getmsgint64(&s); /* TimestampTz timestamp; */
-					reply_requested = pq_getmsgbyte(&s);
-					nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d",
-							LSN_FORMAT_ARGS(end_lsn),
-							reply_requested);
-					if (end_lsn < state->req_lsn + state->req_len)
-					{
-						snprintf(state->err_msg, sizeof(state->err_msg),
-								 "closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X",
-								 LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn));
-						goto err;
-					}
-					continue;
-				}
-			default:
-				nwr_log(WARNING, "invalid replication message type %d", msg_type);
-				continue;
-		}
-	}
-err:
-	NeonWALReaderResetRemote(state);
-	return NEON_WALREAD_ERROR;
-}
-
-/* reset remote connection and request in progress */
-static void
-NeonWALReaderResetRemote(NeonWALReader *state)
-{
-	state->req_lsn = InvalidXLogRecPtr;
-	state->req_len = 0;
-	state->req_progress = 0;
-	state->rem_state = RS_NONE;
-	if (state->wp_conn)
-	{
-		libpqwp_disconnect(state->wp_conn);
-		state->wp_conn = NULL;
-	}
-	state->donor_name[0] = '\0';
-	state->wal_ptr = NULL;
-	state->wal_rem_len = 0;
-	state->rem_lsn = InvalidXLogRecPtr;
-}
-
-/*
- * Return socket of connection to remote source. Must be called only when
- * connection exists (NeonWALReaderEvents returns non zero).
- */
-pgsocket
-NeonWALReaderSocket(NeonWALReader *state)
-{
-	if (!state->wp_conn)
-		nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection");
-	return PQsocket(state->wp_conn->pg_conn);
-}
-
-/*
- * Whether remote connection is established. Once this is done, until successful
- * local read or error socket is stable and user can update socket events
- * instead of readding it each time.
- */
-bool
-NeonWALReaderIsRemConnEstablished(NeonWALReader *state)
-{
-	return state->rem_state == RS_ESTABLISHED;
-}
-
-/*
- * Returns events user should wait on connection socket or 0 if remote
- * connection is not active.
- */
-extern uint32
-NeonWALReaderEvents(NeonWALReader *state)
-{
-	switch (state->rem_state)
-	{
-		case RS_NONE:
-			return 0;
-		case RS_CONNECTING_READ:
-			return WL_SOCKET_READABLE;
-		case RS_CONNECTING_WRITE:
-			return WL_SOCKET_WRITEABLE;
-		case RS_WAIT_EXEC_RESULT:
-		case RS_ESTABLISHED:
-			return WL_SOCKET_READABLE;
-		default:
-			Assert(false);
-			return 0;			/* make compiler happy */
-	}
-}
-
-static bool
-NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
-{
-	char	   *p;
-	XLogRecPtr	recptr;
-	Size		nbytes;
-
-	p = buf;
-	recptr = startptr;
-	nbytes = count;
-
-	while (nbytes > 0)
-	{
-		uint32		startoff;
-		int			segbytes;
-		int			readbytes;
-
-		startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
-
-		/*
-		 * If the data we want is not in a segment we have open, close what we
-		 * have (if anything) and open the next one, using the caller's
-		 * provided openSegment callback.
-		 */
-		if (state->seg.ws_file < 0 ||
-			!XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) ||
-			tli != state->seg.ws_tli)
-		{
-			XLogSegNo	nextSegNo;
-
-			neon_wal_segment_close(state);
-
-			XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize);
-			if (!neon_wal_segment_open(state, nextSegNo, &tli))
-			{
-				char		fname[MAXFNAMELEN];
-
-				state->wre_errno = errno;
-
-				XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize);
-				snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s",
-						 fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno));
-				return false;
-			}
-
-			/* This shouldn't happen -- indicates a bug in segment_open */
-			Assert(state->seg.ws_file >= 0);
-
-			/* Update the current segment info. */
-			state->seg.ws_tli = tli;
-			state->seg.ws_segno = nextSegNo;
-		}
-
-		/* How many bytes are within this segment? */
-		if (nbytes > (state->segcxt.ws_segsize - startoff))
-			segbytes = state->segcxt.ws_segsize - startoff;
-		else
-			segbytes = nbytes;
-
-#ifndef FRONTEND
-		pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
-#endif
-
-		/* Reset errno first; eases reporting non-errno-affecting errors */
-		errno = 0;
-		readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
-
-#ifndef FRONTEND
-		pgstat_report_wait_end();
-#endif
-
-		if (readbytes <= 0)
-		{
-			char		fname[MAXFNAMELEN];
-
-			XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize);
-
-			if (readbytes < 0)
-			{
-				state->wre_errno = errno;
-				snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s",
-						 fname, startoff, strerror(state->wre_errno));
-			}
-			else
-			{
-				snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF",
-						 fname, startoff);
-			}
-			return false;
-		}
-
-		/* Update state for read */
-		recptr += readbytes;
-		nbytes -= readbytes;
-		p += readbytes;
-	}
-
-	return true;
-}
-
-/*
- * Copy of vanilla wal_segment_open, but returns false in case of error instead
- * of ERROR, with errno set.
- *
- * XLogReaderRoutine->segment_open callback for local pg_wal files
- */
-static bool
-neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
-					  TimeLineID *tli_p)
-{
-	TimeLineID	tli = *tli_p;
-	char		path[MAXPGPATH];
-
-	XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
-	nwr_log(DEBUG5, "opening %s", path);
-	state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
-	if (state->seg.ws_file >= 0)
-		return true;
-
-	return false;
-}
-
-static bool
-is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli)
-{
-	struct stat stat_buffer;
-	char		path[MAXPGPATH];
-
-	XLogFilePath(path, tli, segno, segsize);
-	return stat(path, &stat_buffer) == 0;
-}
-
-/* copy of vanilla wal_segment_close with NeonWALReader */
-static void
-neon_wal_segment_close(NeonWALReader *state)
-{
-	if (state->seg.ws_file >= 0)
-	{
-		close(state->seg.ws_file);
-		/* need to check errno? */
-		state->seg.ws_file = -1;
-	}
-}
-
-char *
-NeonWALReaderErrMsg(NeonWALReader *state)
-{
-	return state->err_msg;
-}
--- a/pgxn/neon/neon_walreader.h
+++ b/pgxn/neon/neon_walreader.h
@@ -1,30 +0,0 @@
-#ifndef __NEON_WALREADER_H__
-#define __NEON_WALREADER_H__
-
-#include "access/xlogdefs.h"
-
-/* forward declare so we don't have to expose the struct to the public */
-struct NeonWALReader;
-typedef struct NeonWALReader NeonWALReader;
-
-/* avoid including walproposer.h as it includes us */
-struct WalProposer;
-typedef struct WalProposer WalProposer;
-
-/* NeonWALRead return value */
-typedef enum
-{
-	NEON_WALREAD_SUCCESS,
-	NEON_WALREAD_WOULDBLOCK,
-	NEON_WALREAD_ERROR,
-} NeonWALReadResult;
-
-extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix);
-extern void NeonWALReaderFree(NeonWALReader *state);
-extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
-extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
-extern uint32 NeonWALReaderEvents(NeonWALReader *state);
-extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state);
-extern char *NeonWALReaderErrMsg(NeonWALReader *state);
-
-#endif							/* __NEON_WALREADER_H__ */
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -8,9 +8,6 @@
 #include "replication/walreceiver.h"
 #include "utils/uuid.h"

-#include "libpqwalproposer.h"
-#include "neon_walreader.h"
-
 #define SK_MAGIC 0xCafeCeefu
 #define SK_PROTOCOL_VERSION 2

@@ -23,9 +20,43 @@
 */
 #define WL_NO_EVENTS 0

-struct WalProposerConn;			/* Defined in libpqwalproposer.h */
+struct WalProposerConn;			/* Defined in implementation (walprop_pg.c) */
 typedef struct WalProposerConn WalProposerConn;

+/* Possible return values from ReadPGAsync */
+typedef enum
+{
+	/* The full read was successful. buf now points to the data */
+	PG_ASYNC_READ_SUCCESS,
+
+	/*
+	 * The read is ongoing. Wait until the connection is read-ready, then try
+	 * again.
+	 */
+	PG_ASYNC_READ_TRY_AGAIN,
+	/* Reading failed. Check PQerrorMessage(conn) */
+	PG_ASYNC_READ_FAIL,
+} PGAsyncReadResult;
+
+/* Possible return values from WritePGAsync */
+typedef enum
+{
+	/* The write fully completed */
+	PG_ASYNC_WRITE_SUCCESS,
+
+	/*
+	 * The write started, but you'll need to call PQflush some more times to
+	 * finish it off. We just tried, so it's best to wait until the connection
+	 * is read- or write-ready to try again.
+	 *
+	 * If it becomes read-ready, call PQconsumeInput and flush again. If it
+	 * becomes write-ready, just call PQflush.
+	 */
+	PG_ASYNC_WRITE_TRY_FLUSH,
+	/* Writing failed. Check PQerrorMessage(conn) */
+	PG_ASYNC_WRITE_FAIL,
+} PGAsyncWriteResult;
+
 /*
 * WAL safekeeper state, which is used to wait for some event.
 *
@@ -102,40 +133,6 @@ typedef enum
 	SS_ACTIVE,
 } SafekeeperState;

-/*
- * Sending WAL substates of SS_ACTIVE.
- */
-typedef enum
-{
-	/*
-	 * We are ready to send more WAL, waiting for latch set to learn about
-	 * more WAL becoming available (or just a timeout to send heartbeat).
-	 */
-	SS_ACTIVE_SEND,
-
-	/*
-	 * Polling neon_walreader to receive chunk of WAL (probably remotely) to
-	 * send to this safekeeper.
-	 *
-	 * Note: socket management is done completely inside walproposer_pg for
-	 * simplicity, and thus simulation doesn't test it. Which is fine as
-	 * simulation is mainly aimed at consensus checks, not waiteventset
-	 * management.
-	 *
-	 * Also, while in this state we don't touch safekeeper socket, so in
-	 * theory it might close connection as inactive. This can be addressed if
-	 * needed; however, while fetching WAL we should regularly send it, so the
-	 * problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle
-	 * walreader socket), but similarly shouldn't be a problem.
-	 */
-	SS_ACTIVE_READ_WAL,
-
-	/*
-	 * Waiting for write readiness to flush the socket.
-	 */
-	SS_ACTIVE_FLUSH,
-} SafekeeperActiveState;
-
 /* Consensus logical timestamp. */
 typedef uint64 term_t;

@@ -344,11 +341,12 @@ typedef struct Safekeeper
 	 */
 	XLogRecPtr	startStreamingAt;

+	bool		flushWrite;		/* set to true if we need to call AsyncFlush,*
+								 * to flush pending messages */
 	XLogRecPtr	streamingAt;	/* current streaming position */
 	AppendRequestHeader appendRequest;	/* request for sending to safekeeper */

 	SafekeeperState state;		/* safekeeper state machine state */
-	SafekeeperActiveState active_state;
 	TimestampTz latestMsgReceivedAt;	/* when latest msg is received */
 	AcceptorGreeting greetResponse; /* acceptor greeting */
 	VoteResponse voteResponse;	/* the vote */
@@ -369,27 +367,12 @@ typedef struct Safekeeper
 	/*
 	 * WAL reader, allocated for each safekeeper.
 	 */
-	NeonWALReader *xlogreader;
+	XLogReaderState *xlogreader;

 	/*
 	 * Position in wait event set. Equal to -1 if no event
 	 */
 	int			eventPos;
-
-	/*
-	 * Neon WAL reader position in wait event set, or -1 if no socket. Note
-	 * that event must be removed not only on error/failure, but also on
-	 * successful *local* read, as next read might again be remote, but with
-	 * different socket.
-	 */
-	int			nwrEventPos;
-
-	/*
-	 * Per libpq docs, during connection establishment socket might change,
-	 * remember here if it is stable to avoid readding to the event set if
-	 * possible. Must be reset whenever nwr event is deleted.
-	 */
-	bool		nwrConnEstablished;
 #endif


@@ -418,6 +401,31 @@ typedef enum
 	 */
 } WalProposerConnectPollStatusType;

+/* Re-exported and modified ExecStatusType */
+typedef enum
+{
+	/* We received a single CopyBoth result */
+	WP_EXEC_SUCCESS_COPYBOTH,
+
+	/*
+	 * Any success result other than a single CopyBoth was received. The
+	 * specifics of the result were already logged, but it may be useful to
+	 * provide an error message indicating which safekeeper messed up.
+	 *
+	 * Do not expect PQerrorMessage to be appropriately set.
+	 */
+	WP_EXEC_UNEXPECTED_SUCCESS,
+
+	/*
+	 * No result available at this time. Wait until read-ready, then call
+	 * again. Internally, this is returned when PQisBusy indicates that
+	 * PQgetResult would block.
+	 */
+	WP_EXEC_NEEDS_INPUT,
+	/* Catch-all failure. Check PQerrorMessage. */
+	WP_EXEC_FAILED,
+} WalProposerExecStatusType;
+
 /* Re-exported ConnStatusType */
 typedef enum
 {
@@ -478,7 +486,7 @@ typedef struct walproposer_api
 	/* Flush buffer to the network, aka PQflush. */
 	int			(*conn_flush) (Safekeeper *sk);

-	/* Reset sk state: close pq connection, deallocate xlogreader. */
+	/* Close the connection, aka PQfinish. */
 	void		(*conn_finish) (Safekeeper *sk);

 	/*
@@ -495,20 +503,17 @@ typedef struct walproposer_api
 	/* Blocking CopyData write, aka PQputCopyData + PQflush. */
 	bool		(*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size);

-	/*
-	 * Download WAL before basebackup for logical walsenders from sk, if
-	 * needed
-	 */
-	bool		(*recovery_download) (WalProposer *wp, Safekeeper *sk);
+	/* Download WAL from startpos to endpos and make it available locally. */
+	bool		(*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
+
+	/* Read WAL from disk to buf. */
+	void		(*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);

 	/* Allocate WAL reader. */
 	void		(*wal_reader_allocate) (Safekeeper *sk);

-	/* Read WAL from disk to buf. */
-	NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg);
-
-	/* Returns events to be awaited on WAL reader, if any. */
-	uint32		(*wal_reader_events) (Safekeeper *sk);
+	/* Deallocate event set. */
+	void		(*free_event_set) (WalProposer *wp);

 	/* Initialize event set. */
 	void		(*init_event_set) (WalProposer *wp);
@@ -516,15 +521,9 @@ typedef struct walproposer_api
 	/* Update events for an existing safekeeper connection. */
 	void		(*update_event_set) (Safekeeper *sk, uint32 events);

-	/* Configure wait event set for yield in SS_ACTIVE. */
-	void		(*active_state_update_event_set) (Safekeeper *sk);
-
 	/* Add a new safekeeper connection to the event set. */
 	void		(*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);

-	/* Remove safekeeper connection from event set */
-	void		(*rm_safekeeper_event_set) (Safekeeper *sk);
-
 	/*
 	 * Wait until some event happens: - timeout is reached - socket event for
 	 * safekeeper connection - new WAL is available
@@ -557,12 +556,26 @@ typedef struct walproposer_api
 	 */
 	void		(*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);

+	/*
+	 * Called on peer_horizon_lsn updates. Used to advance replication slot
+	 * and to free up disk space by deleting unnecessary WAL.
+	 */
+	void		(*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn);
+
 	/*
 	 * Write a log message to the internal log processor. This is used only
 	 * when walproposer is compiled as a library. Otherwise, all logging is
 	 * handled by elog().
 	 */
 	void		(*log_internal) (WalProposer *wp, int level, const char *line);
+
+	/*
+	 * Called right after the proposer was elected, but before it started
+	 * recovery and sent ProposerElected message to the safekeepers.
+	 *
+	 * Used by logical replication to update truncateLsn.
+	 */
+	void		(*after_election) (WalProposer *wp);
 } walproposer_api;

 /*
@@ -696,34 +709,15 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt
 extern void WalProposerPoll(WalProposer *wp);
 extern void WalProposerFree(WalProposer *wp);

-/*
- * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to
- * recreate set from scratch, hence the export.
- */
-extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events);
-extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
-

 #define WPEVENT		1337		/* special log level for walproposer internal
 								 * events */

-#define WP_LOG_PREFIX "[WP] "
-
-/*
- * wp_log is used in pure wp code (walproposer.c), allowing API callback to
- * catch logging.
- */
 #ifdef WALPROPOSER_LIB
 extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
-#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__)
+#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
 #else
-#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
+#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
 #endif

-/*
- * And wpg_log is used all other (postgres specific) walproposer code, just
- * adding prefix.
- */
-#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
-
 #endif							/* __NEON_WALPROPOSER_H__ */
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -12,7 +12,6 @@
 #include <unistd.h>
 #include <sys/stat.h>
 #include "access/xact.h"
-#include "access/xlog.h"
 #include "access/xlogdefs.h"
 #include "access/xlogutils.h"
 #include "access/xloginsert.h"
@@ -44,19 +43,14 @@
 #include "utils/ps_status.h"
 #include "utils/timestamp.h"

-#include "libpq-fe.h"
-
-#include "libpqwalproposer.h"
 #include "neon.h"
-#include "neon_walreader.h"
 #include "walproposer.h"
+#include "libpq-fe.h"

 #define XLOG_HDR_SIZE (1 + 8 * 3)	/* 'w' + startPos + walEnd + timestamp */
 #define XLOG_HDR_START_POS 1	/* offset of start position in wal sender*
 								 * message header */

-#define MB ((XLogRecPtr)1024 * 1024)
-
 #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot"

 char	   *wal_acceptors_list = "";
@@ -97,12 +91,6 @@ static void XLogBroadcastWalProposer(WalProposer *wp);
 static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr);
 static void XLogWalPropClose(XLogRecPtr recptr);

-static void add_nwr_event_set(Safekeeper *sk, uint32 events);
-static void update_nwr_event_set(Safekeeper *sk, uint32 events);
-static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
-
-static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp);
-
 static void
 init_walprop_config(bool syncSafekeepers)
 {
@@ -226,6 +214,7 @@ backpressure_lag_impl(void)
 		XLogRecPtr	myFlushLsn = GetFlushRecPtr();
 #endif
 		replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
+#define MB ((XLogRecPtr)1024 * 1024)

 		elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
 			 LSN_FORMAT_ARGS(myFlushLsn),
@@ -424,8 +413,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
 {
 	StartReplicationCmd cmd;

-	wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
-			LSN_FORMAT_ARGS(startpos));
+	elog(LOG, "WAL proposer starts streaming at %X/%X",
+		 LSN_FORMAT_ARGS(startpos));
 	cmd.slotname = WAL_PROPOSER_SLOT_NAME;
 	cmd.timeline = wp->greetRequest.timeline;
 	cmd.startpoint = startpos;
@@ -549,9 +538,17 @@ walprop_pg_load_libpqwalreceiver(void)
 {
 	load_file("libpqwalreceiver", false);
 	if (WalReceiverFunctions == NULL)
-		wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly");
+		elog(ERROR, "libpqwalreceiver didn't initialize correctly");
 }

+/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
+struct WalProposerConn
+{
+	PGconn	   *pg_conn;
+	bool		is_nonblocking; /* whether the connection is non-blocking */
+	char	   *recvbuf;		/* last received data from walprop_async_read */
+};
+
 /* Helper function */
 static bool
 ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
@@ -589,17 +586,16 @@ walprop_status(Safekeeper *sk)
 	}
 }

-WalProposerConn *
-libpqwp_connect_start(char *conninfo)
+static void
+walprop_connect_start(Safekeeper *sk)
 {
-
 	PGconn	   *pg_conn;
-	WalProposerConn *conn;
 	const char *keywords[3];
 	const char *values[3];
 	int			n;
 	char	   *password = neon_auth_token;

+	Assert(sk->conn == NULL);

 	/*
 	 * Connect using the given connection string. If the NEON_AUTH_TOKEN
@@ -618,7 +614,7 @@ libpqwp_connect_start(char *conninfo)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = conninfo;
+	values[n] = sk->conninfo;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
@@ -630,7 +626,7 @@ libpqwp_connect_start(char *conninfo)
 	 * PGconn structure"
 	 */
 	if (!pg_conn)
-		wpg_log(FATAL, "failed to allocate new PGconn object");
+		elog(FATAL, "failed to allocate new PGconn object");

 	/*
 	 * And in theory this allocation can fail as well, but it's incredibly
@@ -639,20 +635,11 @@ libpqwp_connect_start(char *conninfo)
 	 * palloc will exit on failure though, so there's not much we could do if
 	 * it *did* fail.
 	 */
-	conn = palloc(sizeof(WalProposerConn));
-	conn->pg_conn = pg_conn;
-	conn->is_nonblocking = false;	/* connections always start in blocking
-									 * mode */
-	conn->recvbuf = NULL;
-	return conn;
-}
-
-static void
-walprop_connect_start(Safekeeper *sk)
-{
-	Assert(sk->conn == NULL);
-	sk->conn = libpqwp_connect_start(sk->conninfo);
-
+	sk->conn = palloc(sizeof(WalProposerConn));
+	sk->conn->pg_conn = pg_conn;
+	sk->conn->is_nonblocking = false;	/* connections always start in
+										 * blocking mode */
+	sk->conn->recvbuf = NULL;
 }

 static WalProposerConnectPollStatusType
@@ -680,7 +667,7 @@ walprop_connect_poll(Safekeeper *sk)
 			 * unused. We'll expect it's never returned.
 			 */
 		case PGRES_POLLING_ACTIVE:
-			wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
+			elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");

 			/*
 			 * This return is never actually reached, but it's here to make
@@ -696,33 +683,26 @@ walprop_connect_poll(Safekeeper *sk)
 	return return_val;
 }

-extern bool
-libpqwp_send_query(WalProposerConn *conn, char *query)
+static bool
+walprop_send_query(Safekeeper *sk, char *query)
 {
 	/*
 	 * We need to be in blocking mode for sending the query to run without
 	 * requiring a call to PQflush
 	 */
-	if (!ensure_nonblocking_status(conn, false))
+	if (!ensure_nonblocking_status(sk->conn, false))
 		return false;

 	/* PQsendQuery returns 1 on success, 0 on failure */
-	if (!PQsendQuery(conn->pg_conn, query))
+	if (!PQsendQuery(sk->conn->pg_conn, query))
 		return false;

 	return true;
 }

-static bool
-walprop_send_query(Safekeeper *sk, char *query)
+static WalProposerExecStatusType
+walprop_get_query_result(Safekeeper *sk)
 {
-	return libpqwp_send_query(sk->conn, query);
-}
-
-WalProposerExecStatusType
-libpqwp_get_query_result(WalProposerConn *conn)
-{
-
 	PGresult   *result;
 	WalProposerExecStatusType return_val;

@@ -730,14 +710,14 @@ libpqwp_get_query_result(WalProposerConn *conn)
 	char	   *unexpected_success = NULL;

 	/* Consume any input that we might be missing */
-	if (!PQconsumeInput(conn->pg_conn))
+	if (!PQconsumeInput(sk->conn->pg_conn))
 		return WP_EXEC_FAILED;

-	if (PQisBusy(conn->pg_conn))
+	if (PQisBusy(sk->conn->pg_conn))
 		return WP_EXEC_NEEDS_INPUT;


-	result = PQgetResult(conn->pg_conn);
+	result = PQgetResult(sk->conn->pg_conn);

 	/*
 	 * PQgetResult returns NULL only if getting the result was successful &
@@ -745,7 +725,7 @@ libpqwp_get_query_result(WalProposerConn *conn)
 	 */
 	if (!result)
 	{
-		wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
+		elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
 		return WP_EXEC_UNEXPECTED_SUCCESS;
 	}

@@ -793,17 +773,11 @@ libpqwp_get_query_result(WalProposerConn *conn)
 	}

 	if (unexpected_success)
-		wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
+		elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);

 	return return_val;
 }

-static WalProposerExecStatusType
-walprop_get_query_result(Safekeeper *sk)
-{
-	return libpqwp_get_query_result(sk->conn);
-}
-
 static pgsocket
 walprop_socket(Safekeeper *sk)
 {
@@ -816,31 +790,42 @@ walprop_flush(Safekeeper *sk)
 	return (PQflush(sk->conn->pg_conn));
 }

-/* Like libpqrcv_receive. *buf is valid until the next call. */
-PGAsyncReadResult
-libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
+static void
+walprop_finish(Safekeeper *sk)
 {
-	int			rawlen;
+	if (!sk->conn)
+		return;

-	if (conn->recvbuf != NULL)
+	if (sk->conn->recvbuf != NULL)
+		PQfreemem(sk->conn->recvbuf);
+	PQfinish(sk->conn->pg_conn);
+	pfree(sk->conn);
+	sk->conn = NULL;
+}
+
+/*
+ * Receive a message from the safekeeper.
+ *
+ * On success, the data is placed in *buf. It is valid until the next call
+ * to this function.
+ */
+static PGAsyncReadResult
+walprop_async_read(Safekeeper *sk, char **buf, int *amount)
+{
+	int			result;
+
+	if (sk->conn->recvbuf != NULL)
 	{
-		PQfreemem(conn->recvbuf);
-		conn->recvbuf = NULL;
+		PQfreemem(sk->conn->recvbuf);
+		sk->conn->recvbuf = NULL;
 	}

-	/* Try to receive a CopyData message */
-	rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true);
-	if (rawlen == 0)
+	/* Call PQconsumeInput so that we have the data we need */
+	if (!PQconsumeInput(sk->conn->pg_conn))
 	{
-		/* Try consuming some data. */
-		if (!PQconsumeInput(conn->pg_conn))
-		{
-			*amount = 0;
-			*buf = NULL;
-			return PG_ASYNC_READ_FAIL;
-		}
-		/* Now that we've consumed some input, try again */
-		rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true);
+		*amount = 0;
+		*buf = NULL;
+		return PG_ASYNC_READ_FAIL;
 	}

 	/*
@@ -854,7 +839,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 	 * sometimes be triggered by the server returning an ErrorResponse (which
 	 * also happens to have the effect that the copy is done).
 	 */
-	switch (rawlen)
+	switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true))
 	{
 		case 0:
 			*amount = 0;
@@ -869,10 +854,10 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 				 * We can check PQgetResult to make sure that the server
 				 * failed; it'll always result in PGRES_FATAL_ERROR
 				 */
-				ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
+				ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn));

 				if (status != PGRES_FATAL_ERROR)
-					wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
+					elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);

 				/*
 				 * If there was actually an error, it'll be properly reported
@@ -889,24 +874,12 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
 			return PG_ASYNC_READ_FAIL;
 		default:
 			/* Positive values indicate the size of the returned result */
-			*amount = rawlen;
-			*buf = conn->recvbuf;
+			*amount = result;
+			*buf = sk->conn->recvbuf;
 			return PG_ASYNC_READ_SUCCESS;
 	}
 }

-/*
- * Receive a message from the safekeeper.
- *
- * On success, the data is placed in *buf. It is valid until the next call
- * to this function.
- */
-static PGAsyncReadResult
-walprop_async_read(Safekeeper *sk, char **buf, int *amount)
-{
-	return libpqwp_async_read(sk->conn, buf, amount);
-}
-
 static PGAsyncWriteResult
 walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
 {
@@ -937,7 +910,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
 		case -1:
 			return PG_ASYNC_WRITE_FAIL;
 		default:
-			wpg_log(FATAL, "invalid return %d from PQputCopyData", result);
+			elog(FATAL, "invalid return %d from PQputCopyData", result);
 	}

 	/*
@@ -958,7 +931,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
 		case -1:
 			return PG_ASYNC_WRITE_FAIL;
 		default:
-			wpg_log(FATAL, "invalid return %d from PQflush", result);
+			elog(FATAL, "invalid return %d from PQflush", result);
 	}
 }

@@ -989,33 +962,6 @@ walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size)
 	return true;
 }

-void
-libpqwp_disconnect(WalProposerConn *conn)
-{
-	if (conn->recvbuf != NULL)
-		PQfreemem(conn->recvbuf);
-	PQfinish(conn->pg_conn);
-	pfree(conn);
-}
-
-static void
-walprop_finish(Safekeeper *sk)
-{
-	if (sk->conn)
-	{
-		libpqwp_disconnect(sk->conn);
-		sk->conn = NULL;
-	}
-
-	/* free xlogreader */
-	if (sk->xlogreader)
-	{
-		NeonWALReaderFree(sk->xlogreader);
-		sk->xlogreader = NULL;
-	}
-	rm_safekeeper_event_set(sk, false);
-}
-
 /*
 * Subscribe for new WAL and stream it in the loop to safekeepers.
 *
@@ -1219,25 +1165,16 @@ XLogBroadcastWalProposer(WalProposer *wp)
 	}
 }

-/* Download WAL before basebackup for logical walsenders from sk, if needed */
+/*
+ * Receive WAL from the most advanced safekeeper
+ */
 static bool
-WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
+WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos)
 {
 	char	   *err;
 	WalReceiverConn *wrconn;
 	WalRcvStreamOptions options;
 	char		conninfo[MAXCONNINFO];
-	TimeLineID	timeline;
-	XLogRecPtr	startpos;
-	XLogRecPtr	endpos;
-	uint64		download_range_mb;
-
-	startpos = GetLogRepRestartLSN(wp);
-	if (startpos == InvalidXLogRecPtr)
-		return true;			/* recovery not needed */
-	endpos = wp->propEpochStartLsn;
-
-	timeline = wp->greetRequest.timeline;

 	if (!neon_auth_token)
 	{
@@ -1249,7 +1186,7 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)

 		written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo);
 		if (written > MAXCONNINFO || written < 0)
-			wpg_log(FATAL, "could not append password to the safekeeper connection string");
+			elog(FATAL, "could not append password to the safekeeper connection string");
 	}

 #if PG_MAJORVERSION_NUM < 16
@@ -1266,11 +1203,11 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 						err)));
 		return false;
 	}
-	wpg_log(LOG,
-			"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
-			"%d",
-			sk->host, sk->port, (uint32) (startpos >> 32),
-			(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
+	elog(LOG,
+		 "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline "
+		 "%d",
+		 sk->host, sk->port, (uint32) (startpos >> 32),
+		 (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);

 	options.logical = false;
 	options.startpoint = startpos;
@@ -1463,54 +1400,28 @@ XLogWalPropClose(XLogRecPtr recptr)
 	walpropFile = -1;
 }

+static void
+walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count)
+{
+	WALReadError errinfo;
+
+	if (!WALRead(sk->xlogreader,
+				 buf,
+				 startptr,
+				 count,
+				 walprop_pg_get_timeline_id(),
+				 &errinfo))
+	{
+		WALReadRaiseError(&errinfo);
+	}
+}
+
 static void
 walprop_pg_wal_reader_allocate(Safekeeper *sk)
 {
-	char		log_prefix[64];
-
-	snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
-	Assert(!sk->xlogreader);
-	sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
+	sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
 	if (sk->xlogreader == NULL)
-		wpg_log(FATAL, "failed to allocate xlog reader");
-}
-
-static NeonWALReadResult
-walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg)
-{
-	NeonWALReadResult res;
-
-	res = NeonWALRead(sk->xlogreader,
-					  buf,
-					  startptr,
-					  count,
-					  walprop_pg_get_timeline_id());
-
-	if (res == NEON_WALREAD_SUCCESS)
-	{
-		/*
-		 * If we have the socket subscribed, but walreader doesn't need any
-		 * events, it must mean that remote connection just closed hoping to
-		 * do next read locally. Remove the socket then. It is important to do
-		 * as otherwise next read might open another connection and we won't
-		 * be able to distinguish whether we have correct socket added in wait
-		 * event set.
-		 */
-		if (NeonWALReaderEvents(sk->xlogreader) == 0)
-			rm_safekeeper_event_set(sk, false);
-	}
-	else if (res == NEON_WALREAD_ERROR)
-	{
-		*errmsg = NeonWALReaderErrMsg(sk->xlogreader);
-	}
-
-	return res;
-}
-
-static uint32
-walprop_pg_wal_reader_events(Safekeeper *sk)
-{
-	return NeonWALReaderEvents(sk->xlogreader);
+		elog(FATAL, "Failed to allocate xlog reader");
 }

 static WaitEventSet *waitEvents;
@@ -1527,8 +1438,6 @@ walprop_pg_free_event_set(WalProposer *wp)
 	for (int i = 0; i < wp->n_safekeepers; i++)
 	{
 		wp->safekeeper[i].eventPos = -1;
-		wp->safekeeper[i].nwrEventPos = -1;
-		wp->safekeeper[i].nwrConnEstablished = false;
 	}
 }

@@ -1536,39 +1445,13 @@ static void
 walprop_pg_init_event_set(WalProposer *wp)
 {
 	if (waitEvents)
-		wpg_log(FATAL, "double-initialization of event set");
+		elog(FATAL, "double-initialization of event set");

-	/* for each sk, we have socket plus potentially socket for neon walreader */
-	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
+	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers);
 	AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
 	AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
-
-	for (int i = 0; i < wp->n_safekeepers; i++)
-	{
-		wp->safekeeper[i].eventPos = -1;
-		wp->safekeeper[i].nwrEventPos = -1;
-		wp->safekeeper[i].nwrConnEstablished = false;
-	}
-}
-
-/* add safekeeper socket to wait event set */
-static void
-walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
-{
-	Assert(sk->eventPos == -1);
-	sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
-}
-
-/* add neon wal reader socket to wait event set */
-static void
-add_nwr_event_set(Safekeeper *sk, uint32 events)
-{
-	Assert(sk->nwrEventPos == -1);
-	sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
-	sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader);
-	wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
 }

 static void
@@ -1580,144 +1463,10 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events)
 	ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
 }

-/*
- * Update neon_walreader event.
- * Can be called when nwr socket doesn't exist, does nothing in this case.
- */
 static void
-update_nwr_event_set(Safekeeper *sk, uint32 events)
+walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
 {
-	/* eventPos = -1 when we don't have an event */
-	if (sk->nwrEventPos != -1)
-		ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL);
-}
-
-
-static void
-walprop_pg_active_state_update_event_set(Safekeeper *sk)
-{
-	uint32		sk_events;
-	uint32		nwr_events;
-
-	Assert(sk->state == SS_ACTIVE);
-	SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
-
-	/*
-	 * If we need to wait for neon_walreader, ensure we have up to date socket
-	 * in the wait event set.
-	 */
-	if (sk->active_state == SS_ACTIVE_READ_WAL)
-	{
-		/*
-		 * If conn is established and socket is thus stable, update the event
-		 * directly; otherwise re-add it.
-		 */
-		if (sk->nwrConnEstablished)
-		{
-			Assert(sk->nwrEventPos != -1);
-			update_nwr_event_set(sk, nwr_events);
-		}
-		else
-		{
-			rm_safekeeper_event_set(sk, false);
-			add_nwr_event_set(sk, nwr_events);
-		}
-	}
-	else
-	{
-		/*
-		 * Hack: we should always set 0 here, but for random reasons
-		 * WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least
-		 * some event. Since there is also no way to remove socket except
-		 * reconstructing the whole set, SafekeeperStateDesiredEvents instead
-		 * gives WL_SOCKET_CLOSED if socket exists. We never expect it to
-		 * trigger.
-		 *
-		 * On PG 14 which doesn't have WL_SOCKET_CLOSED resort to event
-		 * removal.
-		 */
-#if PG_VERSION_NUM >= 150000
-		Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0);
-		update_nwr_event_set(sk, WL_SOCKET_CLOSED);
-#else							/* pg 14 */
-		rm_safekeeper_event_set(sk, false);
-#endif
-	}
-	walprop_pg_update_event_set(sk, sk_events);
-}
-
-static void
-walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove)
-{
-	rm_safekeeper_event_set(to_remove, true);
-}
-
-/*
- * A hacky way to remove single event from the event set. Can be called if event
- * doesn't exist, does nothing in this case.
- *
- * Note: Internally, this completely reconstructs the event set. It should be
- * avoided if possible.
- *
- * If is_sk is true, socket of connection to safekeeper is removed; otherwise
- * socket of neon_walreader.
- */
-static void
-rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
-{
-	WalProposer *wp = to_remove->wp;
-
-	wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d",
-			to_remove->host, to_remove->port, is_sk);
-
-	/*
-	 * Shortpath for exiting if have nothing to do. We never call this
-	 * function with safekeeper socket not existing, but do that with neon
-	 * walreader socket.
-	 */
-	if ((is_sk && to_remove->eventPos == -1) ||
-		(!is_sk && to_remove->nwrEventPos == -1))
-	{
-		return;
-	}
-
-	/* Remove the existing event set, assign sk->eventPos = -1 */
-	walprop_pg_free_event_set(wp);
-
-	/* Re-initialize it without adding any safekeeper events */
-	wp->api.init_event_set(wp);
-
-	/*
-	 * loop through the existing safekeepers. If they aren't the one we're
-	 * removing, and if they have a socket we can use, re-add the applicable
-	 * events.
-	 */
-	for (int i = 0; i < wp->n_safekeepers; i++)
-	{
-		Safekeeper *sk = &wp->safekeeper[i];
-
-		/*
-		 * If this safekeeper isn't offline, add events for it, except for the
-		 * event requested to remove.
-		 */
-		if (sk->state != SS_OFFLINE)
-		{
-			uint32		sk_events;
-			uint32		nwr_events;
-
-			SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
-
-			if (sk != to_remove || !is_sk)
-			{
-				/* will set sk->eventPos */
-				wp->api.add_safekeeper_event_set(sk, sk_events);
-			}
-			if ((sk != to_remove || is_sk) && nwr_events)
-			{
-				add_nwr_event_set(sk, nwr_events);
-			}
-		}
-	}
+	sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
 }

 static int
@@ -1735,8 +1484,8 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
 		ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv);

 	/*
-	 * Now that we prepared the condvar, check flush ptr again -- it might
-	 * have changed before we subscribed to cv so we missed the wakeup.
+	 * Now that we prepared the condvar, check flush ptr again -- it might have
+	 * changed before we subscribed to cv so we missed the wakeup.
 	 *
 	 * Do that only when we're interested in new WAL: without sync-safekeepers
 	 * and if election already passed.
@@ -1799,7 +1548,7 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
 }

 /*
- * Choose most advanced PageserverFeedback and set it to *rf.
+ * Get PageserverFeedback fields from the most advanced safekeeper
 */
 static void
 GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
@@ -1822,13 +1571,15 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
 	rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
 	rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime;

-	wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
-			" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
-			rf->currentClusterSize,
-			LSN_FORMAT_ARGS(rf->last_received_lsn),
-			LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
-			LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
-			rf->replytime);
+	elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
+		 " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
+		 rf->currentClusterSize,
+		 LSN_FORMAT_ARGS(rf->last_received_lsn),
+		 LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
+		 LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
+		 rf->replytime);
+
+	replication_feedback_set(rf);
 }

 /*
@@ -1868,69 +1619,63 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
 		hs->catalog_xmin = InvalidFullTransactionId;
 }

-/*
- * Based on commitLsn and safekeeper responses including pageserver feedback,
- * 1) Propagate cluster size received from ps to ensure the limit.
- * 2) Propagate pageserver LSN positions to ensure backpressure limits.
- * 3) Advance walproposer slot to commitLsn (releasing WAL & waking up waiters).
- * 4) Propagate hot standby feedback.
- *
- * None of that is functional in sync-safekeepers.
- */
 static void
 walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
 {
 	HotStandbyFeedback hsFeedback;
-	XLogRecPtr	oldDiskConsistentLsn;
+	XLogRecPtr	diskConsistentLsn;

-	if (wp->config->syncSafekeepers)
-		return;
+	diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;

-	oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
-
-	/* Get PageserverFeedback fields from the most advanced safekeeper */
-	GetLatestNeonFeedback(&quorumFeedback.rf, wp);
-	replication_feedback_set(&quorumFeedback.rf);
-	SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
-
-	if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
+	if (!wp->config->syncSafekeepers)
 	{
+		/* Get PageserverFeedback fields from the most advanced safekeeper */
+		GetLatestNeonFeedback(&quorumFeedback.rf, wp);
+		SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
+	}
+
+	if (commitLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
+	{
+
 		if (commitLsn > quorumFeedback.flushLsn)
 			quorumFeedback.flushLsn = commitLsn;

-		/*
-		 * Advance the replication slot to commitLsn. WAL before it is
-		 * hardened and will be fetched from one of safekeepers by
-		 * neon_walreader if needed.
-		 *
-		 * Also wakes up syncrep waiters.
-		 */
-		ProcessStandbyReply(
-		/* write_lsn -  This is what durably stored in WAL service. */
-							quorumFeedback.flushLsn,
-		/* flush_lsn - This is what durably stored in WAL service. */
-							quorumFeedback.flushLsn,
+		/* advance the replication slot */
+		if (!wp->config->syncSafekeepers)
+			ProcessStandbyReply(
+			/* write_lsn - This is what is durably stored in the WAL service. */
+								quorumFeedback.flushLsn,
+			/* flush_lsn - This is what is durably stored in the WAL service. */
+								quorumFeedback.flushLsn,

-		/*
-		 * apply_lsn - This is what processed and durably saved at*
-		 * pageserver.
-		 */
-							quorumFeedback.rf.disk_consistent_lsn,
-							walprop_pg_get_current_timestamp(wp), false);
+			/*
+			 * apply_lsn - This is what is processed and durably saved
+			 * at the pageserver.
+			 */
+								quorumFeedback.rf.disk_consistent_lsn,
+								walprop_pg_get_current_timestamp(wp), false);
 	}

 	CombineHotStanbyFeedbacks(&hsFeedback, wp);
 	if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0)
 	{
 		quorumFeedback.hs = hsFeedback;
-		ProcessStandbyHSFeedback(hsFeedback.ts,
-								 XidFromFullTransactionId(hsFeedback.xmin),
-								 EpochFromFullTransactionId(hsFeedback.xmin),
-								 XidFromFullTransactionId(hsFeedback.catalog_xmin),
-								 EpochFromFullTransactionId(hsFeedback.catalog_xmin));
+		if (!wp->config->syncSafekeepers)
+			ProcessStandbyHSFeedback(hsFeedback.ts,
+									 XidFromFullTransactionId(hsFeedback.xmin),
+									 EpochFromFullTransactionId(hsFeedback.xmin),
+									 XidFromFullTransactionId(hsFeedback.catalog_xmin),
+									 EpochFromFullTransactionId(hsFeedback.catalog_xmin));
 	}
 }

+static void
+walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn)
+{
+	if (MyReplicationSlot)
+		PhysicalConfirmReceivedLocation(lsn);
+}
+
 static XLogRecPtr
 walprop_pg_get_redo_start_lsn(WalProposer *wp)
 {
@@ -1949,15 +1694,15 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
 	elog(FATAL, "unexpected log_internal message at level %d: %s", level, line);
 }

-static XLogRecPtr
-GetLogRepRestartLSN(WalProposer *wp)
+static void
+walprop_pg_after_election(WalProposer *wp)
 {
 	FILE	   *f;
-	XLogRecPtr	lrRestartLsn = InvalidXLogRecPtr;
+	XLogRecPtr	lrRestartLsn;

 	/* We don't need to do anything in syncSafekeepers mode. */
 	if (wp->config->syncSafekeepers)
-		return InvalidXLogRecPtr;
+		return;

 	/*
 	 * If there are active logical replication subscription we need to provide
@@ -1965,40 +1710,22 @@ GetLogRepRestartLSN(WalProposer *wp)
 	 * replication slots.
 	 */
 	f = fopen("restart.lsn", "rb");
-	if (f != NULL)
+	if (f != NULL && !wp->config->syncSafekeepers)
 	{
-		size_t		rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
-
+		size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
 		fclose(f);
 		if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr)
 		{
-			uint64		download_range_mb;
-
-			wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
-
-			/*
-			 * If we need to download more than a max_slot_wal_keep_size,
-			 * don't do it to avoid risk of exploding pg_wal. Logical
-			 * replication won't work until recreated, but at least compute
-			 * would start; this also follows max_slot_wal_keep_size
-			 * semantics.
-			 */
-			download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB;
-			if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
-			{
-				wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
-						LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
-				return InvalidXLogRecPtr;
-			}
+			elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));

 			/*
 			 * start from the beginning of the segment to fetch page headers
 			 * verifed by XLogReader
 			 */
 			lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
+			wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
 		}
 	}
-	return lrRestartLsn;
 }

 static const walproposer_api walprop_pg = {
@@ -2018,18 +1745,18 @@ static const walproposer_api walprop_pg = {
 	.conn_async_write = walprop_async_write,
 	.conn_blocking_write = walprop_blocking_write,
 	.recovery_download = WalProposerRecovery,
-	.wal_reader_allocate = walprop_pg_wal_reader_allocate,
 	.wal_read = walprop_pg_wal_read,
-	.wal_reader_events = walprop_pg_wal_reader_events,
+	.wal_reader_allocate = walprop_pg_wal_reader_allocate,
+	.free_event_set = walprop_pg_free_event_set,
 	.init_event_set = walprop_pg_init_event_set,
 	.update_event_set = walprop_pg_update_event_set,
-	.active_state_update_event_set = walprop_pg_active_state_update_event_set,
 	.add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set,
-	.rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set,
 	.wait_event_set = walprop_pg_wait_event_set,
 	.strong_random = walprop_pg_strong_random,
 	.get_redo_start_lsn = walprop_pg_get_redo_start_lsn,
 	.finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers,
 	.process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback,
+	.confirm_wal_streamed = walprop_pg_confirm_wal_streamed,
 	.log_internal = walprop_pg_log_internal,
+	.after_election = walprop_pg_after_election,
 };
--- a/poetry.lock
+++ b/poetry.lock
@@ -339,19 +339,19 @@ uvloop = ["uvloop (>=0.15.2)"]

 [[package]]
 name = "boto3"
-version = "1.34.11"
+version = "1.26.16"
 description = "The AWS SDK for Python"
 optional = false
-python-versions = ">= 3.8"
+python-versions = ">= 3.7"
 files = [
-    {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"},
-    {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"},
+    {file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"},
+    {file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"},
 ]

 [package.dependencies]
-botocore = ">=1.34.11,<1.35.0"
+botocore = ">=1.29.16,<1.30.0"
 jmespath = ">=0.7.1,<2.0.0"
-s3transfer = ">=0.10.0,<0.11.0"
+s3transfer = ">=0.6.0,<0.7.0"

 [package.extras]
 crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
@@ -702,25 +702,22 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"]

 [[package]]
 name = "botocore"
-version = "1.34.11"
+version = "1.29.16"
 description = "Low-level, data-driven core of boto 3."
 optional = false
-python-versions = ">= 3.8"
+python-versions = ">= 3.7"
 files = [
-    {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"},
-    {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"},
+    {file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"},
+    {file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"},
 ]

 [package.dependencies]
 jmespath = ">=0.7.1,<2.0.0"
 python-dateutil = ">=2.1,<3.0.0"
-urllib3 = [
-    {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""},
-    {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""},
-]
+urllib3 = ">=1.25.4,<1.27"

 [package.extras]
-crt = ["awscrt (==0.19.19)"]
+crt = ["awscrt (==0.14.0)"]

 [[package]]
 name = "botocore-stubs"
@@ -1892,13 +1889,13 @@ files = [

 [[package]]
 name = "pytest"
-version = "7.4.4"
+version = "7.3.1"
 description = "pytest: simple powerful testing with Python"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"},
-    {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"},
+    {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"},
+    {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"},
 ]

 [package.dependencies]
@@ -1910,7 +1907,7 @@ pluggy = ">=0.12,<2.0"
 tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}

 [package.extras]
-testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
+testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]

 [[package]]
 name = "pytest-asyncio"
@@ -2233,20 +2230,20 @@ files = [

 [[package]]
 name = "s3transfer"
-version = "0.10.0"
+version = "0.6.0"
 description = "An Amazon S3 Transfer Manager"
 optional = false
-python-versions = ">= 3.8"
+python-versions = ">= 3.7"
 files = [
-    {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"},
-    {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"},
+    {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"},
+    {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"},
 ]

 [package.dependencies]
-botocore = ">=1.33.2,<2.0a.0"
+botocore = ">=1.12.36,<2.0a.0"

 [package.extras]
-crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"]
+crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"]

 [[package]]
 name = "sarif-om"
@@ -2743,4 +2740,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "8de8b05a9b35a6f76da7d7e3652ddbb521f1eca53fce7b933f537080a9d6eada"
+content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -11,7 +11,6 @@ use proxy::http;
 use proxy::rate_limiter::EndpointRateLimiter;
 use proxy::rate_limiter::RateBucketInfo;
 use proxy::rate_limiter::RateLimiterConfig;
-use proxy::serverless::GlobalConnPoolOptions;
 use proxy::usage_metrics;

 use anyhow::bail;
@@ -96,8 +95,12 @@ struct ProxyCliArgs {
    /// Allow self-signed certificates for compute nodes (for testing)
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    allow_self_signed_compute: bool,
-    #[clap(flatten)]
-    sql_over_http: SqlOverHttpArgs,
+    /// timeout for http connections
+    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
+    sql_over_http_timeout: tokio::time::Duration,
+    /// Whether the SQL over http pool is opt-in
+    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    sql_over_http_pool_opt_in: bool,
    /// timeout for scram authentication protocol
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    scram_protocol_timeout: tokio::time::Duration,
@@ -135,36 +138,6 @@ struct ProxyCliArgs {
    disable_ip_check_for_http: bool,
 }

-#[derive(clap::Args, Clone, Copy, Debug)]
-struct SqlOverHttpArgs {
-    /// timeout for http connection requests
-    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
-    sql_over_http_timeout: tokio::time::Duration,
-
-    /// Whether the SQL over http pool is opt-in
-    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
-    sql_over_http_pool_opt_in: bool,
-
-    /// How many connections to pool for each endpoint. Excess connections are discarded
-    #[clap(long, default_value_t = 20)]
-    sql_over_http_pool_max_conns_per_endpoint: usize,
-
-    /// How long pooled connections should remain idle for before closing
-    #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)]
-    sql_over_http_idle_timeout: tokio::time::Duration,
-
-    /// Duration each shard will wait on average before a GC sweep.
-    /// A longer time will causes sweeps to take longer but will interfere less frequently.
-    #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)]
-    sql_over_http_pool_gc_epoch: tokio::time::Duration,
-
-    /// How many shards should the global pool have. Must be a power of two.
-    /// More shards will introduce less contention for pool operations, but can
-    /// increase memory used by the pool
-    #[clap(long, default_value_t = 128)]
-    sql_over_http_pool_shards: usize,
-}
-
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let _logging_guard = proxy::logging::init().await?;
@@ -354,14 +327,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        }
    };
    let http_config = HttpConfig {
-        request_timeout: args.sql_over_http.sql_over_http_timeout,
-        pool_options: GlobalConnPoolOptions {
-            max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint,
-            gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch,
-            pool_shards: args.sql_over_http.sql_over_http_pool_shards,
-            idle_timeout: args.sql_over_http.sql_over_http_idle_timeout,
-            opt_in: args.sql_over_http.sql_over_http_pool_opt_in,
-        },
+        timeout: args.sql_over_http_timeout,
+        pool_opt_in: args.sql_over_http_pool_opt_in,
    };
    let authentication_config = AuthenticationConfig {
        scram_protocol_timeout: args.scram_protocol_timeout,
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,4 +1,4 @@
-use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions};
+use crate::{auth, rate_limiter::RateBucketInfo};
 use anyhow::{bail, ensure, Context, Ok};
 use rustls::{sign, Certificate, PrivateKey};
 use sha2::{Digest, Sha256};
@@ -36,8 +36,8 @@ pub struct TlsConfig {
 }

 pub struct HttpConfig {
-    pub request_timeout: tokio::time::Duration,
-    pub pool_options: GlobalConnPoolOptions,
+    pub timeout: tokio::time::Duration,
+    pub pool_opt_in: bool,
 }

 pub struct AuthenticationConfig {
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -11,7 +11,7 @@ use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
 use itertools::Itertools;
-use std::sync::Arc;
+use std::{net::SocketAddr, sync::Arc};
 use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -141,7 +141,7 @@ impl Api {
            // We'll set username and such later using the startup message.
            // TODO: add more type safety (in progress).
            let mut config = compute::ConnCfg::new();
-            config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
+            config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.

            let node = NodeInfo {
                config,
@@ -269,10 +269,9 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
    Err(ApiError::Console { status, text })
 }

-fn parse_host_port(input: &str) -> Option<(&str, u16)> {
-    let (host, port) = input.rsplit_once(':')?;
-    let ipv6_brackets: &[_] = &['[', ']'];
-    Some((host.trim_matches(ipv6_brackets), port.parse().ok()?))
+fn parse_host_port(input: &str) -> Option<(String, u16)> {
+    let parsed: SocketAddr = input.parse().ok()?;
+    Some((parsed.ip().to_string(), parsed.port()))
 }

 #[cfg(test)]
@@ -280,24 +279,9 @@ mod tests {
    use super::*;

    #[test]
-    fn test_parse_host_port_v4() {
+    fn test_parse_host_port() {
        let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
        assert_eq!(host, "127.0.0.1");
        assert_eq!(port, 5432);
    }
-
-    #[test]
-    fn test_parse_host_port_v6() {
-        let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
-        assert_eq!(host, "2001:db8::1");
-        assert_eq!(port, 5432);
-    }
-
-    #[test]
-    fn test_parse_host_port_url() {
-        let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
-            .expect("failed to parse");
-        assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local");
-        assert_eq!(port, 5432);
-    }
 }
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -6,13 +6,9 @@ mod conn_pool;
 mod sql_over_http;
 mod websocket;

-pub use conn_pool::GlobalConnPoolOptions;
-
 use anyhow::bail;
 use hyper::StatusCode;
 use metrics::IntCounterPairGuard;
-use rand::rngs::StdRng;
-use rand::SeedableRng;
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
@@ -51,11 +47,6 @@ pub async fn task_main(

    let conn_pool = conn_pool::GlobalConnPool::new(config);

-    let conn_pool2 = Arc::clone(&conn_pool);
-    tokio::spawn(async move {
-        conn_pool2.gc_worker(StdRng::from_entropy()).await;
-    });
-
    // shutdown the connection pool
    tokio::spawn({
        let cancellation_token = cancellation_token.clone();
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -1,19 +1,15 @@
 use anyhow::{anyhow, Context};
 use async_trait::async_trait;
 use dashmap::DashMap;
-use futures::{future::poll_fn, Future};
-use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard};
-use once_cell::sync::Lazy;
+use futures::future::poll_fn;
 use parking_lot::RwLock;
 use pbkdf2::{
    password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString},
    Params, Pbkdf2,
 };
 use pq_proto::StartupMessageParams;
-use prometheus::{exponential_buckets, register_histogram, Histogram};
-use rand::Rng;
 use smol_str::SmolStr;
-use std::{collections::HashMap, net::IpAddr, pin::pin, sync::Arc, sync::Weak, time::Duration};
+use std::{collections::HashMap, net::IpAddr, sync::Arc};
 use std::{
    fmt,
    task::{ready, Poll},
@@ -22,7 +18,7 @@ use std::{
    ops::Deref,
    sync::atomic::{self, AtomicUsize},
 };
-use tokio::time::{self, Instant};
+use tokio::time;
 use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};

 use crate::{
@@ -34,10 +30,11 @@ use crate::{
 };
 use crate::{compute, config};

-use tracing::{debug, error, warn, Span};
+use tracing::{error, warn, Span};
 use tracing::{info, info_span, Instrument};

 pub const APP_NAME: &str = "/sql_over_http";
+const MAX_CONNS_PER_ENDPOINT: usize = 20;

 #[derive(Debug, Clone)]
 pub struct ConnInfo {
@@ -72,77 +69,6 @@ struct ConnPoolEntry {
 pub struct EndpointConnPool {
    pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
    total_conns: usize,
-    max_conns: usize,
-    _guard: IntCounterPairGuard,
-}
-
-impl EndpointConnPool {
-    fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option<ConnPoolEntry> {
-        let Self {
-            pools, total_conns, ..
-        } = self;
-        pools
-            .get_mut(&db_user)
-            .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
-    }
-
-    fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool {
-        let Self {
-            pools, total_conns, ..
-        } = self;
-        if let Some(pool) = pools.get_mut(&db_user) {
-            let old_len = pool.conns.len();
-            pool.conns.retain(|conn| conn.conn.conn_id != conn_id);
-            let new_len = pool.conns.len();
-            let removed = old_len - new_len;
-            *total_conns -= removed;
-            removed > 0
-        } else {
-            false
-        }
-    }
-
-    fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
-        let conn_id = client.conn_id;
-
-        if client.inner.is_closed() {
-            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
-            return Ok(());
-        }
-
-        // return connection to the pool
-        let mut returned = false;
-        let mut per_db_size = 0;
-        let total_conns = {
-            let mut pool = pool.write();
-
-            if pool.total_conns < pool.max_conns {
-                // we create this db-user entry in get, so it should not be None
-                if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
-                    pool_entries.conns.push(ConnPoolEntry {
-                        conn: client,
-                        _last_access: std::time::Instant::now(),
-                    });
-
-                    returned = true;
-                    per_db_size = pool_entries.conns.len();
-
-                    pool.total_conns += 1;
-                }
-            }
-
-            pool.total_conns
-        };
-
-        // do logging outside of the mutex
-        if returned {
-            info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
-        } else {
-            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
-        }
-
-        Ok(())
-    }
 }

 /// 4096 is the number of rounds that SCRAM-SHA-256 recommends.
@@ -161,27 +87,6 @@ pub struct DbUserConnPool {
    password_hash: Option<PasswordHashString>,
 }

-impl DbUserConnPool {
-    fn clear_closed_clients(&mut self, conns: &mut usize) {
-        let old_len = self.conns.len();
-
-        self.conns.retain(|conn| !conn.conn.inner.is_closed());
-
-        let new_len = self.conns.len();
-        let removed = old_len - new_len;
-        *conns -= removed;
-    }
-
-    fn get_conn_entry(&mut self, conns: &mut usize) -> Option<ConnPoolEntry> {
-        self.clear_closed_clients(conns);
-        let conn = self.conns.pop();
-        if conn.is_some() {
-            *conns -= 1;
-        }
-        conn
-    }
-}
-
 pub struct GlobalConnPool {
    // endpoint -> per-endpoint connection pool
    //
@@ -189,127 +94,52 @@ pub struct GlobalConnPool {
    // pool as early as possible and release the lock.
    global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,

-    /// Number of endpoint-connection pools
-    ///
    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
    /// It's only used for diagnostics.
    global_pool_size: AtomicUsize,

-    proxy_config: &'static crate::config::ProxyConfig,
-}
-
-#[derive(Debug, Clone, Copy)]
-pub struct GlobalConnPoolOptions {
    // Maximum number of connections per one endpoint.
    // Can mix different (dbname, username) connections.
    // When running out of free slots for a particular endpoint,
    // falls back to opening a new connection for each request.
-    pub max_conns_per_endpoint: usize,
+    max_conns_per_endpoint: usize,

-    pub gc_epoch: Duration,
+    proxy_config: &'static crate::config::ProxyConfig,

-    pub pool_shards: usize,
-
-    pub idle_timeout: Duration,
-
-    pub opt_in: bool,
+    // Using a lock to remove any race conditions.
+    // Eg cleaning up connections while a new connection is returned
+    closed: RwLock<bool>,
 }

-pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_http_pool_reclaimation_lag_seconds",
-        "Time it takes to reclaim unused connection pools",
-        // 1us -> 65ms
-        exponential_buckets(1e-6, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
-    register_int_counter_pair!(
-        "proxy_http_pool_endpoints_registered_total",
-        "Number of endpoints we have registered pools for",
-        "proxy_http_pool_endpoints_unregistered_total",
-        "Number of endpoints we have unregistered pools for",
-    )
-    .unwrap()
-});
-
 impl GlobalConnPool {
    pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
-        let shards = config.http_config.pool_options.pool_shards;
        Arc::new(Self {
-            global_pool: DashMap::with_shard_amount(shards),
+            global_pool: DashMap::new(),
            global_pool_size: AtomicUsize::new(0),
+            max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
            proxy_config: config,
+            closed: RwLock::new(false),
        })
    }

    pub fn shutdown(&self) {
-        // drops all strong references to endpoint-pools
-        self.global_pool.clear();
-    }
+        *self.closed.write() = true;

-    pub async fn gc_worker(&self, mut rng: impl Rng) {
-        let epoch = self.proxy_config.http_config.pool_options.gc_epoch;
-        let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32);
-        loop {
-            interval.tick().await;
+        self.global_pool.retain(|_, endpoint_pool| {
+            let mut pool = endpoint_pool.write();
+            // by clearing this hashmap, we remove the slots that a connection can be returned to.
+            // when returning, it drops the connection if the slot doesn't exist
+            pool.pools.clear();
+            pool.total_conns = 0;

-            let shard = rng.gen_range(0..self.global_pool.shards().len());
-            self.gc(shard);
-        }
-    }
-
-    fn gc(&self, shard: usize) {
-        debug!(shard, "pool: performing epoch reclamation");
-
-        // acquire a random shard lock
-        let mut shard = self.global_pool.shards()[shard].write();
-
-        let timer = GC_LATENCY.start_timer();
-        let current_len = shard.len();
-        shard.retain(|endpoint, x| {
-            // if the current endpoint pool is unique (no other strong or weak references)
-            // then it is currently not in use by any connections.
-            if let Some(pool) = Arc::get_mut(x.get_mut()) {
-                let EndpointConnPool {
-                    pools, total_conns, ..
-                } = pool.get_mut();
-
-                // ensure that closed clients are removed
-                pools
-                    .iter_mut()
-                    .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns));
-
-                // we only remove this pool if it has no active connections
-                if *total_conns == 0 {
-                    info!("pool: discarding pool for endpoint {endpoint}");
-                    return false;
-                }
-            }
-
-            true
+            false
        });
-        let new_len = shard.len();
-        drop(shard);
-        timer.observe_duration();
-
-        let removed = current_len - new_len;
-
-        if removed > 0 {
-            let global_pool_size = self
-                .global_pool_size
-                .fetch_sub(removed, atomic::Ordering::Relaxed)
-                - removed;
-            info!("pool: performed global pool gc. size now {global_pool_size}");
-        }
    }

    pub async fn get(
        self: &Arc<Self>,
-        conn_info: ConnInfo,
+        conn_info: &ConnInfo,
        force_new: bool,
        session_id: uuid::Uuid,
        peer_addr: IpAddr,
@@ -317,11 +147,15 @@ impl GlobalConnPool {
        let mut client: Option<ClientInner> = None;
        let mut latency_timer = LatencyTimer::new("http");

+        let pool = if force_new {
+            None
+        } else {
+            Some((conn_info.clone(), self.clone()))
+        };
+
        let mut hash_valid = false;
-        let mut endpoint_pool = Weak::new();
        if !force_new {
            let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
-            endpoint_pool = Arc::downgrade(&pool);
            let mut hash = None;

            // find a pool entry by (dbname, username) if exists
@@ -346,8 +180,12 @@ impl GlobalConnPool {
                // we will continue with the regular connection flow
                if validate.is_ok() {
                    hash_valid = true;
-                    if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) {
-                        client = Some(entry.conn)
+                    let mut pool = pool.write();
+                    if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                        if let Some(entry) = pool_entries.conns.pop() {
+                            client = Some(entry.conn);
+                            pool.total_conns -= 1;
+                        }
                    }
                }
            }
@@ -360,12 +198,11 @@ impl GlobalConnPool {
                info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
                connect_to_compute(
                    self.proxy_config,
-                    &conn_info,
+                    conn_info,
                    conn_id,
                    session_id,
                    latency_timer,
                    peer_addr,
-                    endpoint_pool.clone(),
                )
                .await
            } else {
@@ -377,19 +214,18 @@ impl GlobalConnPool {
                );
                latency_timer.pool_hit();
                latency_timer.success();
-                return Ok(Client::new(client, conn_info, endpoint_pool).await);
+                return Ok(Client::new(client, pool).await);
            }
        } else {
            let conn_id = uuid::Uuid::new_v4();
            info!(%conn_id, "pool: opening a new connection '{conn_info}'");
            connect_to_compute(
                self.proxy_config,
-                &conn_info,
+                conn_info,
                conn_id,
                session_id,
                latency_timer,
                peer_addr,
-                endpoint_pool.clone(),
            )
            .await
        };
@@ -433,7 +269,59 @@ impl GlobalConnPool {
            _ => {}
        }
        let new_client = new_client?;
-        Ok(Client::new(new_client, conn_info, endpoint_pool).await)
+        Ok(Client::new(new_client, pool).await)
+    }
+
+    fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
+        let conn_id = client.conn_id;
+
+        // We want to hold this open while we return. This ensures that the pool can't close
+        // while we are in the middle of returning the connection.
+        let closed = self.closed.read();
+        if *closed {
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed");
+            return Ok(());
+        }
+
+        if client.inner.is_closed() {
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
+            return Ok(());
+        }
+
+        let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+
+        // return connection to the pool
+        let mut returned = false;
+        let mut per_db_size = 0;
+        let total_conns = {
+            let mut pool = pool.write();
+
+            if pool.total_conns < self.max_conns_per_endpoint {
+                // we create this db-user entry in get, so it should not be None
+                if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                    pool_entries.conns.push(ConnPoolEntry {
+                        conn: client,
+                        _last_access: std::time::Instant::now(),
+                    });
+
+                    returned = true;
+                    per_db_size = pool_entries.conns.len();
+
+                    pool.total_conns += 1;
+                }
+            }
+
+            pool.total_conns
+        };
+
+        // do logging outside of the mutex
+        if returned {
+            info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
+        } else {
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
+        }
+
+        Ok(())
    }

    fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
@@ -446,12 +334,6 @@ impl GlobalConnPool {
        let new_pool = Arc::new(RwLock::new(EndpointConnPool {
            pools: HashMap::new(),
            total_conns: 0,
-            max_conns: self
-                .proxy_config
-                .http_config
-                .pool_options
-                .max_conns_per_endpoint,
-            _guard: ENDPOINT_POOLS.guard(),
        }));

        // find or create a pool for this endpoint
@@ -481,11 +363,9 @@ impl GlobalConnPool {
 }

 struct TokioMechanism<'a> {
-    pool: Weak<RwLock<EndpointConnPool>>,
    conn_info: &'a ConnInfo,
    session_id: uuid::Uuid,
    conn_id: uuid::Uuid,
-    idle: Duration,
 }

 #[async_trait]
@@ -505,8 +385,6 @@ impl ConnectMechanism for TokioMechanism<'_> {
            timeout,
            self.conn_id,
            self.session_id,
-            self.pool.clone(),
-            self.idle,
        )
        .await
    }
@@ -525,7 +403,6 @@ async fn connect_to_compute(
    session_id: uuid::Uuid,
    latency_timer: LatencyTimer,
    peer_addr: IpAddr,
-    pool: Weak<RwLock<EndpointConnPool>>,
 ) -> anyhow::Result<ClientInner> {
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());
@@ -570,8 +447,6 @@ async fn connect_to_compute(
            conn_id,
            conn_info,
            session_id,
-            pool,
-            idle: config.http_config.pool_options.idle_timeout,
        },
        node_info,
        &extra,
@@ -587,8 +462,6 @@ async fn connect_to_compute_once(
    timeout: time::Duration,
    conn_id: uuid::Uuid,
    mut session: uuid::Uuid,
-    pool: Weak<RwLock<EndpointConnPool>>,
-    idle: Duration,
 ) -> Result<ClientInner, tokio_postgres::Error> {
    let mut config = (*node_info.config).clone();

@@ -617,29 +490,13 @@ async fn connect_to_compute_once(
        branch_id: node_info.aux.branch_id.clone(),
    };

-    let db_user = conn_info.db_and_user();
    tokio::spawn(
        async move {
            let _conn_gauge = conn_gauge;
-            let mut idle_timeout = pin!(tokio::time::sleep(idle));
            poll_fn(move |cx| {
                if matches!(rx.has_changed(), Ok(true)) {
                    session = *rx.borrow_and_update();
                    info!(%session, "changed session");
-                    idle_timeout.as_mut().reset(Instant::now() + idle);
-                }
-
-                // 5 minute idle connection timeout
-                if idle_timeout.as_mut().poll(cx).is_ready() {
-                    idle_timeout.as_mut().reset(Instant::now() + idle);
-                    info!("connection idle");
-                    if let Some(pool) = pool.clone().upgrade() {
-                        // remove client from pool - should close the connection if it's idle.
-                        // does nothing if the client is currently checked-out and in-use
-                        if pool.write().remove_client(db_user.clone(), conn_id) {
-                            info!("idle connection removed");
-                        }
-                    }
                }

                loop {
@@ -657,25 +514,15 @@ async fn connect_to_compute_once(
                        }
                        Some(Err(e)) => {
                            error!(%session, "connection error: {}", e);
-                            break
+                            return Poll::Ready(())
                        }
                        None => {
                            info!("connection closed");
-                            break
+                            return Poll::Ready(())
                        }
                    }
                }
-
-                // remove from connection pool
-                if let Some(pool) = pool.clone().upgrade() {
-                    if pool.write().remove_client(db_user.clone(), conn_id) {
-                        info!("closed connection removed");
-                    }
-                }
-
-                Poll::Ready(())
-            }).await;
-
+            }).await
        }
        .instrument(span)
    );
@@ -705,27 +552,23 @@ pub struct Client {
    conn_id: uuid::Uuid,
    span: Span,
    inner: Option<ClientInner>,
-    conn_info: ConnInfo,
-    pool: Weak<RwLock<EndpointConnPool>>,
+    pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
 }

 pub struct Discard<'a> {
    conn_id: uuid::Uuid,
-    conn_info: &'a ConnInfo,
-    pool: &'a mut Weak<RwLock<EndpointConnPool>>,
+    pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
 }

 impl Client {
    pub(self) async fn new(
        inner: ClientInner,
-        conn_info: ConnInfo,
-        pool: Weak<RwLock<EndpointConnPool>>,
+        pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
    ) -> Self {
        Self {
            conn_id: inner.conn_id,
            inner: Some(inner),
            span: Span::current(),
-            conn_info,
            pool,
        }
    }
@@ -734,7 +577,6 @@ impl Client {
            inner,
            pool,
            conn_id,
-            conn_info,
            span: _,
        } = self;
        (
@@ -744,7 +586,6 @@ impl Client {
                .inner,
            Discard {
                pool,
-                conn_info,
                conn_id: *conn_id,
            },
        )
@@ -760,14 +601,14 @@ impl Client {

 impl Discard<'_> {
    pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
-        let conn_info = &self.conn_info;
-        if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 {
-            info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
+        if status != ReadyForQueryStatus::Idle {
+            if let Some((conn_info, _)) = self.pool.take() {
+                info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
+            }
        }
    }
    pub fn discard(&mut self) {
-        let conn_info = &self.conn_info;
-        if std::mem::take(self.pool).strong_count() > 0 {
+        if let Some((conn_info, _)) = self.pool.take() {
            info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
        }
    }
@@ -787,17 +628,16 @@ impl Deref for Client {

 impl Drop for Client {
    fn drop(&mut self) {
-        let conn_info = self.conn_info.clone();
        let client = self
            .inner
            .take()
            .expect("client inner should not be removed");
-        if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() {
+        if let Some((conn_info, conn_pool)) = self.pool.take() {
            let current_span = self.span.clone();
            // return connection to the pool
            tokio::task::spawn_blocking(move || {
                let _span = current_span.enter();
-                let _ = EndpointConnPool::put(&conn_pool, &conn_info, client);
+                let _ = conn_pool.put(&conn_info, client);
            });
        }
    }
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -206,7 +206,7 @@ pub async fn handle(
    config: &'static HttpConfig,
 ) -> Result<Response<Body>, ApiError> {
    let result = tokio::time::timeout(
-        config.request_timeout,
+        config.timeout,
        handle_inner(
            config,
            request,
@@ -278,7 +278,7 @@ pub async fn handle(
        Err(_) => {
            let message = format!(
                "HTTP-Connection timed out, execution time exeeded {} seconds",
-                config.request_timeout.as_secs()
+                config.timeout.as_secs()
            );
            error!(message);
            json_response(
@@ -320,8 +320,7 @@ async fn handle_inner(

    // Allow connection pooling only if explicitly requested
    // or if we have decided that http pool is no longer opt-in
-    let allow_pool =
-        !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
+    let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);

    // isolation level, read only and deferrable

@@ -360,7 +359,7 @@ async fn handle_inner(
    let payload: Payload = serde_json::from_slice(&body)?;

    let mut client = conn_pool
-        .get(conn_info, !allow_pool, session_id, peer_addr)
+        .get(&conn_info, !allow_pool, session_id, peer_addr)
        .await?;

    let mut response = Response::builder()
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ authors = []

 [tool.poetry.dependencies]
 python = "^3.9"
-pytest = "^7.4.4"
+pytest = "^7.3.1"
 psycopg2-binary = "^2.9.6"
 typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
@@ -17,7 +17,7 @@ aiopg = "^1.4.0"
 Jinja2 = "^3.0.2"
 types-requests = "^2.31.0.0"
 types-psycopg2 = "^2.9.21.10"
-boto3 = "^1.34.11"
+boto3 = "^1.26.16"
 boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
 moto = {extras = ["server"], version = "^4.1.2"}
 backoff = "^2.2.1"
--- a/s3_scrubber/Cargo.toml
+++ b/s3_scrubber/Cargo.toml
@@ -6,7 +6,6 @@ license.workspace = true

 [dependencies]
 aws-sdk-s3.workspace = true
-aws-smithy-async.workspace = true
 either.workspace = true
 tokio-rustls.workspace = true
 anyhow.workspace = true
@@ -40,5 +39,3 @@ tracing-subscriber.workspace = true
 clap.workspace = true
 tracing-appender = "0.2"
 histogram = "0.7"
-
-futures.workspace = true
--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -15,13 +15,10 @@ use anyhow::Context;
 use aws_config::environment::EnvironmentVariableCredentialsProvider;
 use aws_config::imds::credentials::ImdsCredentialsProvider;
 use aws_config::meta::credentials::CredentialsProviderChain;
-use aws_config::profile::ProfileFileCredentialsProvider;
-use aws_config::retry::RetryConfig;
 use aws_config::sso::SsoCredentialsProvider;
 use aws_config::BehaviorVersion;
-use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep};
+use aws_sdk_s3::config::Region;
 use aws_sdk_s3::{Client, Config};
-use aws_smithy_async::rt::sleep::TokioSleep;

 use clap::ValueEnum;
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
@@ -258,11 +255,6 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
        let chain = CredentialsProviderChain::first_try(
            "env",
            EnvironmentVariableCredentialsProvider::new(),
-        )
-        // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
-        .or_else(
-            "profile-sso",
-            ProfileFileCredentialsProvider::builder().build(),
        );

        // Use SSO if we were given an account ID
@@ -273,7 +265,7 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
                    .account_id(sso_account)
                    .role_name("PowerUserAccess")
                    .start_url("https://neondb.awsapps.com/start")
-                    .region(bucket_region.clone())
+                    .region(Region::from_static("eu-central-1"))
                    .build(),
            ),
            None => chain,
@@ -285,13 +277,9 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Clie
        )
    };

-    let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
-
    let mut builder = Config::builder()
        .behavior_version(BehaviorVersion::v2023_11_09())
        .region(bucket_region)
-        .retry_config(RetryConfig::adaptive().with_max_attempts(3))
-        .sleep_impl(SharedAsyncSleep::from(sleep_impl))
        .credentials_provider(credentials_provider);

    if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") {
--- a/s3_scrubber/src/main.rs
+++ b/s3_scrubber/src/main.rs
@@ -1,4 +1,3 @@
-use pageserver_api::shard::TenantShardId;
 use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use s3_scrubber::scan_metadata::scan_metadata;
 use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};
@@ -35,8 +34,6 @@ enum Command {
    ScanMetadata {
        #[arg(short, long, default_value_t = false)]
        json: bool,
-        #[arg(long = "tenant-id", num_args = 0..)]
-        tenant_ids: Vec<TenantShardId>,
    },
 }

@@ -60,37 +57,35 @@ async fn main() -> anyhow::Result<()> {
    ));

    match cli.command {
-        Command::ScanMetadata { json, tenant_ids } => {
-            match scan_metadata(bucket_config.clone(), tenant_ids).await {
-                Err(e) => {
-                    tracing::error!("Failed: {e}");
-                    Err(e)
+        Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await {
+            Err(e) => {
+                tracing::error!("Failed: {e}");
+                Err(e)
+            }
+            Ok(summary) => {
+                if json {
+                    println!("{}", serde_json::to_string(&summary).unwrap())
+                } else {
+                    println!("{}", summary.summary_string());
                }
-                Ok(summary) => {
-                    if json {
-                        println!("{}", serde_json::to_string(&summary).unwrap())
-                    } else {
-                        println!("{}", summary.summary_string());
-                    }
-                    if summary.is_fatal() {
-                        Err(anyhow::anyhow!("Fatal scrub errors detected"))
-                    } else if summary.is_empty() {
-                        // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                        // scrubber they were likely expecting to scan something, and if we see no timelines
-                        // at all then it's likely due to some configuration issues like a bad prefix
-                        Err(anyhow::anyhow!(
-                            "No timelines found in bucket {} prefix {}",
-                            bucket_config.bucket,
-                            bucket_config
-                                .prefix_in_bucket
-                                .unwrap_or("<none>".to_string())
-                        ))
-                    } else {
-                        Ok(())
-                    }
+                if summary.is_fatal() {
+                    Err(anyhow::anyhow!("Fatal scrub errors detected"))
+                } else if summary.is_empty() {
+                    // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                    // scrubber they were likely expecting to scan something, and if we see no timelines
+                    // at all then it's likely due to some configuration issues like a bad prefix
+                    Err(anyhow::anyhow!(
+                        "No timelines found in bucket {} prefix {}",
+                        bucket_config.bucket,
+                        bucket_config
+                            .prefix_in_bucket
+                            .unwrap_or("<none>".to_string())
+                    ))
+                } else {
+                    Ok(())
                }
            }
-        }
+        },
        Command::FindGarbage {
            node_kind,
            depth,
--- a/s3_scrubber/src/scan_metadata.rs
+++ b/s3_scrubber/src/scan_metadata.rs
@@ -187,17 +187,10 @@ Timeline layer count: {6}
 }

 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
-pub async fn scan_metadata(
-    bucket_config: BucketConfig,
-    tenant_ids: Vec<TenantShardId>,
-) -> anyhow::Result<MetadataSummary> {
+pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<MetadataSummary> {
    let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?;

-    let tenants = if tenant_ids.is_empty() {
-        futures::future::Either::Left(stream_tenants(&s3_client, &target))
-    } else {
-        futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
-    };
+    let tenants = stream_tenants(&s3_client, &target);

    // How many tenants to process in parallel.  We need to be mindful of pageservers
    // accessing the same per tenant prefixes, so use a lower setting than pageservers.
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -4,12 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-default = []
-# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
-# which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints"]
-
 [dependencies]
 async-stream.workspace = true
 anyhow.workspace = true
@@ -22,7 +16,6 @@ chrono.workspace = true
 clap = { workspace = true, features = ["derive"] }
 const_format.workspace = true
 crc32c.workspace = true
-fail.workspace = true
 fs2.workspace = true
 git-version.workspace = true
 hex.workspace = true
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -54,19 +54,6 @@ const ID_FILE_NAME: &str = "safekeeper.id";
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);

-const FEATURES: &[&str] = &[
-    #[cfg(feature = "testing")]
-    "testing",
-];
-
-fn version() -> String {
-    format!(
-        "{GIT_VERSION} failpoints: {}, features: {:?}",
-        fail::has_failpoints(),
-        FEATURES,
-    )
-}
-
 const ABOUT: &str = r#"
 A fleet of safekeepers is responsible for reliably storing WAL received from
 compute, passing it through consensus (mitigating potential computes brain
@@ -180,9 +167,7 @@ async fn main() -> anyhow::Result<()> {
    // getting 'argument cannot be used multiple times' error. This seems to be
    // impossible with pure Derive API, so convert struct to Command, modify it,
    // parse arguments, and then fill the struct back.
-    let cmd = <Args as clap::CommandFactory>::command()
-        .args_override_self(true)
-        .version(version());
+    let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
    let mut matches = cmd.get_matches();
    let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;

--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -12,8 +12,6 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use tokio::fs::File;
 use tokio::io::AsyncReadExt;
-use tokio_util::sync::CancellationToken;
-use utils::failpoint_support::failpoints_handler;

 use std::io::Write as _;
 use tokio::sync::mpsc;
@@ -446,12 +444,6 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
        .data(Arc::new(conf))
        .data(auth)
        .get("/v1/status", |r| request_span(r, status_handler))
-        .put("/v1/failpoints", |r| {
-            request_span(r, move |r| async {
-                let cancel = CancellationToken::new();
-                failpoints_handler(r, cancel).await
-            })
-        })
        // Will be used in the future instead of implicit timeline creation
        .post("/v1/tenant/timeline", |r| {
            request_span(r, timeline_create_handler)
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -17,7 +17,6 @@ use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
 use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
 use serde::{Deserialize, Serialize};
 use tokio::io::{AsyncRead, AsyncWrite};
-use utils::failpoint_support;
 use utils::id::TenantTimelineId;
 use utils::lsn::AtomicLsn;
 use utils::pageserver_feedback::PageserverFeedback;
@@ -392,8 +391,15 @@ impl SafekeeperPostgresHandler {
        // application_name: give only committed WAL (used by pageserver) or all
        // existing WAL (up to flush_lsn, used by walproposer or peer recovery).
        // The second case is always driven by a consensus leader which term
-        // must be supplied.
-        let end_watch = if term.is_some() {
+        // must generally be also supplied. However we're sloppy to do this in
+        // walproposer recovery which will be removed soon. So TODO is to make
+        // it not Option'al then.
+        //
+        // Fetching WAL without term in recovery creates a small risk of this
+        // WAL getting concurrently garbaged if another compute rises which
+        // collects majority and starts fixing log on this safekeeper itself.
+        // That's ok as (old) proposer will never be able to commit such WAL.
+        let end_watch = if self.is_walproposer_recovery() {
            EndWatch::Flush(tli.get_term_flush_lsn_watch_rx())
        } else {
            EndWatch::Commit(tli.get_commit_lsn_watch_rx())
@@ -529,19 +535,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
            );

            // try to send as much as available, capped by MAX_SEND_SIZE
-            let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64;
-            // if we went behind available WAL, back off
-            if chunk_end_pos >= self.end_pos {
-                chunk_end_pos = self.end_pos;
-            } else {
-                // If sending not up to end pos, round down to page boundary to
-                // avoid breaking WAL record not at page boundary, as protocol
-                // demands. See walsender.c (XLogSendPhysical).
-                chunk_end_pos = chunk_end_pos
-                    .checked_sub(chunk_end_pos.block_offset())
-                    .unwrap();
-            }
-            let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize;
+            let mut send_size = self
+                .end_pos
+                .checked_sub(self.start_pos)
+                .context("reading wal without waiting for it first")?
+                .0 as usize;
+            send_size = min(send_size, self.send_buf.len());
            let send_buf = &mut self.send_buf[..send_size];
            let send_size: usize;
            {
@@ -552,8 +551,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                } else {
                    None
                };
-                // Read WAL into buffer. send_size can be additionally capped to
-                // segment boundary here.
+                // read wal into buffer
                send_size = self.wal_reader.read(send_buf).await?
            };
            let send_buf = &send_buf[..send_size];
@@ -568,11 +566,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                }))
                .await?;

-            if let Some(appname) = &self.appname {
-                if appname == "replica" {
-                    failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep");
-                }
-            }
            trace!(
                "sent {} bytes of WAL {}-{}",
                send_size,
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -565,9 +565,6 @@ impl WalReader {
        })
    }

-    /// Read WAL at current position into provided buf, returns number of bytes
-    /// read. It can be smaller than buf size only if segment boundary is
-    /// reached.
    pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
        // If this timeline is new, we may not have a full segment yet, so
        // we pad the first bytes of the timeline's first WAL segment with 0s
--- a/scripts/sk_migrate/restart_ep.sh
+++ b/scripts/sk_migrate/restart_ep.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Restart every endpoint listed (one id per line) in endpoints_cplane.txt via the console admin API.
+# export NEON_API_KEY=
+
+while IFS= read -r ENDPOINT
+do
+    echo "$ENDPOINT"
+    # curl -X POST -H "Authorization: Bearer $NEON_PROD_KEY" -H "Accept: application/json" -H "Content-Type: application/json"  https://console.neon.tech/regions/console/api/v1/admin/endpoints/$ENDPOINT/restart
+    curl -X POST -H "Authorization: Bearer $NEON_API_KEY" -H "Accept: application/json" -H "Content-Type: application/json"  "https://console.neon.tech/regions/aws-us-east-2/api/v1/admin/endpoints/$ENDPOINT/restart"
+done < endpoints_cplane.txt
--- a/scripts/sk_migrate/sk_migrate.py
+++ b/scripts/sk_migrate/sk_migrate.py
@@ -0,0 +1,137 @@
+import argparse
+import sys
+import psycopg2
+import psycopg2.extras
+import os
+import requests
+
+def migrate_project(conn, from_sk: dict[str, any], to_sk: dict[str, any], project_id: str, dry_run=True):
+    """Migrate one project from safekeeper `from_sk` to `to_sk` (rows of the `safekeepers` table).
+
+    For each non-deleted branch, POSTs /v1/pull_timeline to `to_sk` with the project's other safekeepers
+    as `http_hosts`, then repoints the project's `projects_safekeepers` row. When `dry_run` is true (the
+    default) the POSTs and the UPDATE are skipped. Reads the module-level `safekeepers` dict. Returns None,
+    early (without the UPDATE) if the project is missing or a pull fails with an unrecognized error."""
+    print("###############################################################")
+    with conn.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cur:
+        cur.execute("SELECT * FROM projects WHERE id = %s", (project_id,))
+        project = cur.fetchone()
+
+    if project is None:
+        print("Project with id {} does not exist".format(project_id))
+        return
+    assert project['deleted'] == False, "Project with id {} is deleted".format(project_id)
+    # The project must currently be on from_sk and must not already be on to_sk.
+    with conn.cursor() as cur:
+        cur.execute("SELECT safekeeper_id FROM projects_safekeepers WHERE project_id = %s", (project_id, ))
+        sk_ids = list(map(lambda x: x[0], cur.fetchall()))
+        assert from_sk['id'] in sk_ids
+        assert to_sk['id'] not in sk_ids
+
+    with conn.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cur:
+        cur.execute("SELECT * FROM branches WHERE project_id = %s AND deleted = 'f'", (project_id, ))
+        branches = cur.fetchall()
+
+    for branch in branches:
+        if branch['deleted'] != False:
+            continue
+
+        tenant_id = project['tenant_id']
+        timeline_id = branch['timeline_id']
+        print("tenant_id: {}, timeline_id: {}".format(tenant_id, timeline_id))
+        print(f"Migrating from {from_sk['host']} to {to_sk['host']}, project={project_id}, branch={branch['id']}, deleted={branch['deleted']}")
+        print(list(sk_ids))
+        # HTTP base URLs of the project's safekeepers other than from_sk, sent as `http_hosts` below.
+        sk_hosts = list(map(
+            lambda x: f"http://{safekeepers[x]['host']}:{safekeepers[x]['http_port']}",
+            filter(lambda x: x != from_sk['id'], sk_ids)
+        ))
+
+        # make HTTP request to /pull_timeline
+        # url = f"http://{to_sk['host']}:{to_sk['http_port']}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
+        url = f"http://{to_sk['host']}:{to_sk['http_port']}/v1/pull_timeline"
+        body = {
+            "tenant_id": str(tenant_id),
+            "timeline_id": str(timeline_id),
+            "http_hosts": sk_hosts,
+        }
+        print(body)
+        print("Making HTTP request to {}".format(url), flush=True)
+        if not dry_run:
+            response = requests.post(url, json=body)
+        # response = requests.get(url)
+            if response.status_code != 200 and f"error decoding response body: missing field `tenant_id` at line 1 column 104" in response.text:
+                print(f"WARN: Skipping branch {branch['id']} because it's empty on all safekeepers")
+                continue
+            if response.status_code != 200 and f"Timeline {timeline_id} already exists" in response.text:
+                print(f"WARN: Skipping timeline {timeline_id} because it is already exists (was migrated earlier)")
+                continue
+            # Any other non-200 response aborts this project before the database is updated.
+            if response.status_code != 200:
+                print("ERROR: {}".format(response.text))
+                return
+            print(response.text)
+
+    print(f"Updating safekeeper {from_sk['id']} -> {to_sk['id']} for project={project_id} in the database")
+    if not dry_run:
+        with conn.cursor() as cur:
+            cur.execute("UPDATE projects_safekeepers SET safekeeper_id = %s WHERE project_id = %s AND safekeeper_id = %s RETURNING *", (to_sk['id'], project_id, from_sk['id']))
+            print(cur.fetchone())
+            conn.commit()
+
+def find_projects(sk_from_id: int):
+    """Return the ids of all non-deleted projects assigned to safekeeper `sk_from_id` (uses the global `conn`)."""
+    with conn.cursor() as cursor:
+        cursor.execute("SELECT p.id FROM projects p, projects_safekeepers ps WHERE ps.project_id = p.id AND NOT p.deleted AND ps.safekeeper_id = %s", (sk_from_id, ))
+        return [record[0] for record in cursor.fetchall()]
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='migrate sk')
+    parser.add_argument("-d", help="database URL", type=str, required=True)
+    parser.add_argument("--from-sk", help="from sk id as in the cplane db", type=int, required=True)
+    parser.add_argument("--to-sk", help="to sk id as in the cplane db", type=int, required=True)
+    parser.add_argument("--not-dry-run", help="actually call pull_timeline and update the database (default: dry run)", action='store_true')
+    parser.add_argument("--project-id", help="project to migrate", type=str, default=None)
+    args = parser.parse_args()
+
+    # Dry run (no pull_timeline requests, no database updates) unless --not-dry-run is given
+    dry_run = not args.not_dry_run
+
+    # Connect to postgresql database; find_projects() reads `conn` as a module-level global
+    conn = psycopg2.connect(args.d)
+
+    # Safekeeper rows keyed by id; migrate_project() reads this dict as a module-level global
+    safekeepers = dict()
+
+    # Fetch all rows of the "safekeepers" table and index them by id
+    cur = conn.cursor(cursor_factory = psycopg2.extras.RealDictCursor)
+    cur.execute("SELECT * FROM safekeepers")
+    rows = cur.fetchall()
+    cur.close()
+    for row in rows:
+        safekeepers[row['id']] = row
+
+    # Print all safekeepers
+    # print(safekeepers)
+
+    # The source safekeeper must exist and be marked inactive
+    assert args.from_sk in safekeepers, "Safekeeper with id {} does not exist".format(args.from_sk)
+    from_sk_hostname = safekeepers[args.from_sk]['host']
+    assert safekeepers[args.from_sk]['active'] == False, "Safekeeper with id {} should be inactive".format(args.from_sk)
+
+    # The destination safekeeper must exist and be marked active
+    assert args.to_sk in safekeepers, "Safekeeper with id {} does not exist".format(args.to_sk)
+    to_sk_hostname = safekeepers[args.to_sk]['host']
+    assert safekeepers[args.to_sk]['active'] == True, "Safekeeper with id {} should be active".format(args.to_sk)
+
+    print(f"migrating from id {args.from_sk} {from_sk_hostname} to {args.to_sk} {to_sk_hostname}")
+
+    if args.project_id is not None:
+        project_ids = [args.project_id]
+    else:
+        project_ids = find_projects(args.from_sk)
+    print(project_ids)
+
+    for project_id in project_ids:
+        migrate_project(conn, safekeepers[args.from_sk], safekeepers[args.to_sk], project_id, dry_run=dry_run)
--- a/Show More
+++ b/Show More