diff --git a/.config/nextest.toml b/.config/nextest.toml new file mode 100644 index 0000000000..8bccd51c6d --- /dev/null +++ b/.config/nextest.toml @@ -0,0 +1,2 @@ +[profile.default] +slow-timeout = "1m" diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index e2f15d96db..8bf12c31b1 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -11,7 +11,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 3 * * *' # run once a day, timezone is utc + - cron: '0 3 * * *' # run once a day, timezone is utc workflow_dispatch: # adds ability to run this manually inputs: @@ -23,6 +23,21 @@ on: type: boolean description: 'Publish perf report. If not set, the report will be published only for the main branch' required: false + collect_olap_explain: + type: boolean + description: 'Collect EXPLAIN ANALYZE for OLAP queries. If not set, EXPLAIN ANALYZE will not be collected' + required: false + default: false + collect_pg_stat_statements: + type: boolean + description: 'Collect pg_stat_statements for OLAP queries. If not set, pg_stat_statements will not be collected' + required: false + default: false + run_AWS_RDS_AND_AURORA: + type: boolean + description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch' + required: false + default: false defaults: run: @@ -113,6 +128,8 @@ jobs: # - neon-captest-reuse: Reusing existing project # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage + env: + RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} runs-on: ubuntu-latest outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} @@ -152,7 +169,7 @@ jobs: ] }' - if [ "$(date +%A)" = "Saturday" ]; then + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, { "platform": "rds-aurora" }]') fi @@ -171,9 +188,9 @@ jobs: ] }' - if [ "$(date +%A)" = "Saturday" ]; then + if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + { "platform": "rds-aurora", "scale": "10" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -337,6 +354,8 @@ jobs: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install DEFAULT_PG_VERSION: 14 TEST_OUTPUT: /tmp/test_output + TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }} + TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }} BUILD_TYPE: remote SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} PLATFORM: ${{ matrix.platform }} @@ -399,6 +418,8 @@ jobs: env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain || 'false' }} + TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements || 'false' }} BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} TEST_OLAP_SCALE: 10 diff --git 
a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml new file mode 100644 index 0000000000..e401b2f418 --- /dev/null +++ b/.github/workflows/build_and_push_docker_image.yml @@ -0,0 +1,105 @@ +name: Build and Push Docker Image + +on: + workflow_call: + inputs: + dockerfile-path: + required: true + type: string + image-name: + required: true + type: string + outputs: + build-tools-tag: + description: "tag generated for build tools" + value: ${{ jobs.tag.outputs.build-tools-tag }} + +jobs: + check-if-build-tools-dockerfile-changed: + runs-on: ubuntu-latest + outputs: + docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }} + steps: + - name: Check if Dockerfile.buildtools has changed + id: dockerfile + run: | + if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then + echo "docker_file_changed=false" >> $GITHUB_OUTPUT + exit + fi + updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only) + if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then + echo "docker_file_changed=true" >> $GITHUB_OUTPUT + fi + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + tag: + runs-on: ubuntu-latest + needs: [ check-if-build-tools-dockerfile-changed ] + outputs: + build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} + + steps: + - name: Get buildtools tag + env: + DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }} + run: | + if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then + IMAGE_TAG=$GITHUB_RUN_ID + else + IMAGE_TAG=pinned + fi + + echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT + shell: bash + id: buildtools-tag + + kaniko: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + needs: [ tag, check-if-build-tools-dockerfile-changed ] + runs-on: [ self-hosted, dev, x64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 + + kaniko-arm: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + needs: [ tag, check-if-build-tools-dockerfile-changed ] + runs-on: [ self-hosted, dev, arm64 ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Configure ECR login + run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json + + - name: Kaniko build + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + + manifest: + if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' + name: 'manifest' + runs-on: [ self-hosted, dev, x64 ] + needs: + - tag + - kaniko + - kaniko-arm + - 
check-if-build-tools-dockerfile-changed + + steps: + - name: Create manifest + run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 + + - name: Push manifest + run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 693ed1a66f..78deff6e85 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -44,7 +44,6 @@ jobs: exit 1 - tag: needs: [ check-permissions ] runs-on: [ self-hosted, gen3, small ] @@ -74,11 +73,19 @@ jobs: shell: bash id: build-tag - check-codestyle-python: + build-buildtools-image: needs: [ check-permissions ] + uses: ./.github/workflows/build_and_push_docker_image.yml + with: + dockerfile-path: Dockerfile.buildtools + image-name: build-tools + secrets: inherit + + check-codestyle-python: + needs: [ check-permissions, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -108,10 +115,10 @@ jobs: run: poetry run mypy . check-codestyle-rust: - needs: [ check-permissions ] + needs: [ check-permissions, build-buildtools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -175,10 +182,10 @@ jobs: run: cargo deny check --hide-inclusion-graph build-neon: - needs: [ check-permissions, tag ] + needs: [ check-permissions, tag, build-buildtools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -332,16 +339,16 @@ jobs: run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - name: Run cargo test + - name: Run rust tests run: | - ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 + ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)' # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region @@ -351,7 +358,7 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ 
vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure + ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)' - name: Install rust binaries run: | @@ -408,10 +415,10 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - needs: [ check-permissions, build-neon, tag ] + needs: [ check-permissions, build-neon, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb strategy: @@ -447,10 +454,10 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - needs: [ check-permissions, build-neon ] + needs: [ check-permissions, build-neon, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} # Default shared memory is 64mb options: --init --shm-size=512mb if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') @@ -479,12 +486,12 @@ jobs: # while coverage is currently collected for the debug ones create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks ] + needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init steps: @@ -526,11 +533,10 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests ] - + needs: [ check-permissions, regress-tests, build-buildtools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} options: --init strategy: fail-fast: false @@ -694,7 +700,7 @@ jobs: }" neon-image: - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: @@ -733,6 +739,7 @@ jobs: --context . 
--build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} + --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} @@ -743,7 +750,7 @@ jobs: compute-tools-image: runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] container: gcr.io/kaniko-project/executor:v1.9.2-debug defaults: run: @@ -778,6 +785,7 @@ jobs: --context . --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} @@ -788,7 +796,7 @@ jobs: run: rm -rf ~/.ecr compute-node-image: - needs: [ check-permissions, tag ] + needs: [ check-permissions, build-buildtools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: image: gcr.io/kaniko-project/executor:v1.9.2-debug @@ -836,6 +844,7 @@ jobs: --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} + --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} @@ -857,7 +866,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.19.0 + VM_BUILDER_VERSION: v0.21.0 steps: - name: Checkout diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index b1ea5e4f74..c6c2b7386a 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -218,7 +218,7 @@ jobs: # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml new file mode 100644 index 0000000000..88bab797b7 --- /dev/null +++ b/.github/workflows/update_build_tools_image.yml @@ -0,0 +1,130 @@ +name: 'Update build tools image tag' + +# This workflow it used to update tag of build tools in ECR. +# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image. 
+ +on: + workflow_dispatch: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + to-tag: + description: 'Destination tag' + required: true + type: string + default: 'pinned' + +defaults: + run: + shell: bash -euo pipefail {0} + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +permissions: {} + +jobs: + tag-image: + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + outputs: + next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} + prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Get source image digest + id: next-digest + run: | + NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" + exit 1 + fi + + echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" + echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT + + - name: Get destination image digest (if already exists) + id: prev-digest + run: | + PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) + if [ -z "${PREV_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" + else + echo >&2 "Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}" + + echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT + fi + + - name: Tag image + run: | + crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" + + rollback-tag-image: + needs: tag-image + if: ${{ !success() }} + + runs-on: [ self-hosted, gen3, small ] + container: golang:1.19-bullseye + + env: + IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: ${{ inputs.to-tag }} + + steps: + - name: Install Crane & ECR helper + run: | + go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Restore previous tag if needed + run: | + NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" + PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}" + + if [ -z "${NEXT_DIGEST}" ]; then + echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" + exit 0 + fi + + if [ -z "${PREV_DIGEST}" ]; then + # I guess we should delete the tag here/untag the image, but crane does not support it + # - https://github.com/google/go-containerregistry/issues/999 + + echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" + + exit 0 + fi + + CURRENT_DIGEST=$(crane digest 
"${IMAGE}:${TO_TAG}") + if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then + crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" + + echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" + else + echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" + fi diff --git a/.gitignore b/.gitignore index c5fc121ac2..3f4495c9e7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ test_output/ .vscode .idea +neon.iml /.neon /integration_tests/.neon diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2692684006..b318c295a3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -70,3 +70,17 @@ We're using the following approach to make it work: - The label gets removed automatically, so to run CI again with new changes, the label should be added again (after the review) For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) + +## How do I add the "pinned" tag to an buildtools image? +We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation. + +You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml, +or using GitHub CLI: + +```bash +gh workflow -R neondatabase/neon run update_build_tools_image.yml \ + -f from-tag=6254913013 \ + -f to-tag=pinned \ + +# Default `-f to-tag` is `pinned`, so the parameter can be omitted. +``` \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index f931fd6c29..abd87dc0da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -190,9 +190,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.0" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11" +checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5" dependencies = [ "flate2", "futures-core", @@ -1168,6 +1168,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", + "rust-ini", "serde", "serde_json", "tar", @@ -1201,6 +1202,26 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f" +[[package]] +name = "const-random" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.11", + "once_cell", + "tiny-keccak", +] + [[package]] name = "const_fn" version = "0.4.9" @@ -1433,6 +1454,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "crypto-bigint" version = "0.4.9" @@ -1575,6 +1602,15 @@ dependencies = [ "syn 2.0.32", ] +[[package]] +name = "dlv-list" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f" +dependencies = [ + "const-random", +] + [[package]] name = "dyn-clone" version = "1.0.14" @@ 
-2106,6 +2142,20 @@ dependencies = [ "hashbrown 0.13.2", ] +[[package]] +name = "hdrhistogram" +version = "7.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" +dependencies = [ + "base64 0.21.1", + "byteorder", + "crossbeam-channel", + "flate2", + "nom", + "num-traits", +] + [[package]] name = "heapless" version = "0.8.0" @@ -2487,13 +2537,14 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "8.3.0" +version = "9.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" +checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ "base64 0.21.1", - "pem 1.1.1", - "ring 0.16.20", + "js-sys", + "pem 3.0.3", + "ring 0.17.6", "serde", "serde_json", "simple_asn1", @@ -3028,6 +3079,16 @@ dependencies = [ "tokio-stream", ] +[[package]] +name = "ordered-multimap" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f" +dependencies = [ + "dlv-list", + "hashbrown 0.14.0", +] + [[package]] name = "os_info" version = "3.7.0" @@ -3056,6 +3117,28 @@ dependencies = [ "sha2", ] +[[package]] +name = "pagebench" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "futures", + "hdrhistogram", + "humantime", + "humantime-serde", + "pageserver", + "pageserver_api", + "pageserver_client", + "rand 0.8.5", + "serde", + "serde_json", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "pagectl" version = "0.1.0" @@ -3145,6 +3228,7 @@ dependencies = [ "tokio", "tokio-io-timeout", "tokio-postgres", + "tokio-stream", "tokio-tar", "tokio-util", "toml_edit", @@ -3182,11 +3266,19 @@ dependencies = [ name = "pageserver_client" version = "0.1.0" dependencies = [ + "anyhow", "async-trait", + "bytes", + "futures", "pageserver_api", + "postgres", "reqwest", "serde", "thiserror", + "tokio", + "tokio-postgres", + "tokio-stream", + "tokio-util", "utils", "workspace_hack", ] @@ -3282,18 +3374,19 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "pem" -version = "1.1.1" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8835c273a76a90455d7344889b0964598e3316e2a79ede8e36f16bdcf2228b8" +checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" dependencies = [ - "base64 0.13.1", + "base64 0.21.1", + "serde", ] [[package]] name = "pem" -version = "2.0.1" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" +checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" dependencies = [ "base64 0.21.1", "serde", @@ -4169,6 +4262,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "rust-ini" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0698206bcb8882bf2a9ecb4c1e7785db57ff052297085a6efd4fe42302068a" +dependencies = [ + "cfg-if", + "ordered-multimap", +] + [[package]] name = "rustc-demangle" version = "0.1.23" @@ -4419,12 +4522,12 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "sct" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", + "ring 0.17.6", + "untrusted 0.9.0", ] [[package]] @@ -5123,6 +5226,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -6290,6 +6402,7 @@ dependencies = [ "futures-io", "futures-sink", "futures-util", + "getrandom 0.2.11", "hex", "hmac", "hyper", @@ -6301,6 +6414,7 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", + "once_cell", "prost", "rand 0.8.5", "regex", @@ -6403,30 +6517,28 @@ checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" [[package]] name = "zstd" -version = "0.12.4" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "6.0.6" +version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e" dependencies = [ - "libc", "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" +version = "2.0.9+zstd.1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" dependencies = [ "cc", - "libc", "pkg-config", ] diff --git a/Cargo.toml b/Cargo.toml index b44544d626..5de636778a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "pageserver", "pageserver/ctl", "pageserver/client", + "pageserver/pagebench", "proxy", "safekeeper", "storage_broker", @@ -79,6 +80,7 @@ futures-util = "0.3" git-version = "0.3" hashbrown = "0.13" hashlink = "0.8.1" +hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" @@ -91,7 +93,7 @@ hyper-tungstenite = "0.11" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" -jsonwebtoken = "8" +jsonwebtoken = "9" libc = "0.2" md5 = "0.7.0" memoffset = "0.8" diff --git a/Dockerfile b/Dockerfile index 60de9cfa3e..5d5fde4f14 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ ### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in the real deployments. 
ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned # Build Postgres diff --git a/Dockerfile.buildtools b/Dockerfile.buildtools new file mode 100644 index 0000000000..c2fcd8841e --- /dev/null +++ b/Dockerfile.buildtools @@ -0,0 +1,166 @@ +FROM debian:bullseye-slim + +# Add nonroot user +RUN useradd -ms /bin/bash nonroot -b /home +SHELL ["/bin/bash", "-c"] + +# System deps +RUN set -e \ + && apt update \ + && apt install -y \ + autoconf \ + automake \ + bison \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + flex \ + git \ + gnupg \ + gzip \ + jq \ + libcurl4-openssl-dev \ + libbz2-dev \ + libffi-dev \ + liblzma-dev \ + libncurses5-dev \ + libncursesw5-dev \ + libpq-dev \ + libreadline-dev \ + libseccomp-dev \ + libsqlite3-dev \ + libssl-dev \ + libstdc++-10-dev \ + libtool \ + libxml2-dev \ + libxmlsec1-dev \ + libxxhash-dev \ + lsof \ + make \ + netcat \ + net-tools \ + openssh-client \ + parallel \ + pkg-config \ + unzip \ + wget \ + xz-utils \ + zlib1g-dev \ + zstd \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# protobuf-compiler (protoc) +ENV PROTOC_VERSION 25.1 +RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ + && unzip -q protoc.zip -d protoc \ + && mv protoc/bin/protoc /usr/local/bin/protoc \ + && mv protoc/include/google /usr/local/include/google \ + && rm -rf protoc.zip protoc + +# LLVM +ENV LLVM_VERSION=17 +RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ + && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && apt update \ + && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ + && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# PostgreSQL 14 +RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \ + && echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \ + && apt update \ + && apt install -y postgresql-client-14 \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# AWS CLI +RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ + && unzip -q awscliv2.zip \ + && ./aws/install \ + && rm awscliv2.zip + +# Mold: A Modern Linker +ENV MOLD_VERSION v2.4.0 +RUN set -e \ + && git clone https://github.com/rui314/mold.git \ + && mkdir mold/build \ + && cd mold/build \ + && git checkout ${MOLD_VERSION} \ + && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang++ .. \ + && cmake --build . -j $(nproc) \ + && cmake --install . \ + && cd .. 
\ + && rm -rf mold + +# LCOV +# Build lcov from a fork: +# It includes several bug fixes on top on v2.0 release (https://github.com/linux-test-project/lcov/compare/v2.0...master) +# And patches from us: +# - Generates json file with code coverage summary (https://github.com/neondatabase/lcov/commit/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz) +RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JSON::XS Memory::Process Time::HiRes JSON; do yes | perl -MCPAN -e "CPAN::Shell->notest('install', '$package')"; done \ + && wget https://github.com/neondatabase/lcov/archive/426e7e7a22f669da54278e9b55e6d8caabd00af0.tar.gz -O lcov.tar.gz \ + && echo "61a22a62e20908b8b9e27d890bd0ea31f567a7b9668065589266371dcbca0992 lcov.tar.gz" | sha256sum --check \ + && mkdir -p lcov && tar -xzf lcov.tar.gz -C lcov --strip-components=1 \ + && cd lcov \ + && make install \ + && rm -rf ../lcov.tar.gz + +# Switch to nonroot user +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Python +ENV PYTHON_VERSION=3.9.2 \ + PYENV_ROOT=/home/nonroot/.pyenv \ + PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH +RUN set -e \ + && cd $HOME \ + && curl -sSO https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer \ + && chmod +x pyenv-installer \ + && ./pyenv-installer \ + && export PYENV_ROOT=/home/nonroot/.pyenv \ + && export PATH="$PYENV_ROOT/bin:$PATH" \ + && export PATH="$PYENV_ROOT/shims:$PATH" \ + && pyenv install ${PYTHON_VERSION} \ + && pyenv global ${PYTHON_VERSION} \ + && python --version \ + && pip install --upgrade pip \ + && pip --version \ + && pip install pipenv wheel poetry + +# Switch to nonroot user (again) +USER nonroot:nonroot +WORKDIR /home/nonroot + +# Rust +# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) +ENV RUSTC_VERSION=1.74.0 +ENV RUSTUP_HOME="/home/nonroot/.rustup" +ENV PATH="/home/nonroot/.cargo/bin:${PATH}" +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ + chmod +x rustup-init && \ + ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ + rm rustup-init && \ + export PATH="$HOME/.cargo/bin:$PATH" && \ + . 
"$HOME/.cargo/env" && \ + cargo --version && rustup --version && \ + rustup component add llvm-tools-preview rustfmt clippy && \ + cargo install --git https://github.com/paritytech/cachepot && \ + cargo install rustfilt && \ + cargo install cargo-hakari && \ + cargo install cargo-deny && \ + cargo install cargo-hack && \ + cargo install cargo-nextest && \ + rm -rf /home/nonroot/.cargo/registry && \ + rm -rf /home/nonroot/.cargo/git +ENV RUSTC_WRAPPER=cachepot + +# Show versions +RUN whoami \ + && python --version \ + && pip --version \ + && cargo --version --verbose \ + && rustup --version --verbose \ + && rustc --version --verbose \ + && clang --version diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 03280586f8..14ba1b5b9a 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -1,6 +1,6 @@ ARG PG_VERSION ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG @@ -48,7 +48,29 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \ + # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser. + # In vanilla postgres this function is limited to Postgres role superuser. + # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases. + # We could add the additional grant statements to the postgres repository but it would be hard to maintain, + # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, + # so we do it here. + old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ + # the first loop is for pg_stat_statement extension version <= 1.6 + for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ + filename=$(basename "$file"); \ + if echo "$old_list" | grep -q -F "$filename"; then \ + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ + fi; \ + done; \ + # the second loop is for pg_stat_statement extension versions >= 1.7, + # where pg_stat_statement_reset() got 3 additional arguments + for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ + filename=$(basename "$file"); \ + if ! 
echo "$old_list" | grep -q -F "$filename"; then \ + echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ + fi; \ + done ######################################################################################### # @@ -569,6 +591,23 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control +######################################################################################### +# +# Layer "pg-semver-pg-build" +# compile pg_semver extension +# +######################################################################################### +FROM build-deps AS pg-semver-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ + echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ + mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control + ######################################################################################### # # Layer "pg-embedding-pg-build" @@ -768,6 +807,7 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql COPY pgxn/ pgxn/ diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools index 3066e3f7ca..cc305cc556 100644 --- a/Dockerfile.compute-tools +++ b/Dockerfile.compute-tools @@ -1,7 +1,7 @@ # First transient image to build compute_tools binaries # NB: keep in sync with rust image version in .github/workflows/build_and_test.yml ARG REPOSITORY=neondatabase -ARG IMAGE=rust +ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG diff --git a/README.md b/README.md index 3e3123f5ee..98af1edee6 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,14 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev openssl python-poetry lsof libicu-dev +libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ - protobuf-devel libcurl-devel openssl poetry lsof libicu-devel + protobuf-devel libcurl-devel openssl poetry lsof libicu-devel libpq-devel python3-devel \ + libffi-devel ``` * On Arch based systems, these packages are needed: ```bash diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 47378f1910..142fa08495 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -37,5 +37,6 @@ 
workspace_hack.workspace = true toml_edit.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" } -zstd = "0.12.4" +zstd = "0.13" bytes = "1.0" +rust-ini = "0.20.0" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index ce7345d5be..436db59088 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -31,7 +31,9 @@ //! -C 'postgresql://cloud_admin@localhost/postgres' \ //! -S /var/db/postgres/specs/current.json \ //! -b /usr/local/bin/postgres \ -//! -r http://pg-ext-s3-gateway +//! -r http://pg-ext-s3-gateway \ +//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' +//! --pgbouncer-ini-path /etc/pgbouncer.ini \ //! ``` //! use std::collections::HashMap; @@ -99,6 +101,9 @@ fn main() -> Result<()> { let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let pgbouncer_connstr = matches.get_one::("pgbouncer-connstr"); + let pgbouncer_ini_path = matches.get_one::("pgbouncer-ini-path"); + // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current // tracing context. @@ -209,6 +214,8 @@ fn main() -> Result<()> { ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), ext_download_progress: RwLock::new(HashMap::new()), build_tag, + pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()), + pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()), }; let compute = Arc::new(compute_node); @@ -493,6 +500,23 @@ fn cli() -> clap::Command { ) .value_name("FILECACHE_CONNSTR"), ) + .arg( + Arg::new("pgbouncer-connstr") + .long("pgbouncer-connstr") + .default_value( + "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable", + ) + .value_name("PGBOUNCER_CONNSTR"), + ) + .arg( + Arg::new("pgbouncer-ini-path") + .long("pgbouncer-ini-path") + // Note: this doesn't match current path for pgbouncer.ini. + // Until we fix it, we need to pass the path explicitly + // or this will be effectively no-op. + .default_value("/etc/pgbouncer.ini") + .value_name("PGBOUNCER_INI_PATH"), + ) } #[test] diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index b39a800f14..cd7be0520e 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -7,6 +7,7 @@ use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; use std::sync::{Condvar, Mutex, RwLock}; +use std::thread; use std::time::Instant; use anyhow::{Context, Result}; @@ -64,6 +65,10 @@ pub struct ComputeNode { // key: ext_archive_name, value: started download time, download_completed? 
pub ext_download_progress: RwLock, bool)>>, pub build_tag: String, + // connection string to pgbouncer to change settings + pub pgbouncer_connstr: Option, + // path to pgbouncer.ini to change settings + pub pgbouncer_ini_path: Option, } // store some metrics about download size that might impact startup time @@ -737,6 +742,31 @@ impl ComputeNode { pub fn reconfigure(&self) -> Result<()> { let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; + if let Some(connstr) = &self.pgbouncer_connstr { + info!("tuning pgbouncer with connstr: {:?}", connstr); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + + // Spawn a thread to do the tuning, + // so that we don't block the main thread that starts Postgres. + let pgbouncer_settings = spec.pgbouncer_settings.clone(); + let connstr_clone = connstr.clone(); + let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let _handle = thread::spawn(move || { + let res = rt.block_on(tune_pgbouncer( + pgbouncer_settings, + &connstr_clone, + pgbouncer_ini_path, + )); + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + } + }); + } + // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); @@ -791,6 +821,32 @@ impl ComputeNode { pspec.timeline_id, ); + // tune pgbouncer + if let Some(connstr) = &self.pgbouncer_connstr { + info!("tuning pgbouncer with connstr: {:?}", connstr); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + + // Spawn a thread to do the tuning, + // so that we don't block the main thread that starts Postgres. + let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone(); + let connstr_clone = connstr.clone(); + let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let _handle = thread::spawn(move || { + let res = rt.block_on(tune_pgbouncer( + pgbouncer_settings, + &connstr_clone, + pgbouncer_ini_path, + )); + if let Err(err) = res { + error!("error while tuning pgbouncer: {err:?}"); + } + }); + } + info!( "start_compute spec.remote_extensions {:?}", pspec.spec.remote_extensions diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index b79e516650..0b0e137c03 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -9,9 +9,11 @@ use std::process::Child; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; +use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; -use tracing::{debug, instrument}; +use tokio_postgres::NoTls; +use tracing::{debug, error, info, instrument}; use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; @@ -359,3 +361,68 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> { Ok(()) } + +/// Update pgbouncer.ini with provided options +pub fn update_pgbouncer_ini( + pgbouncer_config: HashMap, + pgbouncer_ini_path: &str, +) -> Result<()> { + let mut conf = Ini::load_from_file(pgbouncer_ini_path)?; + let section = conf.section_mut(Some("pgbouncer")).unwrap(); + + for (option_name, value) in pgbouncer_config.iter() { + section.insert(option_name, value); + } + + conf.write_to_file(pgbouncer_ini_path)?; + Ok(()) +} + +/// Tune pgbouncer. +/// 1. Apply new config using pgbouncer admin console +/// 2. 
Add new values to pgbouncer.ini to preserve them after restart +pub async fn tune_pgbouncer( + pgbouncer_settings: Option>, + pgbouncer_connstr: &str, + pgbouncer_ini_path: Option, +) -> Result<()> { + if let Some(pgbouncer_config) = pgbouncer_settings { + // Apply new config + let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await; + let (client, connection) = connect_result.unwrap(); + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + for (option_name, value) in pgbouncer_config.iter() { + info!( + "Applying pgbouncer setting change: {} = {}", + option_name, value + ); + let query = format!("SET {} = {}", option_name, value); + + let result = client.simple_query(&query).await; + + info!("Applying pgbouncer setting change: {}", query); + info!("pgbouncer setting change result: {:?}", result); + + if let Err(err) = result { + // Don't fail on error, just print it into log + error!( + "Failed to apply pgbouncer setting change: {}, {}", + query, err + ); + }; + } + + // save values to pgbouncer.ini + // so that they are preserved after pgbouncer restart + if let Some(pgbouncer_ini_path) = pgbouncer_ini_path { + update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?; + } + } + + Ok(()) +} diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 20299c8fde..d545858dc2 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -370,33 +370,49 @@ pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Cli Ok(()) } +fn reassign_owned_objects_in_one_db( + conf: Config, + role_name: &PgIdent, + db_owner: &PgIdent, +) -> Result<()> { + let mut client = conf.connect(NoTls)?; + + // This will reassign all dependent objects to the db owner + let reassign_query = format!( + "REASSIGN OWNED BY {} TO {}", + role_name.pg_quote(), + db_owner.pg_quote() + ); + info!( + "reassigning objects owned by '{}' in db '{}' to '{}'", + role_name, + conf.get_dbname().unwrap_or(""), + db_owner + ); + client.simple_query(&reassign_query)?; + + // This now will only drop privileges of the role + let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); + client.simple_query(&drop_query)?; + Ok(()) +} + // Reassign all owned objects in all databases to the owner of the database. fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> { for db in &spec.cluster.databases { if db.owner != *role_name { let mut conf = Config::from_str(connstr)?; conf.dbname(&db.name); - - let mut client = conf.connect(NoTls)?; - - // This will reassign all dependent objects to the db owner - let reassign_query = format!( - "REASSIGN OWNED BY {} TO {}", - role_name.pg_quote(), - db.owner.pg_quote() - ); - info!( - "reassigning objects owned by '{}' in db '{}' to '{}'", - role_name, &db.name, &db.owner - ); - client.simple_query(&reassign_query)?; - - // This now will only drop privileges of the role - let drop_query = format!("DROP OWNED BY {}", role_name.pg_quote()); - client.simple_query(&drop_query)?; + reassign_owned_objects_in_one_db(conf, role_name, &db.owner)?; } } + // Also handle case when there are no databases in the spec. + // In this case we need to reassign objects in the default database. 
+ let conf = Config::from_str(connstr)?; + let db_owner = PgIdent::from_str("cloud_admin")?; + reassign_owned_objects_in_one_db(conf, role_name, &db_owner)?; + Ok(()) } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 071f22dc2b..55b66742ca 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -537,6 +537,7 @@ impl Endpoint { safekeeper_connstrings, storage_auth_token: auth_token.clone(), remote_extensions, + pgbouncer_settings: None, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/deny.toml b/deny.toml index 079dcac679..22e39a2ca3 100644 --- a/deny.toml +++ b/deny.toml @@ -35,6 +35,7 @@ allow = [ "Artistic-2.0", "BSD-2-Clause", "BSD-3-Clause", + "CC0-1.0", "ISC", "MIT", "MPL-2.0", diff --git a/docs/rfcs/029-getpage-throttling.md b/docs/rfcs/029-getpage-throttling.md new file mode 100644 index 0000000000..b4f9adefc5 --- /dev/null +++ b/docs/rfcs/029-getpage-throttling.md @@ -0,0 +1,197 @@ +# Per-Tenant GetPage@LSN Throttling + +Author: Christian Schwarz +Date: Oct 24, 2023 + +## Summary + +This RFC proposes per-tenant throttling of GetPage@LSN requests inside Pageserver +and the interactions with its client, i.e., the neon_smgr component in Compute. + +The result of implementing & executing this RFC will be a fleet-wide upper limit for +**"the highest GetPage/second that Pageserver can support for a single tenant/shard"**. + +## Background + +### GetPage@LSN Request Flow + +Pageserver exposes its `page_service.rs` as a libpq listener. +The Computes' `neon_smgr` module connects to that libpq listener. +Once a connection is established, the protocol allows Compute to request page images at a given LSN. +We call these requests GetPage@LSN requests, or GetPage requests for short. +Other request types can be sent, but these are low traffic compared to GetPage requests +and are not the concern of this RFC. + +Pageserver associates one libpq connection with one tokio task. + +Per connection/task, the pq protocol is handled by the common `postgres_backend` crate. +Its `run_message_loop` function invokes the `page_service` specific `impl postgres_backend::Handler for PageServerHandler`. +Requests are processed in the order in which they arrive via the TCP-based pq protocol. +So, there is no concurrent request processing within one connection/task. + +There is a degree of natural pipelining: +Compute can "fill the pipe" by sending more than one GetPage request into the libpq TCP stream. +And Pageserver can fill the pipe with responses in the other direction. +Both directions are subject to the limit of tx/rx buffers, nodelay, TCP flow control, etc. + +### GetPage@LSN Access Pattern + +The Compute has its own hierarchy of caches, specifically `shared_buffers` and the `local file cache` (LFC). +Compute only issues GetPage requests to Pageserver if it encounters a miss in these caches. + +If the working set stops fitting into Compute's caches, requests to Pageserver increase sharply -- the Compute starts *thrashing*. + +## Motivation + +In INC-69, a tenant issued 155k GetPage/second for a period of 10 minutes and 60k GetPage/second for a period of 3h, +then dropping to ca 18k GetPage/second for a period of 9h. + +We noticed this because of an internal GetPage latency SLO burn rate alert, i.e., +the request latency profile during this period significantly exceeded what was acceptable according to the internal SLO. 
+
+Sadly, we do not have the observability data to determine the impact of this tenant on other tenants on the same pageserver.
+
+However, here are some illustrative data points for the 155k period:
+The tenant was responsible for >= 99% of the GetPage traffic and, frankly, the overall activity on this Pageserver instance.
+We were serving pages at 10 Gb/s (`155k x 8 kbyte (PAGE_SZ) per second is ~1.18GiB/s = ~10.2Gb/s.`)
+The CPU utilization of the instance was 75% user+system.
+Pageserver page cache served 1.75M accesses/second at a hit rate of ca. 90%.
+The hit rate for materialized pages was ca. 40%.
+Curiously, IOPS to the Instance Store NVMe were very low, rarely exceeding 100.
+
+The fact that the IOPS were so low / the materialized page cache hit rate was so high suggests that **this tenant's compute's caches were thrashing**.
+The compute was of type `k8s-pod`; hence, auto-scaling could/would not have helped remediate the thrashing by provisioning more RAM.
+The consequence was that the **thrashing translated into excessive GetPage requests against Pageserver**.
+
+My claim is that it was **unhealthy to serve this workload at the pace we did**:
+* it is likely that other tenants were/would have experienced high latencies (again, we sadly don't have per-tenant latency data to confirm this)
+* more importantly, it was **unsustainable** to serve traffic at this pace for multiple reasons:
+  * **predictability of performance**: when the working set grows, the pageserver materialized page cache hit rate drops.
+    At some point, we're bound by the EC2 Instance Store NVMe drive's IOPS limit.
+    The result is an **uneven** performance profile from the Compute perspective.
+
+  * **economics**: Neon currently does not charge for IOPS, only capacity.
+    **We cannot afford to undercut the market in IOPS/$ this drastically; it leads to adverse selection and perverse incentives.**
+    For example, the 155k IOPS, which we served for 10min, would cost ca. 6.5k$/month when provisioned as an io2 EBS volume.
+    Even the 18k IOPS, which we served for 9h, would cost ca. 1.1k$/month when provisioned as an io2 EBS volume.
+    We charge 0$.
+    It could be economically advantageous to keep using a low-DRAM compute because Pageserver IOPS are fast enough and free.
+
+
+Note: It is helpful to think of Pageserver as a disk, because it's precisely where `neon_smgr` sits:
+vanilla Postgres gets its pages from disk, Neon Postgres gets them from Pageserver.
+So, regarding the above performance & economic arguments, it is fair to say that we currently provide an "as-fast-as-possible-IOPS" disk that we charge for only by capacity.
+
+## Solution: Throttling GetPage Requests
+
+**The consequence of the above analysis must be that Pageserver throttles GetPage@LSN requests**.
+That is, unless we want to start charging for provisioned GetPage@LSN/second.
+Throttling sets the correct incentive for a thrashing Compute to scale up its DRAM to the working set size.
+Neon Autoscaling will make this easy, [eventually](https://github.com/neondatabase/neon/pull/3913).
+
+## The Design Space
+
+What remains is the question of *policy* and *mechanism*:
+
+**Policy** concerns itself with the question of what limit applies to a given connection|timeline|tenant.
+Candidates are:
+
+* hard limit, same limit value per connection|timeline|tenant
+  * Per-tenant will provide an upper bound for the impact of a tenant on a given Pageserver instance. This is a major operational pain point / risk right now.
+* hard limit, configurable per connection|timeline|tenant
+  * This outsources policy to console/control plane, with obvious advantages for flexible structuring of what service we offer to customers.
+  * Note that this is not a mechanism to guarantee a minimum provisioned rate, i.e., it is not a mechanism to guarantee a certain QoS for a tenant.
+* fair share among active connections|timelines|tenants per instance
+  * example: each connection|timeline|tenant gets a fair fraction of the machine's GetPage/second capacity
+  * NB: needs a definition of "active", and knowledge of the available GetPage/second capacity in advance
+* ...
+
+Regarding **mechanism**, it's clear that **backpressure** is the way to go.
+However, we must choose between
+* **implicit** backpressure through pq/TCP and
+* **explicit** rejection of requests + retries with exponential backoff
+
+Further, there is the question of how throttling GetPage@LSN will affect the **internal GetPage latency SLO**:
+where do we measure the SLI for Pageserver's internal getpage latency SLO? Before or after the throttling?
+
+And when we eventually move the measurement point into the Computes (to avoid coordinated omission),
+how do we avoid counting throttling-induced latency toward the internal getpage latency SLI/SLO?
+
+## Scope Of This RFC
+
+**This RFC proposes introducing a hard GetPage@LSN/second limit per tenant, with the same value applying to each tenant on a Pageserver**.
+
+This proposal is easy to implement and significantly de-risks operating large Pageservers,
+based on the assumption that extremely-high-GetPage-rate episodes like the one from the "Motivation" section are uncorrelated between tenants.
+
+For example, suppose our Pageserver can serve 100k GetPage/second in total at a 100% page cache miss rate.
+If each tenant gets a hard limit of 10k GetPage/second, we can serve up to 10 tenants at limit speed without latency degradation.
+
+The mechanism for backpressure will be TCP-based implicit backpressure.
+The compute team isn't concerned about prefetch queue depth.
+Pageserver will implement it by delaying the reading of requests from the libpq connection(s).
+
+The rate limit will be implemented using a per-tenant token bucket.
+The bucket will be shared among all connections to the tenant.
+The bucket implementation supports starvation-preventing `await`ing.
+The current candidate for the implementation is [`leaky_bucket`](https://docs.rs/leaky-bucket/).
+The getpage@lsn benchmark that's being added in https://github.com/neondatabase/neon/issues/5771
+can be used to evaluate the overhead of sharing the bucket among connections of a tenant.
+A possible technique to mitigate the impact of sharing the bucket would be to maintain a buffer of a few tokens per connection handler.
+
+Regarding metrics / the internal GetPage latency SLO:
+we will measure the SLI for the GetPage latency SLO _after_ the throttler and introduce new metrics to quantify the amount of throttling:
+- a histogram that records the tenants' observations of queue depth before they start waiting (one such histogram per pageserver)
+- a histogram that records the tenants' observations of time spent waiting (one such histogram per pageserver)
+
+Further observability measures:
+- an INFO log message at frequency 1/min if the tenant/timeline/connection was throttled in the last minute.
+ The message will identify the tenant/timeline/connection to allow correlation with compute logs/stats. + +Rollout will happen as follows: +- deploy 1: implementation + config: disabled by default, ability to enable it per tenant through tenant_conf +- experimentation in staging and later production to study impact & interaction with auto-scaling +- determination of a sensible global default value + - the value will be chosen as high as possible ... + - ... but low enough to work towards this RFC's goal that one tenant should not be able to dominate a pageserver instance. +- deploy 2: implementation fixes if any + config: enabled by default with the aforementioned global default +- reset of the experimental per-tenant overrides +- gain experience & lower the limit over time + - we stop lowering the limit as soon as this RFC's goal is achieved, i.e., + once we decide that in practice the chosen value sufficiently de-risks operating large pageservers + +The per-tenant override will remain for emergencies and testing. +But since Console doesn't preserve it during tenant migrations, it isn't durably configurable for the tenant. + +Toward the upper layers of the Neon stack, the resulting limit will be +**"the highest GetPage/second that Pageserver can support for a single tenant"**. + +### Rationale + +We decided against error + retry because of worries about starvation. + +## Future Work + +Enable per-tenant emergency override of the limit via Console. +Should be part of a more general framework to specify tenant config overrides. +**NB:** this is **not** the right mechanism to _sell_ different max GetPage/second levels to users, +or _auto-scale_ the GetPage/second levels. Such functionality will require a separate RFC that +concerns itself with GetPage/second capacity planning. + +Compute-side metrics for GetPage latency. + +Back-channel to inform Compute/Autoscaling/ControlPlane that the project is being throttled. + +Compute-side neon_smgr improvements to avoid sending the same GetPage request multiple times if multiple backends experience a cache miss. + +Dealing with read-only endpoints: users use read-only endpoints to scale reads for a single tenant. +Possibly there are also assumptions around read-only endpoints not affecting the primary read-write endpoint's performance. +With per-tenant rate limiting, we will not meet that expectation. +However, we can currently only scale per tenant. +Soon, we will have sharding (#5505), which will apply the throttling on a per-shard basis. +But, that's orthogonal to scaling reads: if many endpoints hit one shard, they share the same throttling limit. +To solve this properly, I think we'll need replicas for tenants / shard. +To performance-isolate a tenant's endpoints from each other, we'd then route them to different replicas. diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 2a483188e4..4ff6831272 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -73,6 +73,8 @@ pub struct ComputeSpec { // information about available remote extensions pub remote_extensions: Option, + + pub pgbouncer_settings: Option>, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. 
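For illustration, a minimal sketch of the per-tenant token bucket described in the RFC above, assuming the `leaky-bucket` 1.x builder API; the `TenantThrottle` type, its method names, and its wiring into the page_service connection handlers are hypothetical and not part of this patch.

```rust
use std::sync::Arc;
use std::time::Duration;

use leaky_bucket::RateLimiter;

/// One throttle per tenant, shared by all page_service connections of that tenant.
/// (Illustrative type; the actual integration point is left open by the RFC.)
pub struct TenantThrottle {
    limiter: Arc<RateLimiter>,
}

impl TenantThrottle {
    /// `rate` is the per-tenant GetPage/second limit, e.g. the global default from tenant_conf.
    pub fn new(rate: usize) -> Self {
        let limiter = RateLimiter::builder()
            .max(rate) // bucket capacity: at most ~1 second worth of burst
            .initial(rate) // start full so a freshly attached tenant is not throttled immediately
            .refill(rate) // refill `rate` tokens ...
            .interval(Duration::from_secs(1)) // ... every second
            .build();
        Self {
            limiter: Arc::new(limiter),
        }
    }

    /// Called by a connection handler before it reads the next GetPage request
    /// off the libpq connection; waiters are queued by the limiter, so no single
    /// connection of the tenant is starved.
    pub async fn acquire_one_getpage(&self) {
        self.limiter.acquire_one().await;
    }
}
```

Because the handler `await`s `acquire_one_getpage()` before reading the next request, a tenant that exhausts its budget simply stops being read from; toward `neon_smgr` this surfaces as ordinary TCP backpressure, matching the implicit-backpressure mechanism the RFC chooses.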
diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index e2afa17ef0..ccd015ad19 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -243,5 +243,9 @@ "public_extensions": [ "postgis" ] + }, + "pgbouncer_settings": { + "default_pool_size": "42", + "pool_mode": "session" } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index a78ba8ad94..be41b610b8 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -2,6 +2,7 @@ pub mod partitioning; use std::{ collections::HashMap, + io::Read, num::{NonZeroU64, NonZeroUsize}, time::SystemTime, }; @@ -19,7 +20,7 @@ use utils::{ use crate::{reltag::RelTag, shard::TenantShardId}; use anyhow::bail; -use bytes::{BufMut, Bytes, BytesMut}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; /// The state of a tenant in this pageserver. /// @@ -369,6 +370,14 @@ pub struct TenantInfo { pub attachment_status: TenantAttachmentStatus, } +#[derive(Serialize, Deserialize, Clone)] +pub struct TenantDetails { + #[serde(flatten)] + pub tenant_info: TenantInfo, + + pub timelines: Vec, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { @@ -576,6 +585,7 @@ pub enum PagestreamFeMessage { } // Wrapped in libpq CopyData +#[derive(strum_macros::EnumProperty)] pub enum PagestreamBeMessage { Exists(PagestreamExistsResponse), Nblocks(PagestreamNblocksResponse), @@ -584,6 +594,29 @@ pub enum PagestreamBeMessage { DbSize(PagestreamDbSizeResponse), } +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamBeMessageTag { + Exists = 100, + Nblocks = 101, + GetPage = 102, + Error = 103, + DbSize = 104, +} +impl TryFrom for PagestreamBeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 100 => Ok(PagestreamBeMessageTag::Exists), + 101 => Ok(PagestreamBeMessageTag::Nblocks), + 102 => Ok(PagestreamBeMessageTag::GetPage), + 103 => Ok(PagestreamBeMessageTag::Error), + 104 => Ok(PagestreamBeMessageTag::DbSize), + _ => Err(value), + } + } +} + #[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { pub latest: bool, @@ -739,35 +772,91 @@ impl PagestreamBeMessage { pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); + use PagestreamBeMessageTag as Tag; match self { Self::Exists(resp) => { - bytes.put_u8(100); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Exists as u8); bytes.put_u8(resp.exists as u8); } Self::Nblocks(resp) => { - bytes.put_u8(101); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Nblocks as u8); bytes.put_u32(resp.n_blocks); } Self::GetPage(resp) => { - bytes.put_u8(102); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::GetPage as u8); bytes.put(&resp.page[..]); } Self::Error(resp) => { - bytes.put_u8(103); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::Error as u8); bytes.put(resp.message.as_bytes()); bytes.put_u8(0); // null terminator } Self::DbSize(resp) => { - bytes.put_u8(104); /* tag from pagestore_client.h */ + bytes.put_u8(Tag::DbSize as u8); bytes.put_i64(resp.db_size); } } bytes.into() } + + pub fn deserialize(buf: Bytes) -> anyhow::Result { + let mut buf = buf.reader(); + let msg_tag = buf.read_u8()?; + + use PagestreamBeMessageTag as Tag; + let ok = + match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? 
{ + Tag::Exists => { + let exists = buf.read_u8()?; + Self::Exists(PagestreamExistsResponse { + exists: exists != 0, + }) + } + Tag::Nblocks => { + let n_blocks = buf.read_u32::()?; + Self::Nblocks(PagestreamNblocksResponse { n_blocks }) + } + Tag::GetPage => { + let mut page = vec![0; 8192]; // TODO: use MaybeUninit + buf.read_exact(&mut page)?; + PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() }) + } + Tag::Error => { + let buf = buf.get_ref(); + let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?; + let rust_str = cstr.to_str()?; + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: rust_str.to_owned(), + }) + } + Tag::DbSize => { + let db_size = buf.read_i64::()?; + Self::DbSize(PagestreamDbSizeResponse { db_size }) + } + }; + let remaining = buf.into_inner(); + if !remaining.is_empty() { + anyhow::bail!( + "remaining bytes in msg with tag={msg_tag}: {}", + remaining.len() + ); + } + Ok(ok) + } + + pub fn kind(&self) -> &'static str { + match self { + Self::Exists(_) => "Exists", + Self::Nblocks(_) => "Nblocks", + Self::GetPage(_) => "GetPage", + Self::Error(_) => "Error", + Self::DbSize(_) => "DbSize", + } + } } #[cfg(test)] diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 052fbd1402..3e4936eec4 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -81,6 +81,10 @@ impl TenantShardId { pub fn is_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } } /// Formatting helper @@ -159,7 +163,7 @@ impl From<[u8; 18]> for TenantShardId { /// shard we're dealing with, but do not need to know the full ShardIdentity (because /// we won't be doing any page->shard mapping), and do not need to know the fully qualified /// TenantShardId. 
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)] +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct ShardIndex { pub shard_number: ShardNumber, pub shard_count: ShardCount, diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 548bde02f6..7ea1103eb2 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -117,6 +117,8 @@ impl AzureBlobStorage { ) -> Result { let mut response = builder.into_stream(); + let mut etag = None; + let mut last_modified = None; let mut metadata = HashMap::new(); // TODO give proper streaming response instead of buffering into RAM // https://github.com/neondatabase/neon/issues/5563 @@ -124,6 +126,13 @@ impl AzureBlobStorage { let mut bufs = Vec::new(); while let Some(part) = response.next().await { let part = part.map_err(to_download_error)?; + let etag_str: &str = part.blob.properties.etag.as_ref(); + if etag.is_none() { + etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + } + if last_modified.is_none() { + last_modified = Some(part.blob.properties.last_modified.into()); + } if let Some(blob_meta) = part.blob.metadata { metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); } @@ -136,6 +145,8 @@ impl AzureBlobStorage { } Ok(Download { download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), + etag, + last_modified, metadata: Some(StorageMetadata(metadata)), }) } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e77c54e1e7..3e408e3119 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -14,7 +14,9 @@ mod local_fs; mod s3_bucket; mod simulate_failures; -use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc}; +use std::{ + collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime, +}; use anyhow::{bail, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -207,8 +209,13 @@ pub trait RemoteStorage: Send + Sync + 'static { async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; } +pub type DownloadStream = Pin> + Unpin + Send + Sync>>; pub struct Download { - pub download_stream: Pin> + Unpin + Send + Sync>>, + pub download_stream: DownloadStream, + /// The last time the file was modified (`last-modified` HTTP header) + pub last_modified: Option, + /// A way to identify this specific version of the resource (`etag` HTTP header) + pub etag: Option, /// Extra key-value data, associated with the current remote file. 
pub metadata: Option, } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 03b98e5ea2..d1e7d325b9 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -18,7 +18,7 @@ use tokio_util::io::ReaderStream; use tracing::*; use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; -use crate::{Download, DownloadError, Listing, ListingMode, RemotePath}; +use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath}; use super::{RemoteStorage, StorageMetadata}; @@ -331,6 +331,8 @@ impl RemoteStorage for LocalFs { .map_err(DownloadError::Other)?; Ok(Download { metadata, + last_modified: None, + etag: None, download_stream: Box::pin(source), }) } else { @@ -372,17 +374,17 @@ impl RemoteStorage for LocalFs { .await .map_err(DownloadError::Other)?; - Ok(match end_exclusive { - Some(end_exclusive) => Download { - metadata, - download_stream: Box::pin(ReaderStream::new( - source.take(end_exclusive - start_inclusive), - )), - }, - None => Download { - metadata, - download_stream: Box::pin(ReaderStream::new(source)), - }, + let download_stream: DownloadStream = match end_exclusive { + Some(end_exclusive) => Box::pin(ReaderStream::new( + source.take(end_exclusive - start_inclusive), + )), + None => Box::pin(ReaderStream::new(source)), + }; + Ok(Download { + metadata, + last_modified: None, + etag: None, + download_stream, }) } else { Err(DownloadError::NotFound) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 97fa1bbf5b..0f95458ad1 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -16,6 +16,7 @@ use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain, + profile::ProfileFileCredentialsProvider, provider_config::ProviderConfig, retry::{RetryConfigBuilder, RetryMode}, web_identity_token::WebIdentityTokenCredentialsProvider, @@ -74,20 +75,29 @@ impl S3Bucket { let region = Some(Region::new(aws_config.bucket_region.clone())); + let provider_conf = ProviderConfig::without_region().with_region(region.clone()); + let credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" CredentialsProviderChain::first_try( "env", EnvironmentVariableCredentialsProvider::new(), ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" // needed to access remote extensions bucket - .or_else("token", { - let provider_conf = ProviderConfig::without_region().with_region(region.clone()); + .or_else( + "token", WebIdentityTokenCredentialsProvider::builder() .configure(&provider_conf) - .build() - }) + .build(), + ) // uses imds v2 .or_else("imds", ImdsCredentialsProvider::builder().build()) }; @@ -218,17 +228,11 @@ impl S3Bucket { let started_at = ScopeGuard::into_inner(started_at); - if get_object.is_err() { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( - kind, - AttemptOutcome::Err, - started_at, - ); - } - match get_object { Ok(object_output) => { let metadata = object_output.metadata().cloned().map(StorageMetadata); + let etag = object_output.e_tag.clone(); + let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); let body = object_output.body; let body = 
ByteStreamAsStream::from(body); @@ -237,15 +241,33 @@ impl S3Bucket { Ok(Download { metadata, + etag, + last_modified, download_stream: Box::pin(body), }) } Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { + // Count this in the AttemptOutcome::Ok bucket, because 404 is not + // an error: we expect to sometimes fetch an object and find it missing, + // e.g. when probing for timeline indices. + metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Ok, + started_at, + ); Err(DownloadError::NotFound) } - Err(e) => Err(DownloadError::Other( - anyhow::Error::new(e).context("download s3 object"), - )), + Err(e) => { + metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Err, + started_at, + ); + + Err(DownloadError::Other( + anyhow::Error::new(e).context("download s3 object"), + )) + } } } } diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs new file mode 100644 index 0000000000..bca117ed1a --- /dev/null +++ b/libs/remote_storage/tests/common/mod.rs @@ -0,0 +1,200 @@ +use std::collections::HashSet; +use std::ops::ControlFlow; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::Context; +use bytes::Bytes; +use camino::Utf8Path; +use futures::stream::Stream; +use once_cell::sync::OnceCell; +use remote_storage::{Download, GenericRemoteStorage, RemotePath}; +use tokio::task::JoinSet; +use tracing::{debug, error, info}; + +static LOGGING_DONE: OnceCell<()> = OnceCell::new(); + +pub(crate) fn upload_stream( + content: std::borrow::Cow<'static, [u8]>, +) -> ( + impl Stream> + Send + Sync + 'static, + usize, +) { + use std::borrow::Cow; + + let content = match content { + Cow::Borrowed(x) => Bytes::from_static(x), + Cow::Owned(vec) => Bytes::from(vec), + }; + wrap_stream(content) +} + +pub(crate) fn wrap_stream( + content: bytes::Bytes, +) -> ( + impl Stream> + Send + Sync + 'static, + usize, +) { + let len = content.len(); + let content = futures::future::ready(Ok(content)); + + (futures::stream::once(content), len) +} + +pub(crate) async fn download_to_vec(dl: Download) -> anyhow::Result> { + let mut buf = Vec::new(); + tokio::io::copy_buf( + &mut tokio_util::io::StreamReader::new(dl.download_stream), + &mut buf, + ) + .await?; + Ok(buf) +} + +// Uploads files `folder{j}/blob{i}.txt`. See test description for more details. 
+pub(crate) async fn upload_simple_remote_data( + client: &Arc, + upload_tasks_count: usize, +) -> ControlFlow, HashSet> { + info!("Creating {upload_tasks_count} remote files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); + let blob_path = RemotePath::new( + Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), + ) + .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; + debug!("Creating remote item {i} at path {blob_path:?}"); + + let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); + task_client.upload(data, len, &blob_path, None).await?; + + Ok::<_, anyhow::Error>(blob_path) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok(upload_path) => { + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + if upload_tasks_failed { + ControlFlow::Break(uploaded_blobs) + } else { + ControlFlow::Continue(uploaded_blobs) + } +} + +pub(crate) async fn cleanup( + client: &Arc, + objects_to_delete: HashSet, +) { + info!( + "Removing {} objects from the remote storage during cleanup", + objects_to_delete.len() + ); + let mut delete_tasks = JoinSet::new(); + for object_to_delete in objects_to_delete { + let task_client = Arc::clone(client); + delete_tasks.spawn(async move { + debug!("Deleting remote item at path {object_to_delete:?}"); + task_client + .delete(&object_to_delete) + .await + .with_context(|| format!("{object_to_delete:?} removal")) + }); + } + + while let Some(task_run_result) = delete_tasks.join_next().await { + match task_run_result { + Ok(task_result) => match task_result { + Ok(()) => {} + Err(e) => error!("Delete task failed: {e:?}"), + }, + Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), + } + } +} +pub(crate) struct Uploads { + pub(crate) prefixes: HashSet, + pub(crate) blobs: HashSet, +} + +pub(crate) async fn upload_remote_data( + client: &Arc, + base_prefix_str: &'static str, + upload_tasks_count: usize, +) -> ControlFlow { + info!("Creating {upload_tasks_count} remote files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); + let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) + .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; + let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); + debug!("Creating remote item {i} at path {blob_path:?}"); + + let (data, data_len) = + upload_stream(format!("remote blob data {i}").into_bytes().into()); + task_client.upload(data, data_len, &blob_path, None).await?; + + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + 
.context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok((upload_prefix, upload_path)) => { + uploaded_prefixes.insert(upload_prefix); + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + let uploads = Uploads { + prefixes: uploaded_prefixes, + blobs: uploaded_blobs, + }; + if upload_tasks_failed { + ControlFlow::Break(uploads) + } else { + ControlFlow::Continue(uploads) + } +} + +pub(crate) fn ensure_logging_ready() { + LOGGING_DONE.get_or_init(|| { + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + utils::logging::Output::Stdout, + ) + .expect("logging init failed"); + }); +} diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 7327803198..0387dc30e7 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -2,23 +2,23 @@ use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; -use std::path::PathBuf; use std::sync::Arc; use std::time::UNIX_EPOCH; use anyhow::Context; -use bytes::Bytes; use camino::Utf8Path; -use futures::stream::Stream; -use once_cell::sync::OnceCell; use remote_storage::{ - AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, + AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, }; use test_context::{test_context, AsyncTestContext}; -use tokio::task::JoinSet; -use tracing::{debug, error, info}; +use tracing::{debug, info}; -static LOGGING_DONE: OnceCell<()> = OnceCell::new(); +mod common; + +use common::{ + cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data, + upload_stream, wrap_stream, +}; const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE"; @@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test"; /// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. /// -/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`] +/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`] /// where /// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket @@ -97,7 +97,7 @@ async fn azure_pagination_should_work( /// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set. /// See `Azure_pagination_should_work` for more information. /// -/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`] +/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: /// 1. 
`list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` /// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` @@ -218,18 +218,9 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res ctx.client.upload(data, len, &path, None).await?; - async fn download_and_compare(dl: Download) -> anyhow::Result> { - let mut buf = Vec::new(); - tokio::io::copy_buf( - &mut tokio_util::io::StreamReader::new(dl.download_stream), - &mut buf, - ) - .await?; - Ok(buf) - } // Normal download request let dl = ctx.client.download(&path).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // Full range (end specified) @@ -237,12 +228,12 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res .client .download_byte_range(&path, 0, Some(len as u64)) .await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); // partial range (end specified) let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..10]); // partial range (end beyond real end) @@ -250,17 +241,17 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res .client .download_byte_range(&path, 8, Some(len as u64 * 100)) .await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[8..]); // Partial range (end unspecified) let dl = ctx.client.download_byte_range(&path, 4, None).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..]); // Full range (end unspecified) let dl = ctx.client.download_byte_range(&path, 0, None).await?; - let buf = download_and_compare(dl).await?; + let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); debug!("Cleanup: deleting file at path {path:?}"); @@ -272,17 +263,6 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res Ok(()) } -fn ensure_logging_ready() { - LOGGING_DONE.get_or_init(|| { - utils::logging::init( - utils::logging::LogFormat::Test, - utils::logging::TracingErrorLayerEnablement::Disabled, - utils::logging::Output::Stdout, - ) - .expect("logging init failed"); - }); -} - struct EnabledAzure { client: Arc, base_prefix: &'static str, @@ -352,7 +332,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs { let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; - match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { + match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -414,7 +394,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs { let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; - match upload_simple_azure_data(&enabled.client, upload_tasks_count).await { + match upload_simple_remote_data(&enabled.client, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -478,166 +458,3 @@ fn create_azure_client( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, )) } - -struct Uploads { - prefixes: HashSet, - blobs: HashSet, -} - -async fn upload_azure_data( - 
client: &Arc, - base_prefix_str: &'static str, - upload_tasks_count: usize, -) -> ControlFlow { - info!("Creating {upload_tasks_count} Azure files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); - let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) - .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; - let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>((blob_prefix, blob_path)) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok((upload_prefix, upload_path)) => { - uploaded_prefixes.insert(upload_prefix); - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - let uploads = Uploads { - prefixes: uploaded_prefixes, - blobs: uploaded_blobs, - }; - if upload_tasks_failed { - ControlFlow::Break(uploads) - } else { - ControlFlow::Continue(uploads) - } -} - -async fn cleanup(client: &Arc, objects_to_delete: HashSet) { - info!( - "Removing {} objects from the remote storage during cleanup", - objects_to_delete.len() - ); - let mut delete_tasks = JoinSet::new(); - for object_to_delete in objects_to_delete { - let task_client = Arc::clone(client); - delete_tasks.spawn(async move { - debug!("Deleting remote item at path {object_to_delete:?}"); - task_client - .delete(&object_to_delete) - .await - .with_context(|| format!("{object_to_delete:?} removal")) - }); - } - - while let Some(task_run_result) = delete_tasks.join_next().await { - match task_run_result { - Ok(task_result) => match task_result { - Ok(()) => {} - Err(e) => error!("Delete task failed: {e:?}"), - }, - Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), - } - } -} - -// Uploads files `folder{j}/blob{i}.txt`. See test description for more details. 
-async fn upload_simple_azure_data( - client: &Arc, - upload_tasks_count: usize, -) -> ControlFlow, HashSet> { - info!("Creating {upload_tasks_count} Azure files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); - let blob_path = RemotePath::new( - Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), - ) - .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>(blob_path) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok(upload_path) => { - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - if upload_tasks_failed { - ControlFlow::Break(uploaded_blobs) - } else { - ControlFlow::Continue(uploaded_blobs) - } -} - -// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled -// to binary -fn upload_stream( - content: std::borrow::Cow<'static, [u8]>, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - use std::borrow::Cow; - - let content = match content { - Cow::Borrowed(x) => Bytes::from_static(x), - Cow::Owned(vec) => Bytes::from(vec), - }; - wrap_stream(content) -} - -fn wrap_stream( - content: bytes::Bytes, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - let len = content.len(); - let content = futures::future::ready(Ok(content)); - - (futures::stream::once(content), len) -} diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index ecd834e61c..8f46b2abd6 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -2,23 +2,23 @@ use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; -use std::path::PathBuf; use std::sync::Arc; use std::time::UNIX_EPOCH; use anyhow::Context; -use bytes::Bytes; use camino::Utf8Path; -use futures::stream::Stream; -use once_cell::sync::OnceCell; use remote_storage::{ GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, }; use test_context::{test_context, AsyncTestContext}; -use tokio::task::JoinSet; -use tracing::{debug, error, info}; +use tracing::{debug, info}; -static LOGGING_DONE: OnceCell<()> = OnceCell::new(); +mod common; + +use common::{ + cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data, + upload_stream, wrap_stream, +}; const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; @@ -30,7 +30,7 @@ const BASE_PREFIX: &str = "test"; /// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the /// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. 
/// -/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`] +/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`] /// where /// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference /// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket @@ -95,7 +95,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set. /// See `s3_pagination_should_work` for more information. /// -/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`] +/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] /// Then performs the following queries: /// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` /// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` @@ -198,15 +198,65 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> Ok(()) } -fn ensure_logging_ready() { - LOGGING_DONE.get_or_init(|| { - utils::logging::init( - utils::logging::LogFormat::Test, - utils::logging::TracingErrorLayerEnablement::Disabled, - utils::logging::Output::Stdout, - ) - .expect("logging init failed"); - }); +#[test_context(MaybeEnabledS3)] +#[tokio::test] +async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { + let MaybeEnabledS3::Enabled(ctx) = ctx else { + return Ok(()); + }; + + let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let orig = bytes::Bytes::from_static("remote blob data here".as_bytes()); + + let (data, len) = wrap_stream(orig.clone()); + + ctx.client.upload(data, len, &path, None).await?; + + // Normal download request + let dl = ctx.client.download(&path).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + // Full range (end specified) + let dl = ctx + .client + .download_byte_range(&path, 0, Some(len as u64)) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + // partial range (end specified) + let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..10]); + + // partial range (end beyond real end) + let dl = ctx + .client + .download_byte_range(&path, 8, Some(len as u64 * 100)) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[8..]); + + // Partial range (end unspecified) + let dl = ctx.client.download_byte_range(&path, 4, None).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..]); + + // Full range (end unspecified) + let dl = ctx.client.download_byte_range(&path, 0, None).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + debug!("Cleanup: deleting file at path {path:?}"); + ctx.client + .delete(&path) + .await + .with_context(|| format!("{path:?} removal"))?; + + Ok(()) } struct 
EnabledS3 { @@ -278,7 +328,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs { let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; - match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { + match upload_remote_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -340,7 +390,7 @@ impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs { let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await; - match upload_simple_s3_data(&enabled.client, upload_tasks_count).await { + match upload_simple_remote_data(&enabled.client, upload_tasks_count).await { ControlFlow::Continue(uploads) => { info!("Remote objects created successfully"); @@ -403,166 +453,3 @@ fn create_s3_client( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, )) } - -struct Uploads { - prefixes: HashSet, - blobs: HashSet, -} - -async fn upload_s3_data( - client: &Arc, - base_prefix_str: &'static str, - upload_tasks_count: usize, -) -> ControlFlow { - info!("Creating {upload_tasks_count} S3 files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); - let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) - .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; - let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, data_len) = - upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, data_len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>((blob_prefix, blob_path)) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok((upload_prefix, upload_path)) => { - uploaded_prefixes.insert(upload_prefix); - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - let uploads = Uploads { - prefixes: uploaded_prefixes, - blobs: uploaded_blobs, - }; - if upload_tasks_failed { - ControlFlow::Break(uploads) - } else { - ControlFlow::Continue(uploads) - } -} - -async fn cleanup(client: &Arc, objects_to_delete: HashSet) { - info!( - "Removing {} objects from the remote storage during cleanup", - objects_to_delete.len() - ); - let mut delete_tasks = JoinSet::new(); - for object_to_delete in objects_to_delete { - let task_client = Arc::clone(client); - delete_tasks.spawn(async move { - debug!("Deleting remote item at path {object_to_delete:?}"); - task_client - .delete(&object_to_delete) - .await - .with_context(|| format!("{object_to_delete:?} removal")) - }); - } - - while let Some(task_run_result) = delete_tasks.join_next().await { - match task_run_result { - Ok(task_result) => match task_result { - Ok(()) => {} - Err(e) => error!("Delete task failed: {e:?}"), - }, - Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), - } - } -} - -// Uploads files `folder{j}/blob{i}.txt`. 
See test description for more details. -async fn upload_simple_s3_data( - client: &Arc, - upload_tasks_count: usize, -) -> ControlFlow, HashSet> { - info!("Creating {upload_tasks_count} S3 files"); - let mut upload_tasks = JoinSet::new(); - for i in 1..upload_tasks_count + 1 { - let task_client = Arc::clone(client); - upload_tasks.spawn(async move { - let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); - let blob_path = RemotePath::new( - Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), - ) - .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; - debug!("Creating remote item {i} at path {blob_path:?}"); - - let (data, data_len) = - upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, data_len, &blob_path, None).await?; - - Ok::<_, anyhow::Error>(blob_path) - }); - } - - let mut upload_tasks_failed = false; - let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); - while let Some(task_run_result) = upload_tasks.join_next().await { - match task_run_result - .context("task join failed") - .and_then(|task_result| task_result.context("upload task failed")) - { - Ok(upload_path) => { - uploaded_blobs.insert(upload_path); - } - Err(e) => { - error!("Upload task failed: {e:?}"); - upload_tasks_failed = true; - } - } - } - - if upload_tasks_failed { - ControlFlow::Break(uploaded_blobs) - } else { - ControlFlow::Continue(uploaded_blobs) - } -} - -fn upload_stream( - content: std::borrow::Cow<'static, [u8]>, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - use std::borrow::Cow; - - let content = match content { - Cow::Borrowed(x) => Bytes::from_static(x), - Cow::Owned(vec) => Bytes::from(vec), - }; - wrap_stream(content) -} - -fn wrap_stream( - content: bytes::Bytes, -) -> ( - impl Stream> + Send + Sync + 'static, - usize, -) { - let len = content.len(); - let content = futures::future::ready(Ok(content)); - - (futures::stream::once(content), len) -} diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 262dcb8a8a..b3269ae049 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -366,6 +366,49 @@ impl MonotonicCounter for RecordLsn { } } +/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s. +/// +/// This is used by the `pagebench` pageserver benchmarking tool. 
+pub struct LsnSampler(::Sampler); + +impl rand::distributions::uniform::SampleUniform for Lsn { + type Sampler = LsnSampler; +} + +impl rand::distributions::uniform::UniformSampler for LsnSampler { + type X = Lsn; + + fn new(low: B1, high: B2) -> Self + where + B1: rand::distributions::uniform::SampleBorrow + Sized, + B2: rand::distributions::uniform::SampleBorrow + Sized, + { + Self( + ::Sampler::new( + low.borrow().0, + high.borrow().0, + ), + ) + } + + fn new_inclusive(low: B1, high: B2) -> Self + where + B1: rand::distributions::uniform::SampleBorrow + Sized, + B2: rand::distributions::uniform::SampleBorrow + Sized, + { + Self( + ::Sampler::new_inclusive( + low.borrow().0, + high.borrow().0, + ), + ) + } + + fn sample(&self, rng: &mut R) -> Self::X { + Lsn(self.0.sample(rng)) + } +} + #[cfg(test)] mod tests { use crate::bin_ser::BeSer; diff --git a/libs/walproposer/bindgen_deps.h b/libs/walproposer/bindgen_deps.h index b95788347c..41ee1cd4a3 100644 --- a/libs/walproposer/bindgen_deps.h +++ b/libs/walproposer/bindgen_deps.h @@ -1 +1,2 @@ +#include "postgres.h" #include "walproposer.h" diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 77afe1e686..1f7bf952dc 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -8,12 +8,12 @@ use std::ffi::CString; use crate::bindings::uint32; use crate::bindings::walproposer_api; +use crate::bindings::NeonWALReadResult; use crate::bindings::PGAsyncReadResult; use crate::bindings::PGAsyncWriteResult; use crate::bindings::Safekeeper; use crate::bindings::Size; use crate::bindings::StringInfoData; -use crate::bindings::TimeLineID; use crate::bindings::TimestampTz; use crate::bindings::WalProposer; use crate::bindings::WalProposerConnStatusType; @@ -178,31 +178,11 @@ extern "C" fn conn_blocking_write( } } -extern "C" fn recovery_download( - sk: *mut Safekeeper, - _timeline: TimeLineID, - startpos: XLogRecPtr, - endpos: XLogRecPtr, -) -> bool { +extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bool { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - (*api).recovery_download(&mut (*sk), startpos, endpos) - } -} - -#[allow(clippy::unnecessary_cast)] -extern "C" fn wal_read( - sk: *mut Safekeeper, - buf: *mut ::std::os::raw::c_char, - startptr: XLogRecPtr, - count: Size, -) { - unsafe { - let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); - let callback_data = (*(*(*sk).wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).wal_read(&mut (*sk), buf, startptr) + (*api).recovery_download(&mut (*wp), &mut (*sk)) } } @@ -214,11 +194,28 @@ extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) { } } -extern "C" fn free_event_set(wp: *mut WalProposer) { +#[allow(clippy::unnecessary_cast)] +extern "C" fn wal_read( + sk: *mut Safekeeper, + buf: *mut ::std::os::raw::c_char, + startptr: XLogRecPtr, + count: Size, + _errmsg: *mut *mut ::std::os::raw::c_char, +) -> NeonWALReadResult { unsafe { - let callback_data = (*(*wp).config).callback_data; + let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count); + let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - (*api).free_event_set(&mut (*wp)); + // TODO: errmsg is not forwarded + (*api).wal_read(&mut (*sk), buf, startptr) + } +} + +extern "C" fn wal_reader_events(sk: *mut Safekeeper) -> uint32 { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api 
= callback_data as *mut Box; + (*api).wal_reader_events(&mut (*sk)) } } @@ -238,6 +235,14 @@ extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) { } } +extern "C" fn active_state_update_event_set(sk: *mut Safekeeper) { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).active_state_update_event_set(&mut (*sk)); + } +} + extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) { unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; @@ -246,6 +251,14 @@ extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) { } } +extern "C" fn rm_safekeeper_event_set(sk: *mut Safekeeper) { + unsafe { + let callback_data = (*(*(*sk).wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).rm_safekeeper_event_set(&mut (*sk)); + } +} + extern "C" fn wait_event_set( wp: *mut WalProposer, timeout: ::std::os::raw::c_long, @@ -313,14 +326,6 @@ extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLog } } -extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) { - unsafe { - let callback_data = (*(*wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).confirm_wal_streamed(&mut (*wp), lsn) - } -} - extern "C" fn log_internal( wp: *mut WalProposer, level: ::std::os::raw::c_int, @@ -335,14 +340,6 @@ extern "C" fn log_internal( } } -extern "C" fn after_election(wp: *mut WalProposer) { - unsafe { - let callback_data = (*(*wp).config).callback_data; - let api = callback_data as *mut Box; - (*api).after_election(&mut (*wp)) - } -} - #[derive(Debug)] pub enum Level { Debug5, @@ -401,20 +398,20 @@ pub(crate) fn create_api() -> walproposer_api { conn_async_write: Some(conn_async_write), conn_blocking_write: Some(conn_blocking_write), recovery_download: Some(recovery_download), - wal_read: Some(wal_read), wal_reader_allocate: Some(wal_reader_allocate), - free_event_set: Some(free_event_set), + wal_read: Some(wal_read), + wal_reader_events: Some(wal_reader_events), init_event_set: Some(init_event_set), update_event_set: Some(update_event_set), + active_state_update_event_set: Some(active_state_update_event_set), add_safekeeper_event_set: Some(add_safekeeper_event_set), + rm_safekeeper_event_set: Some(rm_safekeeper_event_set), wait_event_set: Some(wait_event_set), strong_random: Some(strong_random), get_redo_start_lsn: Some(get_redo_start_lsn), finish_sync_safekeepers: Some(finish_sync_safekeepers), process_safekeeper_feedback: Some(process_safekeeper_feedback), - confirm_wal_streamed: Some(confirm_wal_streamed), log_internal: Some(log_internal), - after_election: Some(after_election), } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index f5723018d7..35c8f6904d 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -6,8 +6,8 @@ use utils::id::TenantTimelineId; use crate::{ api_bindings::{create_api, take_vec_u8, Level}, bindings::{ - Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree, - WalProposerStart, + NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, + WalProposerFree, WalProposerStart, }, }; @@ -86,19 +86,19 @@ pub trait ApiImpl { todo!() } - fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool { + fn recovery_download(&self, _wp: &mut WalProposer, _sk: &mut Safekeeper) -> bool { todo!() } - fn wal_read(&self, _sk: &mut Safekeeper, 
_buf: &mut [u8], _startpos: u64) { + fn wal_reader_allocate(&self, _sk: &mut Safekeeper) -> NeonWALReadResult { todo!() } - fn wal_reader_allocate(&self, _sk: &mut Safekeeper) { + fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) -> NeonWALReadResult { todo!() } - fn free_event_set(&self, _wp: &mut WalProposer) { + fn wal_reader_events(&self, _sk: &mut Safekeeper) -> u32 { todo!() } @@ -110,10 +110,18 @@ pub trait ApiImpl { todo!() } + fn active_state_update_event_set(&self, _sk: &mut Safekeeper) { + todo!() + } + fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) { todo!() } + fn rm_safekeeper_event_set(&self, _sk: &mut Safekeeper) { + todo!() + } + fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult { todo!() } @@ -134,10 +142,6 @@ pub trait ApiImpl { todo!() } - fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) { - todo!() - } - fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) { todo!() } @@ -240,6 +244,7 @@ impl Drop for Wrapper { #[cfg(test)] mod tests { + use core::panic; use std::{ cell::Cell, sync::{atomic::AtomicUsize, mpsc::sync_channel}, @@ -247,7 +252,7 @@ mod tests { use utils::id::TenantTimelineId; - use crate::{api_bindings::Level, walproposer::Wrapper}; + use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; use super::ApiImpl; @@ -355,12 +360,17 @@ mod tests { true } - fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) { - println!("wal_reader_allocate") + fn recovery_download( + &self, + _wp: &mut crate::bindings::WalProposer, + _sk: &mut crate::bindings::Safekeeper, + ) -> bool { + true } - fn free_event_set(&self, _: &mut crate::bindings::WalProposer) { - println!("free_event_set") + fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) -> NeonWALReadResult { + println!("wal_reader_allocate"); + crate::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS } fn init_event_set(&self, _: &mut crate::bindings::WalProposer) { @@ -383,6 +393,13 @@ mod tests { self.wait_events.set(WaitEventsData { sk, event_mask }); } + fn rm_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper) { + println!( + "rm_safekeeper_event_set, sk={:?}", + sk as *mut crate::bindings::Safekeeper + ); + } + fn wait_event_set( &self, _: &mut crate::bindings::WalProposer, diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9e8172c6a1..980fbab22e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -63,6 +63,7 @@ thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index 4bd36185a6..0ed27602cd 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -12,3 +12,11 @@ reqwest.workspace = true utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } +tokio-postgres.workspace = true +tokio-stream.workspace = true +tokio.workspace = true +futures.workspace = true +tokio-util.workspace = true +anyhow.workspace = true +postgres.workspace = true +bytes.workspace = true diff --git a/pageserver/client/src/lib.rs b/pageserver/client/src/lib.rs index 3963fd466c..4a3f4dea47 100644 --- 
a/pageserver/client/src/lib.rs +++ b/pageserver/client/src/lib.rs @@ -1 +1,2 @@ pub mod mgmt_api; +pub mod page_service; diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 77eb1bb8e2..87e4ed8efd 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -5,6 +5,8 @@ use utils::{ id::{TenantId, TimelineId}, }; +pub mod util; + #[derive(Debug)] pub struct Client { mgmt_api_endpoint: String, @@ -64,6 +66,18 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn tenant_details( + &self, + tenant_id: TenantId, + ) -> Result { + let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint); + self.get(uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn list_timelines( &self, tenant_id: TenantId, diff --git a/pageserver/client/src/mgmt_api/util.rs b/pageserver/client/src/mgmt_api/util.rs new file mode 100644 index 0000000000..048a3bb7cd --- /dev/null +++ b/pageserver/client/src/mgmt_api/util.rs @@ -0,0 +1,49 @@ +//! Helpers to do common higher-level tasks with the [`Client`]. + +use std::sync::Arc; + +use tokio::task::JoinSet; +use utils::id::{TenantId, TenantTimelineId}; + +use super::Client; + +/// Retrieve a list of all of the pageserver's timelines. +/// +/// Fails if there are sharded tenants present on the pageserver. +pub async fn get_pageserver_tenant_timelines_unsharded( + api_client: &Arc, +) -> anyhow::Result> { + let mut timelines: Vec = Vec::new(); + let mut tenants: Vec = Vec::new(); + for ti in api_client.list_tenants().await? { + if !ti.id.is_unsharded() { + anyhow::bail!( + "only unsharded tenants are supported at this time: {}", + ti.id + ); + } + tenants.push(ti.id.tenant_id) + } + let mut js = JoinSet::new(); + for tenant_id in tenants { + js.spawn({ + let mgmt_api_client = Arc::clone(api_client); + async move { + ( + tenant_id, + mgmt_api_client.tenant_details(tenant_id).await.unwrap(), + ) + } + }); + } + while let Some(res) = js.join_next().await { + let (tenant_id, details) = res.unwrap(); + for timeline_id in details.timelines { + timelines.push(TenantTimelineId { + tenant_id, + timeline_id, + }); + } + } + Ok(timelines) +} diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs new file mode 100644 index 0000000000..fc0d2311f7 --- /dev/null +++ b/pageserver/client/src/page_service.rs @@ -0,0 +1,151 @@ +use std::pin::Pin; + +use futures::SinkExt; +use pageserver_api::{ + models::{ + PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, + PagestreamGetPageResponse, + }, + reltag::RelTag, +}; +use tokio::task::JoinHandle; +use tokio_postgres::CopyOutStream; +use tokio_stream::StreamExt; +use tokio_util::sync::CancellationToken; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +pub struct Client { + client: tokio_postgres::Client, + cancel_on_client_drop: Option, + conn_task: JoinHandle<()>, +} + +pub struct BasebackupRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub lsn: Option, + pub gzip: bool, +} + +impl Client { + pub async fn new(connstring: String) -> anyhow::Result { + let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?; + + let conn_task_cancel = CancellationToken::new(); + let conn_task = tokio::spawn({ + let conn_task_cancel = conn_task_cancel.clone(); + async move { + tokio::select! 
{ + _ = conn_task_cancel.cancelled() => { } + res = connection => { + res.unwrap(); + } + } + } + }); + Ok(Self { + cancel_on_client_drop: Some(conn_task_cancel.drop_guard()), + conn_task, + client, + }) + } + + pub async fn pagestream( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> anyhow::Result { + let copy_both: tokio_postgres::CopyBothDuplex = self + .client + .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}")) + .await?; + let Client { + cancel_on_client_drop, + conn_task, + client: _, + } = self; + Ok(PagestreamClient { + copy_both: Box::pin(copy_both), + conn_task, + cancel_on_client_drop, + }) + } + + pub async fn basebackup(&self, req: &BasebackupRequest) -> anyhow::Result { + let BasebackupRequest { + tenant_id, + timeline_id, + lsn, + gzip, + } = req; + let mut args = Vec::with_capacity(5); + args.push("basebackup".to_string()); + args.push(format!("{tenant_id}")); + args.push(format!("{timeline_id}")); + if let Some(lsn) = lsn { + args.push(format!("{lsn}")); + } + if *gzip { + args.push("--gzip".to_string()) + } + Ok(self.client.copy_out(&args.join(" ")).await?) + } +} + +/// Create using [`Client::pagestream`]. +pub struct PagestreamClient { + copy_both: Pin>>, + cancel_on_client_drop: Option, + conn_task: JoinHandle<()>, +} + +pub struct RelTagBlockNo { + pub rel_tag: RelTag, + pub block_no: u32, +} + +impl PagestreamClient { + pub async fn shutdown(mut self) { + let _ = self.cancel_on_client_drop.take(); + self.conn_task.await.unwrap(); + } + + pub async fn getpage( + &mut self, + key: RelTagBlockNo, + lsn: Lsn, + ) -> anyhow::Result { + let req = PagestreamGetPageRequest { + latest: false, + rel: key.rel_tag, + blkno: key.block_no, + lsn, + }; + let req = PagestreamFeMessage::GetPage(req); + let req: bytes::Bytes = req.serialize(); + // let mut req = tokio_util::io::ReaderStream::new(&req); + let mut req = tokio_stream::once(Ok(req)); + + self.copy_both.send_all(&mut req).await?; + + let next: Option> = self.copy_both.next().await; + let next: bytes::Bytes = next.unwrap()?; + + let msg = PagestreamBeMessage::deserialize(next)?; + match msg { + PagestreamBeMessage::GetPage(p) => Ok(p), + PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), + PagestreamBeMessage::Exists(_) + | PagestreamBeMessage::Nblocks(_) + | PagestreamBeMessage::DbSize(_) => { + anyhow::bail!( + "unexpected be message kind in response to getpage request: {}", + msg.kind() + ) + } + } + } +} diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml new file mode 100644 index 0000000000..169d9b7f8e --- /dev/null +++ b/pageserver/pagebench/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "pagebench" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +clap.workspace = true +futures.workspace = true +hdrhistogram.workspace = true +humantime.workspace = true +humantime-serde.workspace = true +rand.workspace = true +serde.workspace = true +serde_json.workspace = true +tracing.workspace = true +tokio.workspace = true + +pageserver = { path = ".." 
} +pageserver_client.workspace = true +pageserver_api.workspace = true +utils = { path = "../../libs/utils/" } +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs new file mode 100644 index 0000000000..85a3e695de --- /dev/null +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -0,0 +1,272 @@ +use anyhow::Context; +use pageserver_client::page_service::BasebackupRequest; + +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{debug, info, instrument}; + +use std::collections::HashMap; +use std::num::NonZeroUsize; +use std::ops::Range; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use crate::util::tokio_thread_local_stats::AllThreadLocalStats; +use crate::util::{request_stats, tokio_thread_local_stats}; + +/// basebackup@LatestLSN +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "localhost:64000")] + page_service_host_port: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long, default_value = "1")] + num_clients: NonZeroUsize, + #[clap(long, default_value = "1.0")] + gzip_probability: f64, + #[clap(long)] + runtime: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +#[derive(Debug, Default)] +struct LiveStats { + completed_requests: AtomicU64, +} + +impl LiveStats { + fn inc(&self) { + self.completed_requests.fetch_add(1, Ordering::Relaxed); + } +} + +struct Target { + timeline: TenantTimelineId, + lsn_range: Option>, +} + +#[derive(serde::Serialize)] +struct Output { + total: request_stats::Output, +} + +tokio_thread_local_stats::declare!(STATS: request_stats::Stats); + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + tokio_thread_local_stats::main!(STATS, move |thread_local_stats| { + main_impl(args, thread_local_stats) + }) +} + +async fn main_impl( + args: Args, + all_thread_local_stats: AllThreadLocalStats, +) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + let mut js = JoinSet::new(); + for timeline in &timelines { + js.spawn({ + let timeline = *timeline; + // FIXME: this triggers initial logical size calculation + // https://github.com/neondatabase/neon/issues/6168 + let info = mgmt_api_client + .timeline_info(timeline.tenant_id, timeline.timeline_id) + .await + .unwrap(); + async move { + anyhow::Ok(Target { + timeline, + // TODO: support lsn_range != latest LSN + lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)), + }) + } + }); + } + let mut all_targets: Vec = Vec::new(); + while let Some(res) = js.join_next().await { + all_targets.push(res.unwrap().unwrap()); + } + + let live_stats = Arc::new(LiveStats::default()); + + let num_client_tasks = timelines.len(); + let num_live_stats_dump = 1; + let num_work_sender_tasks = 1; + + let start_work_barrier = Arc::new(tokio::sync::Barrier::new( + 
num_client_tasks + num_live_stats_dump + num_work_sender_tasks, + )); + let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks)); + + tokio::spawn({ + let stats = Arc::clone(&live_stats); + let start_work_barrier = Arc::clone(&start_work_barrier); + async move { + start_work_barrier.wait().await; + loop { + let start = std::time::Instant::now(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let elapsed = start.elapsed(); + info!( + "RPS: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64() + ); + } + } + }); + + let mut work_senders = HashMap::new(); + let mut tasks = Vec::new(); + for tl in &timelines { + let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are + work_senders.insert(tl, sender); + tasks.push(tokio::spawn(client( + args, + *tl, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + ))); + } + + let work_sender = async move { + start_work_barrier.wait().await; + loop { + let (timeline, work) = { + let mut rng = rand::thread_rng(); + let target = all_targets.choose(&mut rng).unwrap(); + let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r)); + ( + target.timeline, + Work { + lsn, + gzip: rng.gen_bool(args.gzip_probability), + }, + ) + }; + let sender = work_senders.get(&timeline).unwrap(); + // TODO: what if this blocks? + sender.send(work).await.ok().unwrap(); + } + }; + + if let Some(runtime) = args.runtime { + match tokio::time::timeout(runtime.into(), work_sender).await { + Ok(()) => unreachable!("work sender never terminates"), + Err(_timeout) => { + // this implicitly drops the work_senders, making all the clients exit + } + } + } else { + work_sender.await; + unreachable!("work sender never terminates"); + } + + for t in tasks { + t.await.unwrap(); + } + + let output = Output { + total: { + let mut agg_stats = request_stats::Stats::new(); + for stats in all_thread_local_stats.lock().unwrap().iter() { + let stats = stats.lock().unwrap(); + agg_stats.add(&stats); + } + agg_stats.output() + }, + }; + + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + anyhow::Ok(()) +} + +#[derive(Copy, Clone)] +struct Work { + lsn: Option, + gzip: bool, +} + +#[instrument(skip_all)] +async fn client( + args: &'static Args, + timeline: TenantTimelineId, + start_work_barrier: Arc, + mut work: tokio::sync::mpsc::Receiver, + all_work_done_barrier: Arc, + live_stats: Arc, +) { + start_work_barrier.wait().await; + + let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring( + &args.page_service_host_port, + args.pageserver_jwt.as_deref(), + )) + .await + .unwrap(); + + while let Some(Work { lsn, gzip }) = work.recv().await { + let start = Instant::now(); + let copy_out_stream = client + .basebackup(&BasebackupRequest { + tenant_id: timeline.tenant_id, + timeline_id: timeline.timeline_id, + lsn, + gzip, + }) + .await + .with_context(|| format!("start basebackup for {timeline}")) + .unwrap(); + + use futures::StreamExt; + let size = Arc::new(AtomicUsize::new(0)); + copy_out_stream + .for_each({ + |r| { + let size = Arc::clone(&size); + async move { + let size = Arc::clone(&size); + size.fetch_add(r.unwrap().len(), Ordering::Relaxed); + } + } + }) + .await; + debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + let elapsed = start.elapsed(); + live_stats.inc(); 
+ STATS.with(|stats| { + stats.borrow().lock().unwrap().observe(elapsed).unwrap(); + }); + } + + all_work_done_barrier.wait().await; +} diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs new file mode 100644 index 0000000000..16d198ab0e --- /dev/null +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -0,0 +1,335 @@ +use anyhow::Context; +use futures::future::join_all; +use pageserver::pgdatadir_mapping::key_to_rel_block; +use pageserver::repository; +use pageserver_api::key::is_rel_block_key; +use pageserver_client::page_service::RelTagBlockNo; + +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use rand::prelude::*; +use tokio::sync::Barrier; +use tokio::task::JoinSet; +use tracing::{info, instrument}; + +use std::collections::HashMap; +use std::future::Future; +use std::num::NonZeroUsize; +use std::pin::Pin; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use crate::util::tokio_thread_local_stats::AllThreadLocalStats; +use crate::util::{request_stats, tokio_thread_local_stats}; + +/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace. +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long, default_value = "1")] + num_clients: NonZeroUsize, + #[clap(long)] + runtime: Option, + #[clap(long)] + per_target_rate_limit: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +#[derive(Debug, Default)] +struct LiveStats { + completed_requests: AtomicU64, +} + +impl LiveStats { + fn inc(&self) { + self.completed_requests.fetch_add(1, Ordering::Relaxed); + } +} + +#[derive(Clone)] +struct KeyRange { + timeline: TenantTimelineId, + timeline_lsn: Lsn, + start: i128, + end: i128, +} + +impl KeyRange { + fn len(&self) -> i128 { + self.end - self.start + } +} + +#[derive(serde::Serialize)] +struct Output { + total: request_stats::Output, +} + +tokio_thread_local_stats::declare!(STATS: request_stats::Stats); + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + tokio_thread_local_stats::main!(STATS, move |thread_local_stats| { + main_impl(args, thread_local_stats) + }) +} + +async fn main_impl( + args: Args, + all_thread_local_stats: AllThreadLocalStats, +) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + let mut js = JoinSet::new(); + for timeline in &timelines { + js.spawn({ + let mgmt_api_client = Arc::clone(&mgmt_api_client); + let timeline = *timeline; + async move { + let partitioning = mgmt_api_client + .keyspace(timeline.tenant_id, timeline.timeline_id) + .await?; + let lsn = partitioning.at_lsn; + + let ranges = partitioning + .keys + .ranges + .iter() + .filter_map(|r| { + let start = r.start; + let end = r.end; + // filter out non-relblock keys + match (is_rel_block_key(&start), is_rel_block_key(&end)) { + 
(true, true) => Some(KeyRange { + timeline, + timeline_lsn: lsn, + start: start.to_i128(), + end: end.to_i128(), + }), + (true, false) | (false, true) => { + unimplemented!("split up range") + } + (false, false) => None, + } + }) + .collect::>(); + + anyhow::Ok(ranges) + } + }); + } + let mut all_ranges: Vec = Vec::new(); + while let Some(res) = js.join_next().await { + all_ranges.extend(res.unwrap().unwrap()); + } + + let live_stats = Arc::new(LiveStats::default()); + + let num_client_tasks = timelines.len(); + let num_live_stats_dump = 1; + let num_work_sender_tasks = 1; + + let start_work_barrier = Arc::new(tokio::sync::Barrier::new( + num_client_tasks + num_live_stats_dump + num_work_sender_tasks, + )); + let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks)); + + tokio::spawn({ + let stats = Arc::clone(&live_stats); + let start_work_barrier = Arc::clone(&start_work_barrier); + async move { + start_work_barrier.wait().await; + loop { + let start = std::time::Instant::now(); + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let elapsed = start.elapsed(); + info!( + "RPS: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64() + ); + } + } + }); + + let mut work_senders = HashMap::new(); + let mut tasks = Vec::new(); + for tl in &timelines { + let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are + work_senders.insert(tl, sender); + tasks.push(tokio::spawn(client( + args, + *tl, + Arc::clone(&start_work_barrier), + receiver, + Arc::clone(&all_work_done_barrier), + Arc::clone(&live_stats), + ))); + } + + let work_sender: Pin>> = match args.per_target_rate_limit { + None => Box::pin(async move { + let weights = rand::distributions::weighted::WeightedIndex::new( + all_ranges.iter().map(|v| v.len()), + ) + .unwrap(); + + start_work_barrier.wait().await; + + loop { + let (range, key) = { + let mut rng = rand::thread_rng(); + let r = &all_ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = repository::Key::from_i128(key); + let (rel_tag, block_no) = + key_to_rel_block(key).expect("we filter non-rel-block keys out above"); + (r, RelTagBlockNo { rel_tag, block_no }) + }; + let sender = work_senders.get(&range.timeline).unwrap(); + // TODO: what if this blocks? 
+ sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + } + }), + Some(rps_limit) => Box::pin(async move { + let period = Duration::from_secs_f64(1.0 / (rps_limit as f64)); + + let make_timeline_task: &dyn Fn( + TenantTimelineId, + ) + -> Pin>> = &|timeline| { + let sender = work_senders.get(&timeline).unwrap(); + let ranges: Vec = all_ranges + .iter() + .filter(|r| r.timeline == timeline) + .cloned() + .collect(); + let weights = rand::distributions::weighted::WeightedIndex::new( + ranges.iter().map(|v| v.len()), + ) + .unwrap(); + + Box::pin(async move { + let mut ticker = tokio::time::interval(period); + ticker.set_missed_tick_behavior( + /* TODO review this choice */ + tokio::time::MissedTickBehavior::Burst, + ); + loop { + ticker.tick().await; + let (range, key) = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = repository::Key::from_i128(key); + let (rel_tag, block_no) = key_to_rel_block(key) + .expect("we filter non-rel-block keys out above"); + (r, RelTagBlockNo { rel_tag, block_no }) + }; + sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + } + }) + }; + + let tasks: Vec<_> = work_senders + .keys() + .map(|tl| make_timeline_task(**tl)) + .collect(); + + start_work_barrier.wait().await; + + join_all(tasks).await; + }), + }; + + if let Some(runtime) = args.runtime { + match tokio::time::timeout(runtime.into(), work_sender).await { + Ok(()) => unreachable!("work sender never terminates"), + Err(_timeout) => { + // this implicitly drops the work_senders, making all the clients exit + } + } + } else { + work_sender.await; + unreachable!("work sender never terminates"); + } + + for t in tasks { + t.await.unwrap(); + } + + let output = Output { + total: { + let mut agg_stats = request_stats::Stats::new(); + for stats in all_thread_local_stats.lock().unwrap().iter() { + let stats = stats.lock().unwrap(); + agg_stats.add(&stats); + } + agg_stats.output() + }, + }; + + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + anyhow::Ok(()) +} + +#[instrument(skip_all)] +async fn client( + args: &'static Args, + timeline: TenantTimelineId, + start_work_barrier: Arc, + mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>, + all_work_done_barrier: Arc, + live_stats: Arc, +) { + start_work_barrier.wait().await; + + let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); + let mut client = client + .pagestream(timeline.tenant_id, timeline.timeline_id) + .await + .unwrap(); + + while let Some((key, lsn)) = work.recv().await { + let start = Instant::now(); + client + .getpage(key, lsn) + .await + .with_context(|| format!("getpage for {timeline}")) + .unwrap(); + let elapsed = start.elapsed(); + live_stats.inc(); + STATS.with(|stats| { + stats.borrow().lock().unwrap().observe(elapsed).unwrap(); + }); + } + + all_work_done_barrier.wait().await; +} diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs new file mode 100644 index 0000000000..d46ae94e8a --- /dev/null +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -0,0 +1,85 @@ +use std::sync::Arc; + +use humantime::Duration; +use tokio::task::JoinSet; +use utils::id::TenantTimelineId; + +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + 
#[clap(long, default_value = "localhost:64000")] + page_service_host_port: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap( + long, + help = "if specified, poll mgmt api to check whether init logical size calculation has completed" + )] + poll_for_completion: Option, + #[clap(long)] + limit_to_first_n_targets: Option, + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(args)); + rt.block_on(main_task).unwrap() +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + // kick it off + + let mut js = JoinSet::new(); + for tl in timelines { + let mgmt_api_client = Arc::clone(&mgmt_api_client); + js.spawn(async move { + // TODO: API to explicitly trigger initial logical size computation. + // Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation. + // => https://github.com/neondatabase/neon/issues/6168 + let info = mgmt_api_client + .timeline_info(tl.tenant_id, tl.timeline_id) + .await + .unwrap(); + + if let Some(period) = args.poll_for_completion { + let mut ticker = tokio::time::interval(period.into()); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + let mut info = info; + while !info.current_logical_size_is_accurate { + ticker.tick().await; + info = mgmt_api_client + .timeline_info(tl.tenant_id, tl.timeline_id) + .await + .unwrap(); + } + } + }); + } + while let Some(res) = js.join_next().await { + let _: () = res.unwrap(); + } + Ok(()) +} diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs new file mode 100644 index 0000000000..e0120c9212 --- /dev/null +++ b/pageserver/pagebench/src/main.rs @@ -0,0 +1,48 @@ +use clap::Parser; +use utils::logging; + +/// Re-usable pieces of code that aren't CLI-specific. +mod util { + pub(crate) mod connstring; + pub(crate) mod request_stats; + #[macro_use] + pub(crate) mod tokio_thread_local_stats; + /// Re-usable pieces of CLI-specific code. + pub(crate) mod cli { + pub(crate) mod targets; + } +} + +/// The pagebench CLI sub-commands, dispatched in [`main`] below. +mod cmd { + pub(super) mod basebackup; + pub(super) mod getpage_latest_lsn; + pub(super) mod trigger_initial_size_calculation; +} + +/// Component-level performance test for pageserver. 
+#[derive(clap::Parser)] +enum Args { + Basebackup(cmd::basebackup::Args), + GetPageLatestLsn(cmd::getpage_latest_lsn::Args), + TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), +} + +fn main() { + logging::init( + logging::LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + logging::Output::Stderr, + ) + .unwrap(); + + let args = Args::parse(); + match args { + Args::Basebackup(args) => cmd::basebackup::main(args), + Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args), + Args::TriggerInitialSizeCalculation(args) => { + cmd::trigger_initial_size_calculation::main(args) + } + } + .unwrap() +} diff --git a/pageserver/pagebench/src/util/cli/targets.rs b/pageserver/pagebench/src/util/cli/targets.rs new file mode 100644 index 0000000000..848eae27cf --- /dev/null +++ b/pageserver/pagebench/src/util/cli/targets.rs @@ -0,0 +1,34 @@ +use std::sync::Arc; + +use pageserver_client::mgmt_api; +use tracing::info; +use utils::id::TenantTimelineId; + +pub(crate) struct Spec { + pub(crate) limit_to_first_n_targets: Option, + pub(crate) targets: Option>, +} + +pub(crate) async fn discover( + api_client: &Arc, + spec: Spec, +) -> anyhow::Result> { + let mut timelines = if let Some(targets) = spec.targets { + targets + } else { + mgmt_api::util::get_pageserver_tenant_timelines_unsharded(api_client).await? + }; + + if let Some(limit) = spec.limit_to_first_n_targets { + timelines.sort(); // for determinism + timelines.truncate(limit); + if timelines.len() < limit { + anyhow::bail!("pageserver has less than limit_to_first_n_targets={limit} tenants"); + } + } + + info!("timelines:\n{:?}", timelines); + info!("number of timelines:\n{:?}", timelines.len()); + + Ok(timelines) +} diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs new file mode 100644 index 0000000000..07a0ff042d --- /dev/null +++ b/pageserver/pagebench/src/util/connstring.rs @@ -0,0 +1,8 @@ +pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String { + let colon_and_jwt = if let Some(jwt) = jwt { + format!(":{jwt}") // TODO: urlescape + } else { + String::new() + }; + format!("postgres://postgres{colon_and_jwt}@{host_port}") +} diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs new file mode 100644 index 0000000000..5ecf1cbf24 --- /dev/null +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -0,0 +1,88 @@ +use std::time::Duration; + +use anyhow::Context; + +pub(crate) struct Stats { + latency_histo: hdrhistogram::Histogram, +} + +impl Stats { + pub(crate) fn new() -> Self { + Self { + // Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram, + // which would skew the benchmark results. 
+ latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(), + } + } + pub(crate) fn observe(&mut self, latency: Duration) -> anyhow::Result<()> { + let micros: u64 = latency + .as_micros() + .try_into() + .context("latency greater than u64")?; + self.latency_histo + .record(micros) + .context("add to histogram")?; + Ok(()) + } + pub(crate) fn output(&self) -> Output { + let latency_percentiles = std::array::from_fn(|idx| { + let micros = self + .latency_histo + .value_at_percentile(LATENCY_PERCENTILES[idx]); + Duration::from_micros(micros) + }); + Output { + request_count: self.latency_histo.len(), + latency_mean: Duration::from_micros(self.latency_histo.mean() as u64), + latency_percentiles: LatencyPercentiles { + latency_percentiles, + }, + } + } + pub(crate) fn add(&mut self, other: &Self) { + let Self { + ref mut latency_histo, + } = self; + latency_histo.add(&other.latency_histo).unwrap(); + } +} + +impl Default for Stats { + fn default() -> Self { + Self::new() + } +} + +const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99]; + +struct LatencyPercentiles { + latency_percentiles: [Duration; 4], +} + +impl serde::Serialize for LatencyPercentiles { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + use serde::ser::SerializeMap; + let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?; + for (i, p) in LATENCY_PERCENTILES.iter().enumerate() { + ser.serialize_entry( + &format!("p{p}"), + &format!( + "{}", + &humantime::format_duration(self.latency_percentiles[i]) + ), + )?; + } + ser.end() + } +} + +#[derive(serde::Serialize)] +pub(crate) struct Output { + request_count: u64, + #[serde(with = "humantime_serde")] + latency_mean: Duration, + latency_percentiles: LatencyPercentiles, +} diff --git a/pageserver/pagebench/src/util/tokio_thread_local_stats.rs b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs new file mode 100644 index 0000000000..82526213b6 --- /dev/null +++ b/pageserver/pagebench/src/util/tokio_thread_local_stats.rs @@ -0,0 +1,45 @@ +pub(crate) type ThreadLocalStats<T> = Arc<Mutex<T>>; +pub(crate) type AllThreadLocalStats<T> = Arc<Mutex<Vec<ThreadLocalStats<T>>>>; + +macro_rules! declare { + ($THREAD_LOCAL_NAME:ident: $T:ty) => { + thread_local! { + pub static $THREAD_LOCAL_NAME: std::cell::RefCell<ThreadLocalStats<$T>> = std::cell::RefCell::new( + std::sync::Arc::new(std::sync::Mutex::new(Default::default())) + ); + } + }; +} + +use std::sync::{Arc, Mutex}; + +pub(crate) use declare; + +macro_rules!
main { + ($THREAD_LOCAL_NAME:ident, $main_impl:expr) => {{ + let main_impl = $main_impl; + let all = Arc::new(Mutex::new(Vec::new())); + + let rt = tokio::runtime::Builder::new_multi_thread() + .on_thread_start({ + let all = Arc::clone(&all); + move || { + // pre-initialize the thread local stats by accessesing them + // (some stats like requests_stats::Stats are quite costly to initialize, + // we don't want to pay that cost during the measurement period) + $THREAD_LOCAL_NAME.with(|stats| { + let stats: Arc<_> = Arc::clone(&*stats.borrow()); + all.lock().unwrap().push(stats); + }); + } + }) + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(all)); + rt.block_on(main_task).unwrap() + }}; +} + +pub(crate) use main; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index bd63c4d860..8516f397ca 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1468,6 +1468,7 @@ threshold = "20m" period: Duration::from_secs(10), #[cfg(feature = "testing")] mock_statvfs: None, + eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, }) ); match &conf.default_tenant_conf.eviction_policy { diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 76906cfaf7..23b9b573b6 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -74,6 +74,45 @@ pub struct DiskUsageEvictionTaskConfig { pub period: Duration, #[cfg(feature = "testing")] pub mock_statvfs: Option, + /// Select sorting for evicted layers + #[serde(default)] + pub eviction_order: EvictionOrder, +} + +/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` +/// partitioning. +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "type", content = "args")] +pub enum EvictionOrder { + /// Order the layers to be evicted by how recently they have been accessed in absolute + /// time. + /// + /// This strategy is unfair when some tenants grow faster than others towards the slower + /// growing. + #[default] + AbsoluteAccessed, + + /// Order the layers to be evicted by how recently they have been accessed relatively within + /// the set of resident layers of a tenant. + /// + /// This strategy will evict layers more fairly but is untested. + RelativeAccessed { + #[serde(default)] + highest_layer_count_loses_first: bool, + }, +} + +impl EvictionOrder { + /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer + /// counts should be the first ones to have their layers evicted. 
+ fn highest_layer_count_loses_first(&self) -> bool { + match self { + EvictionOrder::AbsoluteAccessed => false, + EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first, + } => *highest_layer_count_loses_first, + } + } } #[derive(Default)] @@ -192,7 +231,14 @@ async fn disk_usage_eviction_task_iteration( ) -> anyhow::Result<()> { let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) .context("get filesystem-level disk usage before evictions")?; - let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await; + let res = disk_usage_eviction_task_iteration_impl( + state, + storage, + usage_pre, + task_config.eviction_order, + cancel, + ) + .await; match res { Ok(outcome) => { debug!(?outcome, "disk_usage_eviction_iteration finished"); @@ -278,6 +324,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( state: &State, _storage: &GenericRemoteStorage, usage_pre: U, + eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result> { // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) @@ -297,7 +344,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - let candidates = match collect_eviction_candidates(cancel).await? { + let candidates = match collect_eviction_candidates(eviction_order, cancel).await? { EvictionCandidates::Cancelled => { return Ok(IterationOutcome::Cancelled); } @@ -307,16 +354,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // Debug-log the list of candidates let now = SystemTime::now(); for (i, (partition, candidate)) in candidates.iter().enumerate() { + let nth = i + 1; let desc = candidate.layer.layer_desc(); + let total_candidates = candidates.len(); + let size = desc.file_size; + let rel = candidate.relative_last_activity; debug!( - "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}", - i + 1, - candidates.len(), - desc.file_size, + "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}", now.duration_since(candidate.last_activity_ts) .unwrap() .as_micros(), - partition, desc.tenant_shard_id, desc.timeline_id, candidate.layer, @@ -459,6 +506,7 @@ struct EvictionCandidate { timeline: Arc, layer: Layer, last_activity_ts: SystemTime, + relative_last_activity: finite_f32::FiniteF32, } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] @@ -478,24 +526,24 @@ enum EvictionCandidates { /// order. A caller that evicts in that order, until pressure is relieved, implements /// the eviction policy outlined in the module comment. /// -/// # Example +/// # Example with EvictionOrder::AbsoluteAccessed /// /// Imagine that there are two tenants, A and B, with five layers each, a-e. /// Each layer has size 100, and both tenant's min_resident_size is 150. /// The eviction order would be /// /// ```text -/// partition last_activity_ts tenant/layer -/// Above 18:30 A/c -/// Above 19:00 A/b -/// Above 18:29 B/c -/// Above 19:05 B/b -/// Above 20:00 B/a -/// Above 20:03 A/a -/// Below 20:30 A/d -/// Below 20:40 B/d -/// Below 20:45 B/e -/// Below 20:58 A/e +/// partition last_activity_ts tenant/layer +/// Above 18:30 A/c +/// Above 19:00 A/b +/// Above 18:29 B/c +/// Above 19:05 B/b +/// Above 20:00 B/a +/// Above 20:03 A/a +/// Below 20:30 A/d +/// Below 20:40 B/d +/// Below 20:45 B/e +/// Below 20:58 A/e /// ``` /// /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`. 
@@ -505,7 +553,77 @@ enum EvictionCandidates { /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition /// after exhauting the `Above` partition. /// So, we did not respect each tenant's min_resident_size. +/// +/// # Example with EvictionOrder::RelativeAccessed +/// +/// ```text +/// partition relative_age last_activity_ts tenant/layer +/// Above 0/4 18:30 A/c +/// Above 0/4 18:29 B/c +/// Above 1/4 19:00 A/b +/// Above 1/4 19:05 B/b +/// Above 2/4 20:00 B/a +/// Above 2/4 20:03 A/a +/// Below 3/4 20:30 A/d +/// Below 3/4 20:40 B/d +/// Below 4/4 20:45 B/e +/// Below 4/4 20:58 A/e +/// ``` +/// +/// With tenants having the same number of layers the picture does not change much. The same with +/// A having many more layers **resident** (not all of them listed): +/// +/// ```text +/// Above 0/100 18:30 A/c +/// Above 0/4 18:29 B/c +/// Above 1/100 19:00 A/b +/// Above 2/100 20:03 A/a +/// Above 3/100 20:03 A/nth_3 +/// Above 4/100 20:03 A/nth_4 +/// ... +/// Above 1/4 19:05 B/b +/// Above 25/100 20:04 A/nth_25 +/// ... +/// Above 2/4 20:00 B/a +/// Above 50/100 20:10 A/nth_50 +/// ... +/// Below 3/4 20:40 B/d +/// Below 99/100 20:30 A/nth_99 +/// Below 4/4 20:45 B/e +/// Below 100/100 20:58 A/nth_100 +/// ``` +/// +/// Now it's easier to see that because A has grown fast it has more layers to get evicted. What is +/// difficult to see is what happens on the next round assuming the evicting 23 from the above list +/// relieves the pressure (22 A layers gone, 1 B layers gone) but a new fast growing tenant C has +/// appeared: +/// +/// ```text +/// Above 0/87 20:04 A/nth_23 +/// Above 0/3 19:05 B/b +/// Above 0/50 20:59 C/nth_0 +/// Above 1/87 20:04 A/nth_24 +/// Above 1/50 21:00 C/nth_1 +/// Above 2/87 20:04 A/nth_25 +/// ... +/// Above 16/50 21:02 C/nth_16 +/// Above 1/3 20:00 B/a +/// Above 27/87 20:10 A/nth_50 +/// ... +/// Below 2/3 20:40 B/d +/// Below 49/50 21:05 C/nth_49 +/// Below 86/87 20:30 A/nth_99 +/// Below 3/3 20:45 B/e +/// Below 50/50 21:05 C/nth_50 +/// Below 87/87 20:58 A/nth_100 +/// ``` +/// +/// Now relieving pressure with 23 layers would cost: +/// - tenant A 14 layers +/// - tenant B 1 layer +/// - tenant C 8 layers async fn collect_eviction_candidates( + eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result { // get a snapshot of the list of tenants @@ -591,12 +709,63 @@ async fn collect_eviction_candidates( tenant_candidates .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; - for (timeline, layer_info) in tenant_candidates.into_iter() { + + // keeping the -1 or not decides if every tenant should lose their least recently accessed + // layer OR if this should happen in the order of having highest layer count: + let fudge = if eviction_order.highest_layer_count_loses_first() { + // relative_age vs. tenant layer count: + // - 0.1..=1.0 (10 layers) + // - 0.01..=1.0 (100 layers) + // - 0.001..=1.0 (1000 layers) + // + // leading to evicting less of the smallest tenants. + 0 + } else { + // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a + // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could + // be that less than 10k layer evictions is enough, so we would not need to evict from + // all tenants. + // + // as the tenant ordering is now deterministic this could hit the same tenants + // disproportionetly on multiple invocations. 
alternative could be to remember how many + // layers did we evict last time from this tenant, and inject that as an additional + // fudge here. + 1 + }; + + let total = tenant_candidates + .len() + .checked_sub(fudge) + .filter(|&x| x > 0) + // support 0 or 1 resident layer tenants as well + .unwrap_or(1); + let divider = total as f32; + + for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() { let file_size = layer_info.file_size(); + + // as we iterate this reverse sorted list, the most recently accessed layer will always + // be 1.0; this is for us to evict it last. + let relative_last_activity = if matches!( + eviction_order, + EvictionOrder::RelativeAccessed { .. } + ) { + // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or + // similarly for u16. unsure how it would help. + finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider) + .unwrap_or_else(|val| { + tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}"); + finite_f32::FiniteF32::ZERO + }) + } else { + finite_f32::FiniteF32::ZERO + }; + let candidate = EvictionCandidate { timeline, last_activity_ts: layer_info.last_activity_ts, layer: layer_info.layer, + relative_last_activity, }; let partition = if cumsum > min_resident_size as i128 { MinResidentSizePartition::Above @@ -610,8 +779,19 @@ async fn collect_eviction_candidates( debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - candidates - .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + + match eviction_order { + EvictionOrder::AbsoluteAccessed => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.last_activity_ts) + }); + } + EvictionOrder::RelativeAccessed { .. } => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.relative_last_activity) + }); + } + } Ok(EvictionCandidates::Finished(candidates)) } @@ -640,6 +820,66 @@ impl std::ops::Deref for TimelineKey { } } +/// A totally ordered f32 subset we can use with sorting functions. +mod finite_f32 { + + /// A totally ordered f32 subset we can use with sorting functions. 
+ #[derive(Clone, Copy, PartialEq)] + pub struct FiniteF32(f32); + + impl std::fmt::Debug for FiniteF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(&self.0, f) + } + } + + impl std::fmt::Display for FiniteF32 { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.0, f) + } + } + + impl std::cmp::Eq for FiniteF32 {} + + impl std::cmp::PartialOrd for FiniteF32 { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + impl std::cmp::Ord for FiniteF32 { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.0.total_cmp(&other.0) + } + } + + impl TryFrom for FiniteF32 { + type Error = f32; + + fn try_from(value: f32) -> Result { + if value.is_finite() { + Ok(FiniteF32(value)) + } else { + Err(value) + } + } + } + + impl FiniteF32 { + pub const ZERO: FiniteF32 = FiniteF32(0.0); + + pub fn try_from_normalized(value: f32) -> Result { + if (0.0..=1.0).contains(&value) { + // -0.0 is within the range, make sure it is assumed 0.0..=1.0 + let value = value.abs(); + Ok(FiniteF32(value)) + } else { + Err(value) + } + } + } +} + mod filesystem_level_usage { use anyhow::Context; use camino::Utf8Path; @@ -721,6 +961,7 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { + use super::EvictionOrder; use super::Usage as _; use std::time::Duration; use utils::serde_percent::Percent; @@ -732,6 +973,7 @@ mod filesystem_level_usage { period: Duration::MAX, #[cfg(feature = "testing")] mock_statvfs: None, + eviction_order: EvictionOrder::default(), }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index b79c5ada9a..1fbca1086f 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -159,6 +159,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" + "412": + description: Deletion may not proceed, tenant is not in Active state + content: + application/json: + schema: + $ref: "#/components/schemas/PreconditionFailedError" "500": description: Generic operation error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 601fad5bde..11a3a2c872 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -14,6 +14,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::TenantDetails; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, TenantLoadRequest, TenantLocationConfigRequest, @@ -307,6 +308,7 @@ impl From for ApiError { SlotUpsertError(e) => e.into(), Other(o) => ApiError::InternalServerError(o), e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()), + Cancelled => ApiError::ShuttingDown, } } } @@ -592,8 +594,6 @@ async fn get_lsn_by_timestamp_handler( ))); } - let version: Option = parse_query_param(&request, "version")?; - let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let timestamp_raw = must_get_query_param(&request, "timestamp")?; let timestamp = humantime::parse_rfc3339(×tamp_raw) @@ -606,31 +606,18 @@ async fn get_lsn_by_timestamp_handler( let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; - - if version.unwrap_or(0) > 1 { - #[derive(serde::Serialize)] - struct Result { - lsn: Lsn, - kind: &'static 
str, - } - let (lsn, kind) = match result { - LsnForTimestamp::Present(lsn) => (lsn, "present"), - LsnForTimestamp::Future(lsn) => (lsn, "future"), - LsnForTimestamp::Past(lsn) => (lsn, "past"), - LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), - }; - json_response(StatusCode::OK, Result { lsn, kind }) - } else { - // FIXME: this is a temporary crutch not to break backwards compatibility - // See https://github.com/neondatabase/neon/pull/5608 - let result = match result { - LsnForTimestamp::Present(lsn) => format!("{lsn}"), - LsnForTimestamp::Future(_lsn) => "future".into(), - LsnForTimestamp::Past(_lsn) => "past".into(), - LsnForTimestamp::NoData(_lsn) => "nodata".into(), - }; - json_response(StatusCode::OK, result) + #[derive(serde::Serialize)] + struct Result { + lsn: Lsn, + kind: &'static str, } + let (lsn, kind) = match result { + LsnForTimestamp::Present(lsn) => (lsn, "present"), + LsnForTimestamp::Future(lsn) => (lsn, "future"), + LsnForTimestamp::Past(lsn) => (lsn, "past"), + LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), + }; + json_response(StatusCode::OK, Result { lsn, kind }) } async fn get_timestamp_of_lsn_handler( @@ -872,11 +859,14 @@ async fn tenant_status( } let state = tenant.current_state(); - Result::<_, ApiError>::Ok(TenantInfo { - id: tenant_shard_id, - state: state.clone(), - current_physical_size: Some(current_physical_size), - attachment_status: state.attachment_status(), + Result::<_, ApiError>::Ok(TenantDetails { + tenant_info: TenantInfo { + id: tenant_shard_id, + state: state.clone(), + current_physical_size: Some(current_physical_size), + attachment_status: state.attachment_status(), + }, + timelines: tenant.list_timeline_ids(), }) } .instrument(info_span!("tenant_status_handler", @@ -897,7 +887,9 @@ async fn tenant_delete_handler( let state = get_state(&request); - mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id) + state + .tenant_manager + .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug() @@ -1577,19 +1569,22 @@ async fn disk_usage_eviction_run( struct Config { /// How many bytes to evict before reporting that pressure is relieved. 
evict_bytes: u64, + + #[serde(default)] + eviction_order: crate::disk_usage_eviction_task::EvictionOrder, } #[derive(Debug, Clone, Copy, serde::Serialize)] struct Usage { // remains unchanged after instantiation of the struct - config: Config, + evict_bytes: u64, // updated by `add_available_bytes` freed_bytes: u64, } impl crate::disk_usage_eviction_task::Usage for Usage { fn has_pressure(&self) -> bool { - self.config.evict_bytes > self.freed_bytes + self.evict_bytes > self.freed_bytes } fn add_available_bytes(&mut self, bytes: u64) { @@ -1600,7 +1595,7 @@ async fn disk_usage_eviction_run( let config = json_request::(&mut r).await?; let usage = Usage { - config, + evict_bytes: config.evict_bytes, freed_bytes: 0, }; @@ -1615,7 +1610,11 @@ async fn disk_usage_eviction_run( let state = state.disk_usage_eviction_state.clone(); let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( - &state, storage, usage, &cancel, + &state, + storage, + usage, + config.eviction_order, + &cancel, ) .await; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b81037ae47..e9884a15f5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1776,6 +1776,7 @@ pub fn is_inherited_key(key: Key) -> bool { key != AUX_FILES_KEY } +/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`. pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( @@ -1790,7 +1791,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), }) } - pub fn is_rel_fsm_block_key(key: Key) -> bool { key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index eceef6bf78..2f2169d194 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1552,6 +1552,10 @@ impl Tenant { .collect() } + pub fn list_timeline_ids(&self) -> Vec { + self.timelines.lock().unwrap().keys().cloned().collect() + } + /// This is used to create the initial 'main' timeline during bootstrapping, /// or when importing a new base backup. The caller is expected to load an /// initial image of the datadir to the new timeline after this. 
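A side note on the `eviction_order` knob introduced above: `EvictionOrder` is adjacently tagged (`#[serde(tag = "type", content = "args")]`) and defaults to `AbsoluteAccessed`, so both the disk-usage eviction section of the pageserver config and the `disk_usage_eviction_run` debug endpoint's `Config` accept it in the shape sketched below. This is a minimal, self-contained mirror of the enum from this diff, assuming only the `serde` and `serde_json` crates; the `main` function and the printed JSON are illustrative and not part of the patch.

use serde::{Deserialize, Serialize};

// Mirrors the variants and serde attributes of the EvictionOrder enum added in
// pageserver/src/disk_usage_eviction_task.rs; everything else here is a sketch.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", content = "args")]
enum EvictionOrder {
    #[default]
    AbsoluteAccessed,
    RelativeAccessed {
        #[serde(default)]
        highest_layer_count_loses_first: bool,
    },
}

fn main() {
    // Prints: {"type":"AbsoluteAccessed"}
    println!(
        "{}",
        serde_json::to_string(&EvictionOrder::AbsoluteAccessed).unwrap()
    );
    // Prints: {"type":"RelativeAccessed","args":{"highest_layer_count_loses_first":true}}
    println!(
        "{}",
        serde_json::to_string(&EvictionOrder::RelativeAccessed {
            highest_layer_count_loses_first: true
        })
        .unwrap()
    );
}

For the debug endpoint this is the JSON body shape next to `evict_bytes`; the TOML pageserver config would use the equivalent nested-table form, and omitting the field falls back to `AbsoluteAccessed` via `#[serde(default)]`.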
@@ -3130,6 +3134,7 @@ impl Tenant { /// For unit tests, make this visible so that other modules can directly create timelines #[cfg(test)] + #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] pub(crate) async fn bootstrap_timeline_test( &self, timeline_id: TimelineId, diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index e8491f26db..b21bad51ba 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -48,6 +48,9 @@ pub(crate) enum DeleteTenantError { #[error("Timeline {0}")] Timeline(#[from] DeleteTimelineError), + #[error("Cancelled")] + Cancelled, + #[error(transparent)] Other(#[from] anyhow::Error), } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index b2f14db9f7..62922e8c99 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -514,10 +514,7 @@ pub async fn init_tenant_mgr( &ctx, ) { Ok(tenant) => { - tenants.insert( - TenantShardId::unsharded(tenant.tenant_id()), - TenantSlot::Attached(tenant), - ); + tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant)); } Err(e) => { error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); @@ -962,35 +959,27 @@ impl TenantManager { } let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let timelines_path = self.conf.timelines_path(&tenant_shard_id); + + // Directory structure is the same for attached and secondary modes: + // create it if it doesn't exist. Timeline load/creation expects the + // timelines/ subdir to already exist. + // + // Does not need to be fsync'd because local storage is just a cache. + tokio::fs::create_dir_all(&timelines_path) + .await + .with_context(|| format!("Creating {timelines_path}"))?; + + // Before activating either secondary or attached mode, persist the + // configuration, so that on restart we will re-attach (or re-start + // secondary) on the tenant. + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; let new_slot = match &new_location_config.mode { - LocationMode::Secondary(_) => { - // Directory doesn't need to be fsync'd because if we crash it can - // safely be recreated next time this tenant location is configured. 
- tokio::fs::create_dir_all(&tenant_path) - .await - .with_context(|| format!("Creating {tenant_path}"))?; - - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - - TenantSlot::Secondary - } + LocationMode::Secondary(_) => TenantSlot::Secondary, LocationMode::Attached(_attach_config) => { - let timelines_path = self.conf.timelines_path(&tenant_shard_id); - - // Directory doesn't need to be fsync'd because we do not depend on - // it to exist after crashes: it may be recreated when tenant is - // re-attached, see https://github.com/neondatabase/neon/issues/5550 - tokio::fs::create_dir_all(&tenant_path) - .await - .with_context(|| format!("Creating {timelines_path}"))?; - - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; - let shard_identity = new_location_config.shard; let tenant = tenant_spawn( self.conf, @@ -1102,6 +1091,71 @@ impl TenantManager { .collect(), } } + + pub(crate) async fn delete_tenant( + &self, + tenant_shard_id: TenantShardId, + activation_timeout: Duration, + ) -> Result<(), DeleteTenantError> { + // We acquire a SlotGuard during this function to protect against concurrent + // changes while the ::prepare phase of DeleteTenantFlow executes, but then + // have to return the Tenant to the map while the background deletion runs. + // + // TODO: refactor deletion to happen outside the lifetime of a Tenant. + // Currently, deletion requires a reference to the tenants map in order to + // keep the Tenant in the map until deletion is complete, and then remove + // it at the end. + // + // See https://github.com/neondatabase/neon/issues/5080 + + let slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; + + // unwrap is safe because we used MustExist mode when acquiring + let tenant = match slot_guard.get_old_value().as_ref().unwrap() { + TenantSlot::Attached(tenant) => tenant.clone(), + _ => { + // Express "not attached" as equivalent to "not found" + return Err(DeleteTenantError::NotAttached); + } + }; + + match tenant.current_state() { + TenantState::Broken { .. } | TenantState::Stopping { .. } => { + // If a tenant is broken or stopping, DeleteTenantFlow can + // handle it: broken tenants proceed to delete, stopping tenants + // are checked for deletion already in progress. 
+ } + _ => { + tenant + .wait_to_become_active(activation_timeout) + .await + .map_err(|e| match e { + GetActiveTenantError::WillNotBecomeActive(_) => { + DeleteTenantError::InvalidState(tenant.current_state()) + } + GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled, + GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached, + GetActiveTenantError::WaitForActiveTimeout { + latest_state: _latest_state, + wait_time: _wait_time, + } => DeleteTenantError::InvalidState(tenant.current_state()), + })?; + } + } + + let result = DeleteTenantFlow::run( + self.conf, + self.resources.remote_storage.clone(), + &TENANTS, + tenant, + ) + .await; + + // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow + slot_guard.revert(); + result + } } #[derive(Debug, thiserror::Error)] @@ -1279,41 +1333,6 @@ pub(crate) async fn get_active_tenant_with_timeout( Ok(tenant) } -pub(crate) async fn delete_tenant( - conf: &'static PageServerConf, - remote_storage: Option, - tenant_shard_id: TenantShardId, -) -> Result<(), DeleteTenantError> { - // We acquire a SlotGuard during this function to protect against concurrent - // changes while the ::prepare phase of DeleteTenantFlow executes, but then - // have to return the Tenant to the map while the background deletion runs. - // - // TODO: refactor deletion to happen outside the lifetime of a Tenant. - // Currently, deletion requires a reference to the tenants map in order to - // keep the Tenant in the map until deletion is complete, and then remove - // it at the end. - // - // See https://github.com/neondatabase/neon/issues/5080 - - // TODO(sharding): make delete API sharding-aware - let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; - - // unwrap is safe because we used MustExist mode when acquiring - let tenant = match slot_guard.get_old_value().as_ref().unwrap() { - TenantSlot::Attached(tenant) => tenant.clone(), - _ => { - // Express "not attached" as equivalent to "not found" - return Err(DeleteTenantError::NotAttached); - } - }; - - let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await; - - // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow - slot_guard.revert(); - result -} - #[derive(Debug, thiserror::Error)] pub(crate) enum DeleteTimelineError { #[error("Tenant {0}")] diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 52ee8f49ce..1b0cf39fbe 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2192,15 +2192,6 @@ mod tests { let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); - let timeline_path = test_state.harness.timeline_path(&TIMELINE_ID); - let remote_timeline_dir = test_state.harness.remote_fs_dir.join( - timeline_path - .strip_prefix(&test_state.harness.conf.workdir) - .unwrap(), - ); - - std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work"); - let index_path = test_state.harness.remote_fs_dir.join( remote_index_path( &test_state.harness.tenant_shard_id, @@ -2209,6 +2200,10 @@ mod tests { ) .get_path(), ); + + std::fs::create_dir_all(index_path.parent().unwrap()) + .expect("creating test dir should work"); + eprintln!("Writing {index_path}"); std::fs::write(&index_path, index_part_bytes).unwrap(); example_index_part diff --git a/pageserver/src/tenant/storage_layer/layer.rs 
b/pageserver/src/tenant/storage_layer/layer.rs index 9a8ddc1a6b..8ae911b31e 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -878,6 +878,23 @@ impl LayerInner { Ok(()) } Err(e) => { + let consecutive_failures = + this.consecutive_failures.fetch_add(1, Ordering::Relaxed); + + let backoff = utils::backoff::exponential_backoff_duration_seconds( + consecutive_failures.min(u32::MAX as usize) as u32, + 1.5, + 60.0, + ); + + let backoff = std::time::Duration::from_secs_f64(backoff); + + tokio::select! { + _ = tokio::time::sleep(backoff) => {}, + _ = crate::task_mgr::shutdown_token().cancelled_owned() => {}, + _ = timeline.cancel.cancelled() => {}, + }; + Err(e) } }; @@ -926,21 +943,9 @@ impl LayerInner { Ok(permit) } Ok((Err(e), _permit)) => { - // FIXME: this should be with the spawned task and be cancellation sensitive - // - // while we should not need this, this backoff has turned out to be useful with - // a bug of unexpectedly deleted remote layer file (#5787). - let consecutive_failures = - self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + // sleep already happened in the spawned task, if it was not cancelled + let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed); tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); - let backoff = utils::backoff::exponential_backoff_duration_seconds( - consecutive_failures.min(u32::MAX as usize) as u32, - 1.5, - 60.0, - ); - let backoff = std::time::Duration::from_secs_f64(backoff); - - tokio::time::sleep(backoff).await; Err(DownloadError::DownloadFailed) } Err(_gone) => Err(DownloadError::DownloadCancelled), diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 7bfa246eeb..5a5b3d7586 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -138,7 +138,7 @@ pub(super) async fn connection_manager_loop_step( Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { match status.code() { - Code::Unknown if status.message().contains("stream closed because of a broken pipe") => { + Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") => { // tonic's error handling doesn't provide a clear code for disconnections: we get // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe" info!("broker disconnected: {status}"); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 16b245c488..1d14214030 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1612,6 +1612,7 @@ impl<'a> WalIngest<'a> { mod tests { use super::*; use crate::tenant::harness::*; + use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; use crate::tenant::Timeline; use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; @@ -2177,21 +2178,25 @@ mod tests { let pg_version = 15; // The test data was generated by pg15 let path = "test_data/sk_wal_segment_from_pgbench"; let wal_segment_path = format!("{path}/000000010000000000000001.zst"); + let source_initdb_path = format!("{path}/{INITDB_PATH}"); let startpoint = Lsn::from_hex("14AEC08").unwrap(); let endpoint = Lsn::from_hex("1FFFF98").unwrap(); + let harness = 
TenantHarness::create("test_ingest_real_wal").unwrap(); + let (tenant, ctx) = harness.load().await; + + let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID); + let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path()); + + std::fs::create_dir_all(initdb_path.parent().unwrap()) + .expect("creating test dir should work"); + std::fs::copy(source_initdb_path, initdb_path).expect("copying the initdb.tar.zst works"); + // Bootstrap a real timeline. We can't use create_test_timeline because // it doesn't create a real checkpoint, and Walingest::new tries to parse // the garbage data. - // - // TODO use the initdb.tar.zst file stored with the test data to avoid - // problems with inconsistent initdb results after pg minor version bumps. - let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal") - .unwrap() - .load() - .await; let tline = tenant - .bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx) + .bootstrap_timeline_test(TIMELINE_ID, pg_version, Some(TIMELINE_ID), &ctx) .await .unwrap(); diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 466e346e46..c6b224a14d 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -9,6 +9,7 @@ OBJS = \ libpagestore.o \ neon.o \ neon_utils.o \ + neon_walreader.o \ pagestore_smgr.o \ relsize_cache.o \ walproposer.o \ diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 2e7da671f9..e467a9c43a 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -19,20 +19,21 @@ *------------------------------------------------------------------------- */ #include "postgres.h" + +#include + +#include "access/xact.h" +#include "commands/defrem.h" +#include "fmgr.h" +#include "libpq/crypt.h" +#include "miscadmin.h" #include "tcop/pquery.h" #include "tcop/utility.h" -#include "access/xact.h" +#include "utils/acl.h" +#include "utils/guc.h" #include "utils/hsearch.h" #include "utils/memutils.h" -#include "commands/defrem.h" -#include "miscadmin.h" -#include "utils/acl.h" -#include "fmgr.h" -#include "utils/guc.h" -#include "port.h" -#include #include "utils/jsonb.h" -#include "libpq/crypt.h" static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index fbbb8fd448..d9a75142f1 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -1,4 +1,3 @@ - /*------------------------------------------------------------------------- * * extension_server.c @@ -10,21 +9,11 @@ *------------------------------------------------------------------------- */ #include "postgres.h" -#include "tcop/pquery.h" -#include "tcop/utility.h" -#include "access/xact.h" -#include "utils/hsearch.h" -#include "utils/memutils.h" -#include "commands/defrem.h" -#include "miscadmin.h" -#include "utils/acl.h" -#include "fmgr.h" -#include "utils/guc.h" -#include "port.h" -#include "fmgr.h" #include +#include "utils/guc.h" + static int extension_server_port = 0; static download_extension_file_hook_type prev_download_extension_file_hook = NULL; diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 54b3661e66..6725ce8fff 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -13,32 +13,30 @@ *------------------------------------------------------------------------- */ +#include "postgres.h" + #include #include #include -#include "postgres.h" - #include "neon_pgversioncompat.h" +#include "access/parallel.h" #include "funcapi.h" #include 
"miscadmin.h" -#include "pgstat.h" #include "pagestore_client.h" -#include "access/parallel.h" +#include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR #include "storage/buf_internals.h" -#include "storage/latch.h" +#include "storage/fd.h" #include "storage/ipc.h" +#include "storage/latch.h" #include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "utils/builtins.h" #include "utils/dynahash.h" #include "utils/guc.h" -#include "storage/fd.h" -#include "storage/pg_shmem.h" -#include "storage/buf_internals.h" -#include "pgstat.h" /* * Local file cache is used to temporary store relations pages in local file system. @@ -102,8 +100,6 @@ static shmem_request_hook_type prev_shmem_request_hook; #define LFC_ENABLED() (lfc_ctl->limit != 0) -void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg); - /* * Local file cache is optional and Neon can work without it. * In case of any any errors with this cache, we should disable it but to not throw error. diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 16406ce8a3..3b038f906f 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -14,28 +14,24 @@ */ #include "postgres.h" -#include "pagestore_client.h" -#include "fmgr.h" #include "access/xlog.h" -#include "access/xlogutils.h" -#include "storage/buf_internals.h" -#include "storage/lwlock.h" -#include "storage/ipc.h" -#include "storage/pg_shmem.h" -#include "c.h" -#include "postmaster/interrupt.h" - +#include "fmgr.h" #include "libpq-fe.h" -#include "libpq/pqformat.h" #include "libpq/libpq.h" - +#include "libpq/pqformat.h" #include "miscadmin.h" #include "pgstat.h" +#include "postmaster/interrupt.h" +#include "storage/buf_internals.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "utils/guc.h" #include "neon.h" -#include "walproposer.h" #include "neon_utils.h" +#include "pagestore_client.h" +#include "walproposer.h" #define PageStoreTrace DEBUG5 @@ -62,8 +58,8 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -int n_reconnect_attempts = 0; -int max_reconnect_attempts = 60; +static int n_reconnect_attempts = 0; +static int max_reconnect_attempts = 60; #define MAX_PAGESERVER_CONNSTRING_SIZE 256 @@ -83,8 +79,6 @@ static PagestoreShmemState *pagestore_shared; static uint64 pagestore_local_counter = 0; static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; -bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; - static bool pageserver_flush(void); static void pageserver_disconnect(void); @@ -627,8 +621,6 @@ pg_init_libpagestore(void) smgr_hook = smgr_neon; smgr_init_hook = smgr_init_neon; dbsize_hook = neon_dbsize; - old_redo_read_buffer_filter = redo_read_buffer_filter; - redo_read_buffer_filter = neon_redo_read_buffer_filter; } lfc_init(); diff --git a/pgxn/neon/libpqwalproposer.h b/pgxn/neon/libpqwalproposer.h new file mode 100644 index 0000000000..cd7e568a47 --- /dev/null +++ b/pgxn/neon/libpqwalproposer.h @@ -0,0 +1,96 @@ +/* + * Interface to set of libpq wrappers walproposer and neon_walreader need. + * Similar to libpqwalreceiver, but it has blocking connection establishment and + * pqexec which don't fit us. Implementation is at walproposer_pg.c. 
+ */ +#ifndef ___LIBPQWALPROPOSER_H__ +#define ___LIBPQWALPROPOSER_H__ + +/* Re-exported and modified ExecStatusType */ +typedef enum +{ + /* We received a single CopyBoth result */ + WP_EXEC_SUCCESS_COPYBOTH, + + /* + * Any success result other than a single CopyBoth was received. The + * specifics of the result were already logged, but it may be useful to + * provide an error message indicating which safekeeper messed up. + * + * Do not expect PQerrorMessage to be appropriately set. + */ + WP_EXEC_UNEXPECTED_SUCCESS, + + /* + * No result available at this time. Wait until read-ready, then call + * again. Internally, this is returned when PQisBusy indicates that + * PQgetResult would block. + */ + WP_EXEC_NEEDS_INPUT, + /* Catch-all failure. Check PQerrorMessage. */ + WP_EXEC_FAILED, +} WalProposerExecStatusType; + +/* Possible return values from walprop_async_read */ +typedef enum +{ + /* The full read was successful. buf now points to the data */ + PG_ASYNC_READ_SUCCESS, + + /* + * The read is ongoing. Wait until the connection is read-ready, then try + * again. + */ + PG_ASYNC_READ_TRY_AGAIN, + /* Reading failed. Check PQerrorMessage(conn) */ + PG_ASYNC_READ_FAIL, +} PGAsyncReadResult; + +/* Possible return values from walprop_async_write */ +typedef enum +{ + /* The write fully completed */ + PG_ASYNC_WRITE_SUCCESS, + + /* + * The write started, but you'll need to call PQflush some more times to + * finish it off. We just tried, so it's best to wait until the connection + * is read- or write-ready to try again. + * + * If it becomes read-ready, call PQconsumeInput and flush again. If it + * becomes write-ready, just call PQflush. + */ + PG_ASYNC_WRITE_TRY_FLUSH, + /* Writing failed. Check PQerrorMessage(conn) */ + PG_ASYNC_WRITE_FAIL, +} PGAsyncWriteResult; + +/* + * This header is included by walproposer.h to define walproposer_api; if we're + * building walproposer without pg, ignore libpq part, leaving only interface + * types. + */ +#ifndef WALPROPOSER_LIB + +#include "libpq-fe.h" + +/* + * Sometimes working directly with underlying PGconn is simpler, export the + * whole thing for simplicity. + */ +typedef struct WalProposerConn +{ + PGconn *pg_conn; + bool is_nonblocking; /* whether the connection is non-blocking */ + char *recvbuf; /* last received CopyData message from + * walprop_async_read */ +} WalProposerConn; + +extern WalProposerConn *libpqwp_connect_start(char *conninfo); +extern bool libpqwp_send_query(WalProposerConn *conn, char *query); +extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn); +extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount); +extern void libpqwp_disconnect(WalProposerConn *conn); + +#endif /* WALPROPOSER_LIB */ +#endif /* ___LIBPQWALPROPOSER_H__ */ diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 897a8373a1..c3afecc679 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -27,13 +27,6 @@ extern void pg_init_walproposer(void); extern void pg_init_extension_server(void); -/* - * Returns true if we shouldn't do REDO on that block in record indicated by - * block_id; false otherwise. 
- */ -extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); -extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); - extern uint64 BackpressureThrottlingTime(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 807d2decf6..9135847aaf 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -3,33 +3,8 @@ #include "postgres.h" -#include "access/timeline.h" -#include "access/xlogutils.h" -#include "common/logging.h" -#include "common/ip.h" -#include "funcapi.h" -#include "libpq/libpq.h" +#include "lib/stringinfo.h" #include "libpq/pqformat.h" -#include "miscadmin.h" -#include "postmaster/interrupt.h" -#include "replication/slot.h" -#include "replication/walsender_private.h" - -#include "storage/ipc.h" -#include "utils/builtins.h" -#include "utils/ps_status.h" - -#include "libpq-fe.h" -#include -#include - -#if PG_VERSION_NUM >= 150000 -#include "access/xlogutils.h" -#include "access/xlogrecovery.h" -#endif -#if PG_MAJORVERSION_NUM >= 16 -#include "utils/guc.h" -#endif /* * Convert a character which represents a hexadecimal digit to an integer. diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index 20745d8b26..a86f1e061c 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -1,8 +1,6 @@ #ifndef __NEON_UTILS_H__ #define __NEON_UTILS_H__ -#include "postgres.h" - bool HexDecodeString(uint8 *result, char *input, int nbytes); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c new file mode 100644 index 0000000000..f7ec9e5bfa --- /dev/null +++ b/pgxn/neon/neon_walreader.c @@ -0,0 +1,742 @@ +/* + * Like WALRead, but when WAL segment doesn't exist locally instead of throwing + * ERROR asynchronously tries to fetch it from the most advanced safekeeper. + * + * We can't use libpqwalreceiver as it blocks during connection establishment + * (and waiting for PQExec result), so use libpqwalproposer instead. + * + * TODO: keepalives are currently never sent, so the other side can close the + * connection prematurely. + * + * TODO: close conn if reading takes too long to prevent stuck connections. + */ +#include "postgres.h" + +#include +#include + +#include "access/xlog_internal.h" +#include "access/xlogdefs.h" +#include "access/xlogreader.h" +#include "libpq/pqformat.h" +#include "storage/fd.h" +#include "utils/wait_event.h" + +#include "libpq-fe.h" + +#include "neon_walreader.h" +#include "walproposer.h" + +#define NEON_WALREADER_ERR_MSG_LEN 512 + +/* + * Can be called where NeonWALReader *state is available in the context, adds log_prefix. + */ +#define nwr_log(elevel, fmt, ...) 
elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__) + +static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state); +static void NeonWALReaderResetRemote(NeonWALReader *state); +static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); +static void neon_wal_segment_close(NeonWALReader *state); +static bool is_wal_segment_exists(XLogSegNo segno, int segsize, + TimeLineID tli); + +/* + * State of connection to donor safekeeper. + */ +typedef enum +{ + /* no remote connection */ + RS_NONE, + /* doing PQconnectPoll, need readable socket */ + RS_CONNECTING_READ, + /* doing PQconnectPoll, need writable socket */ + RS_CONNECTING_WRITE, + /* Waiting for START_REPLICATION result */ + RS_WAIT_EXEC_RESULT, + /* replication stream established */ + RS_ESTABLISHED, +} NeonWALReaderRemoteState; + +struct NeonWALReader +{ + /* + * LSN before which we assume WAL is not available locally. Exists because + * though first segment after startup always exists, part before + * basebackup LSN is filled with zeros. + */ + XLogRecPtr available_lsn; + WALSegmentContext segcxt; + WALOpenSegment seg; + int wre_errno; + /* Explains failure to read, static for simplicity. */ + char err_msg[NEON_WALREADER_ERR_MSG_LEN]; + + /* + * Saved info about request in progress, used to check validity of + * arguments after resume and remember how far we accomplished it. req_lsn + * is 0 if there is no request in progress. + */ + XLogRecPtr req_lsn; + Size req_len; + Size req_progress; + WalProposer *wp; /* we learn donor through walproposer */ + char donor_name[64]; /* saved donor safekeeper name for logging */ + /* state of connection to safekeeper */ + NeonWALReaderRemoteState rem_state; + WalProposerConn *wp_conn; + + /* + * position in wp_conn recvbuf from which we'll copy WAL next time, or + * NULL if there is no unprocessed message + */ + char *wal_ptr; + Size wal_rem_len; /* how many unprocessed bytes left in recvbuf */ + + /* + * LSN of wal_ptr position according to walsender to cross check against + * read request + */ + XLogRecPtr rem_lsn; + + /* prepended to lines logged by neon_walreader, if provided */ + char log_prefix[64]; +}; + +/* palloc and initialize NeonWALReader */ +NeonWALReader * +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix) +{ + NeonWALReader *reader; + + reader = (NeonWALReader *) + palloc_extended(sizeof(NeonWALReader), + MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); + if (!reader) + return NULL; + + reader->available_lsn = available_lsn; + reader->seg.ws_file = -1; + reader->seg.ws_segno = 0; + reader->seg.ws_tli = 0; + reader->segcxt.ws_segsize = wal_segment_size; + + reader->wp = wp; + + reader->rem_state = RS_NONE; + + if (log_prefix) + strlcpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix)); + + return reader; +} + +void +NeonWALReaderFree(NeonWALReader *state) +{ + if (state->seg.ws_file != -1) + neon_wal_segment_close(state); + if (state->wp_conn) + libpqwp_disconnect(state->wp_conn); + pfree(state); +} + +/* + * Like vanilla WALRead, but if requested position is before available_lsn or + * WAL segment doesn't exist on disk, it tries to fetch needed segment from the + * advanced safekeeper. 
+ * + * Read 'count' bytes into 'buf', starting at location 'startptr', from WAL + * fetched from timeline 'tli'. + * + * Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error + * occurs, in which case NeonWALReaderErrMsg() has the description. An error always closes the remote + * connection, if there was any, so the socket subscription should be removed. + * + * NEON_WALREAD_WOULDBLOCK means the caller should obtain the socket to wait for with + * NeonWALReaderSocket and call NeonWALRead again with exactly the same + * arguments when NeonWALReaderEvents happen on the socket. Note that per libpq + * docs, during connection establishment (before the first successful read) the socket + * underneath might change. + * + * Also, the walreader eventually switches from remote to local reads; the caller + * should then remove the socket subscription by checking NeonWALReaderEvents + * after a successful read (otherwise the next read might reopen the connection with + * a different socket). + * + * Non-monotonic reads are not supported and will result in an error. + * + * The caller should be sure that WAL up to the requested LSN exists, otherwise + * NEON_WALREAD_WOULDBLOCK might be returned forever. + */ +NeonWALReadResult +NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + /* + * If the requested data is before the known available basebackup LSN, or + * remote state is already active, do a remote read. + */ + if (startptr < state->available_lsn || state->rem_state != RS_NONE) + { + return NeonWALReadRemote(state, buf, startptr, count, tli); + } + if (NeonWALReadLocal(state, buf, startptr, count, tli)) + { + return NEON_WALREAD_SUCCESS; + } + else if (state->wre_errno == ENOENT) + { + nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote", + LSN_FORMAT_ARGS(startptr)); + return NeonWALReadRemote(state, buf, startptr, count, tli); + } + else + { + return NEON_WALREAD_ERROR; + } +} + +/* Do the read from remote safekeeper.
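+ * + * The connection is established lazily: from RS_NONE we pick the donor with + * GetDonor(), start the connection and drive PQconnectPoll() through + * RS_CONNECTING_READ/RS_CONNECTING_WRITE, send START_REPLICATION and wait for + * its result in RS_WAIT_EXEC_RESULT, and only then copy WAL out of the CopyBoth + * stream in RS_ESTABLISHED. Every step that would block returns + * NEON_WALREAD_WOULDBLOCK.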
*/ +static NeonWALReadResult +NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + if (state->rem_state == RS_NONE) + { + XLogRecPtr donor_lsn; + + /* no connection yet; start one */ + Safekeeper *donor = GetDonor(state->wp, &donor_lsn); + + if (donor == NULL) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to establish remote connection to fetch WAL: no donor available"); + return NEON_WALREAD_ERROR; + } + snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port); + nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL", + state->donor_name, LSN_FORMAT_ARGS(donor_lsn)); + state->wp_conn = libpqwp_connect_start(donor->conninfo); + if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to connect to %s to fetch WAL: immediately failed with %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + /* we'll poll immediately */ + state->rem_state = RS_CONNECTING_READ; + } + + if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE) + { + switch (PQconnectPoll(state->wp_conn->pg_conn)) + { + case PGRES_POLLING_FAILED: + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to connect to %s to fetch WAL: poll error: %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + case PGRES_POLLING_READING: + state->rem_state = RS_CONNECTING_READ; + return NEON_WALREAD_WOULDBLOCK; + case PGRES_POLLING_WRITING: + state->rem_state = RS_CONNECTING_WRITE; + return NEON_WALREAD_WOULDBLOCK; + case PGRES_POLLING_OK: + { + /* connection successfully established */ + char start_repl_query[128]; + + snprintf(start_repl_query, sizeof(start_repl_query), + "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')", + LSN_FORMAT_ARGS(startptr), state->wp->propTerm); + nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s", + state->donor_name, start_repl_query); + if (!libpqwp_send_query(state->wp_conn, start_repl_query)) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "failed to send %s query to %s: %s", + start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + state->rem_state = RS_WAIT_EXEC_RESULT; + break; + } + + default: /* there is unused PGRES_POLLING_ACTIVE */ + Assert(false); + return NEON_WALREAD_ERROR; /* keep the compiler quiet */ + } + } + + if (state->rem_state == RS_WAIT_EXEC_RESULT) + { + switch (libpqwp_get_query_result(state->wp_conn)) + { + case WP_EXEC_SUCCESS_COPYBOTH: + state->rem_state = RS_ESTABLISHED; + break; + case WP_EXEC_NEEDS_INPUT: + return NEON_WALREAD_WOULDBLOCK; + case WP_EXEC_FAILED: + snprintf(state->err_msg, sizeof(state->err_msg), + "get START_REPLICATION result from %s failed: %s", + state->donor_name, PQerrorMessage(state->wp_conn->pg_conn)); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + default: /* can't happen */ + snprintf(state->err_msg, sizeof(state->err_msg), + "get START_REPLICATION result from %s: unexpected result", + state->donor_name); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + } + + Assert(state->rem_state == RS_ESTABLISHED); + + /* + * If we had the request before, verify args are the same and advance the + * result ptr according 
to the progress; otherwise register the request. + */ + if (state->req_lsn != InvalidXLogRecPtr) + { + if (state->req_lsn != startptr || state->req_len != count) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "args changed during request, was %X/%X %zu, now %X/%X %zu", + LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu", + LSN_FORMAT_ARGS(startptr), + count, + state->req_progress); + buf += state->req_progress; + } + else + { + state->req_lsn = startptr; + state->req_len = count; + state->req_progress = 0; + nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu", + LSN_FORMAT_ARGS(startptr), + count); + } + + while (true) + { + Size to_copy; + + /* + * If we have no ready data, receive new message. + */ + if (state->wal_rem_len == 0 && + + /* + * check for the sake of 0 length reads; walproposer does these for + * heartbeats, though generally they shouldn't hit remote source. + */ + state->req_len - state->req_progress > 0) + { + NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state); + + if (read_msg_res != NEON_WALREAD_SUCCESS) + return read_msg_res; + } + + if (state->req_lsn + state->req_progress != state->rem_lsn) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu", + LSN_FORMAT_ARGS(state->req_lsn + state->req_progress), + LSN_FORMAT_ARGS(state->rem_lsn), + LSN_FORMAT_ARGS(state->req_lsn), + state->req_len); + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; + } + + /* We can copy min of (available, requested) bytes. */ + to_copy = + Min(state->req_len - state->req_progress, state->wal_rem_len); + memcpy(buf, state->wal_ptr, to_copy); + state->wal_ptr += to_copy; + state->wal_rem_len -= to_copy; + state->rem_lsn += to_copy; + if (state->wal_rem_len == 0) + state->wal_ptr = NULL; /* freed by libpqwalproposer */ + buf += to_copy; + state->req_progress += to_copy; + if (state->req_progress == state->req_len) + { + XLogSegNo next_segno; + XLogSegNo req_segno; + + XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize); + XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize); + + /* + * Request completed. If there is a chance of serving next one + * locally, close the connection. + */ + if (state->req_lsn < state->available_lsn && + state->rem_lsn >= state->available_lsn) + { + nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally", + LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn)); + NeonWALReaderResetRemote(state); + } + else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno && + is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli)) + { + nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists", + LSN_FORMAT_ARGS(state->rem_lsn)); + NeonWALReaderResetRemote(state); + } + state->req_lsn = InvalidXLogRecPtr; + state->req_len = 0; + state->req_progress = 0; + return NEON_WALREAD_SUCCESS; + } + } +} + +/* + * Read one WAL message from the stream, sets state->wal_ptr in case of success. + * Resets remote state in case of failure. 
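+ * + * Two CopyData message types are handled: 'w' (XLogData: start LSN, end LSN and + * send time as three int64s, followed by the WAL bytes) and 'k' (keepalive: end + * LSN, timestamp and a reply-requested flag). Keepalives are otherwise ignored, + * except to detect a donor that doesn't have the WAL we asked for.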
+ */ +static NeonWALReadResult +NeonWALReaderReadMsg(NeonWALReader *state) +{ + while (true) /* loop until we get 'w' */ + { + char *copydata_ptr; + int copydata_size; + StringInfoData s; + char msg_type; + int hdrlen; + + Assert(state->rem_state == RS_ESTABLISHED); + Assert(state->wal_ptr == NULL && state->wal_rem_len == 0); + + switch (libpqwp_async_read(state->wp_conn, + ©data_ptr, + ©data_size)) + { + case PG_ASYNC_READ_SUCCESS: + break; + case PG_ASYNC_READ_TRY_AGAIN: + return NEON_WALREAD_WOULDBLOCK; + case PG_ASYNC_READ_FAIL: + snprintf(state->err_msg, + sizeof(state->err_msg), + "req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s", + LSN_FORMAT_ARGS(state->req_lsn), + state->req_len, + state->req_progress, + PQerrorMessage(state->wp_conn->pg_conn)); + goto err; + } + + /* put data on StringInfo to parse */ + s.data = copydata_ptr; + s.len = copydata_size; + s.cursor = 0; + s.maxlen = -1; + + if (copydata_size == 0) + { + snprintf(state->err_msg, + sizeof(state->err_msg), + "zero length copydata received"); + goto err; + } + msg_type = pq_getmsgbyte(&s); + switch (msg_type) + { + case 'w': + { + XLogRecPtr start_lsn; + + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64); + if (s.len - s.cursor < hdrlen) + { + snprintf(state->err_msg, + sizeof(state->err_msg), + "invalid WAL message received from primary"); + goto err; + } + + start_lsn = pq_getmsgint64(&s); + pq_getmsgint64(&s); /* XLogRecPtr end_lsn; */ + pq_getmsgint64(&s); /* TimestampTz send_time */ + + state->rem_lsn = start_lsn; + state->wal_rem_len = (Size) (s.len - s.cursor); + state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor); + nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu", + LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len); + + return NEON_WALREAD_SUCCESS; + } + case 'k': + { + XLogRecPtr end_lsn; + bool reply_requested; + + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char); + if (s.len - s.cursor < hdrlen) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "invalid keepalive message received from primary"); + goto err; + } + + end_lsn = pq_getmsgint64(&s); + pq_getmsgint64(&s); /* TimestampTz timestamp; */ + reply_requested = pq_getmsgbyte(&s); + nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d", + LSN_FORMAT_ARGS(end_lsn), + reply_requested); + if (end_lsn < state->req_lsn + state->req_len) + { + snprintf(state->err_msg, sizeof(state->err_msg), + "closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X", + LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn)); + goto err; + } + continue; + } + default: + nwr_log(WARNING, "invalid replication message type %d", msg_type); + continue; + } + } +err: + NeonWALReaderResetRemote(state); + return NEON_WALREAD_ERROR; +} + +/* reset remote connection and request in progress */ +static void +NeonWALReaderResetRemote(NeonWALReader *state) +{ + state->req_lsn = InvalidXLogRecPtr; + state->req_len = 0; + state->req_progress = 0; + state->rem_state = RS_NONE; + if (state->wp_conn) + { + libpqwp_disconnect(state->wp_conn); + state->wp_conn = NULL; + } + state->donor_name[0] = '\0'; + state->wal_ptr = NULL; + state->wal_rem_len = 0; + state->rem_lsn = InvalidXLogRecPtr; +} + +/* + * Return socket of connection to remote source. Must be called only when + * connection exists (NeonWALReaderEvents returns non zero). 
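+ * + * After NEON_WALREAD_WOULDBLOCK the caller is expected to wait for + * NeonWALReaderEvents(state) on this socket and then retry NeonWALRead() with + * exactly the same arguments; once NeonWALReaderIsRemConnEstablished() returns + * true the socket stays stable (until an error or a switch back to local reads), + * so its event subscription can be updated instead of re-added.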
+ */ +pgsocket +NeonWALReaderSocket(NeonWALReader *state) +{ + if (!state->wp_conn) + nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection"); + return PQsocket(state->wp_conn->pg_conn); +} + +/* + * Whether remote connection is established. Once this is done, until successful + * local read or error socket is stable and user can update socket events + * instead of readding it each time. + */ +bool +NeonWALReaderIsRemConnEstablished(NeonWALReader *state) +{ + return state->rem_state == RS_ESTABLISHED; +} + +/* + * Returns events user should wait on connection socket or 0 if remote + * connection is not active. + */ +extern uint32 +NeonWALReaderEvents(NeonWALReader *state) +{ + switch (state->rem_state) + { + case RS_NONE: + return 0; + case RS_CONNECTING_READ: + return WL_SOCKET_READABLE; + case RS_CONNECTING_WRITE: + return WL_SOCKET_WRITEABLE; + case RS_WAIT_EXEC_RESULT: + case RS_ESTABLISHED: + return WL_SOCKET_READABLE; + default: + Assert(false); + return 0; /* make compiler happy */ + } +} + +static bool +NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli) +{ + char *p; + XLogRecPtr recptr; + Size nbytes; + + p = buf; + recptr = startptr; + nbytes = count; + + while (nbytes > 0) + { + uint32 startoff; + int segbytes; + int readbytes; + + startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + + /* + * If the data we want is not in a segment we have open, close what we + * have (if anything) and open the next one, using the caller's + * provided openSegment callback. + */ + if (state->seg.ws_file < 0 || + !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) || + tli != state->seg.ws_tli) + { + XLogSegNo nextSegNo; + + neon_wal_segment_close(state); + + XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); + if (!neon_wal_segment_open(state, nextSegNo, &tli)) + { + char fname[MAXFNAMELEN]; + + state->wre_errno = errno; + + XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); + snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s", + fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno)); + return false; + } + + /* This shouldn't happen -- indicates a bug in segment_open */ + Assert(state->seg.ws_file >= 0); + + /* Update the current segment info. */ + state->seg.ws_tli = tli; + state->seg.ws_segno = nextSegNo; + } + + /* How many bytes are within this segment? 
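+ * For example, with 16 MB segments a 32 KB read that starts 8 KB before a + * segment boundary is served in two passes: 8 KB from the current segment, then + * 24 KB from the next one (assuming each pg_pread returns the full amount).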
*/ + if (nbytes > (state->segcxt.ws_segsize - startoff)) + segbytes = state->segcxt.ws_segsize - startoff; + else + segbytes = nbytes; + +#ifndef FRONTEND + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); +#endif + + /* Reset errno first; eases reporting non-errno-affecting errors */ + errno = 0; + readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + +#ifndef FRONTEND + pgstat_report_wait_end(); +#endif + + if (readbytes <= 0) + { + char fname[MAXFNAMELEN]; + + XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize); + + if (readbytes < 0) + { + state->wre_errno = errno; + snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s", + fname, startoff, strerror(state->wre_errno)); + } + else + { + snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF", + fname, startoff); + } + return false; + } + + /* Update state for read */ + recptr += readbytes; + nbytes -= readbytes; + p += readbytes; + } + + return true; +} + +/* + * Copy of vanilla wal_segment_open, but returns false in case of error instead + * of ERROR, with errno set. + * + * XLogReaderRoutine->segment_open callback for local pg_wal files + */ +static bool +neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) +{ + TimeLineID tli = *tli_p; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize); + nwr_log(DEBUG5, "opening %s", path); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return true; + + return false; +} + +static bool +is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli) +{ + struct stat stat_buffer; + char path[MAXPGPATH]; + + XLogFilePath(path, tli, segno, segsize); + return stat(path, &stat_buffer) == 0; +} + +/* copy of vanilla wal_segment_close with NeonWALReader */ +static void +neon_wal_segment_close(NeonWALReader *state) +{ + if (state->seg.ws_file >= 0) + { + close(state->seg.ws_file); + /* need to check errno? 
*/ + state->seg.ws_file = -1; + } +} + +char * +NeonWALReaderErrMsg(NeonWALReader *state) +{ + return state->err_msg; +} diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h new file mode 100644 index 0000000000..6be9f149aa --- /dev/null +++ b/pgxn/neon/neon_walreader.h @@ -0,0 +1,30 @@ +#ifndef __NEON_WALREADER_H__ +#define __NEON_WALREADER_H__ + +#include "access/xlogdefs.h" + +/* forward declare so we don't have to expose the struct to the public */ +struct NeonWALReader; +typedef struct NeonWALReader NeonWALReader; + +/* avoid including walproposer.h as it includes us */ +struct WalProposer; +typedef struct WalProposer WalProposer; + +/* NeonWALRead return value */ +typedef enum +{ + NEON_WALREAD_SUCCESS, + NEON_WALREAD_WOULDBLOCK, + NEON_WALREAD_ERROR, +} NeonWALReadResult; + +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix); +extern void NeonWALReaderFree(NeonWALReader *state); +extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); +extern pgsocket NeonWALReaderSocket(NeonWALReader *state); +extern uint32 NeonWALReaderEvents(NeonWALReader *state); +extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state); +extern char *NeonWALReaderErrMsg(NeonWALReader *state); + +#endif /* __NEON_WALREADER_H__ */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index ecfadb01d6..3fcaab0bee 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -13,19 +13,16 @@ #ifndef pageserver_h #define pageserver_h -#include "postgres.h" #include "neon_pgversioncompat.h" #include "access/xlogdefs.h" #include RELFILEINFO_HDR -#include "storage/block.h" -#include "storage/smgr.h" #include "lib/stringinfo.h" #include "libpq/pqformat.h" +#include "storage/block.h" +#include "storage/smgr.h" #include "utils/memutils.h" -#include "pg_config.h" - typedef enum { /* pagestore_client -> pagestore */ @@ -158,11 +155,8 @@ extern page_server_api *page_server; extern char *page_server_connstring; extern int flush_every_n_requests; extern int readahead_buffer_size; -extern bool seqscan_prefetch_enabled; -extern int seqscan_prefetch_distance; extern char *neon_timeline; extern char *neon_tenant; -extern bool wal_redo; extern int32 max_cluster_size; extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 609d80588c..8888cd89c6 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -47,25 +47,26 @@ #include "access/xact.h" #include "access/xlog.h" +#include "access/xlogdefs.h" #include "access/xloginsert.h" #include "access/xlog_internal.h" -#include "access/xlogdefs.h" +#include "access/xlogutils.h" #include "catalog/pg_class.h" #include "common/hashfn.h" #include "executor/instrument.h" -#include "pagestore_client.h" -#include "postmaster/interrupt.h" +#include "pgstat.h" #include "postmaster/autovacuum.h" +#include "postmaster/interrupt.h" #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/buf_internals.h" #include "storage/fsm_internals.h" -#include "storage/smgr.h" #include "storage/md.h" -#include "pgstat.h" +#include "storage/smgr.h" + +#include "pagestore_client.h" #if PG_VERSION_NUM >= 150000 -#include "access/xlogutils.h" #include "access/xlogrecovery.h" #endif @@ -106,6 +107,9 @@ typedef enum static SMgrRelation unlogged_build_rel = NULL; static UnloggedBuildPhase 
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; +static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); +static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; + /* * Prefetch implementation: * @@ -239,7 +243,7 @@ typedef struct PrefetchState PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; -PrefetchState *MyPState; +static PrefetchState *MyPState; #define GetPrfSlot(ring_index) ( \ ( \ @@ -257,7 +261,7 @@ PrefetchState *MyPState; ) \ ) -XLogRecPtr prefetch_lsn = 0; +static XLogRecPtr prefetch_lsn = 0; static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); @@ -1371,6 +1375,9 @@ neon_init(void) MyPState->prf_hash = prfh_create(MyPState->hashctx, readahead_buffer_size, NULL); + old_redo_read_buffer_filter = redo_read_buffer_filter; + redo_read_buffer_filter = neon_redo_read_buffer_filter; + #ifdef DEBUG_COMPARE_LOCAL mdinit(); #endif @@ -2869,7 +2876,7 @@ get_fsm_physical_block(BlockNumber heapblk) * contents, where with REDO locking it would wait on block 1 and see * block 3 with post-REDO contents only. */ -bool +static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) { XLogRecPtr end_recptr = record->EndRecPtr; diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index fc3332612c..7fb0cab9a0 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -45,7 +45,6 @@ /* Prototypes for private functions */ static void WalProposerLoop(WalProposer *wp); -static void HackyRemoveWalProposerEvent(Safekeeper *to_remove); static void ShutdownConnection(Safekeeper *sk); static void ResetConnection(Safekeeper *sk); static long TimeToReconnect(WalProposer *wp, TimestampTz now); @@ -78,11 +77,11 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, Safekeeper static bool AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_state); static bool AsyncFlush(Safekeeper *sk); static int CompareLsn(const void *a, const void *b); -static char *FormatSafekeeperState(SafekeeperState state); +static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); -static uint32 SafekeeperStateDesiredEvents(SafekeeperState state); static char *FormatEvents(WalProposer *wp, uint32 events); + WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) { @@ -113,6 +112,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->safekeeper[wp->n_safekeepers].host = host; wp->safekeeper[wp->n_safekeepers].port = port; wp->safekeeper[wp->n_safekeepers].state = SS_OFFLINE; + wp->safekeeper[wp->n_safekeepers].active_state = SS_ACTIVE_SEND; wp->safekeeper[wp->n_safekeepers].wp = wp; { @@ -127,8 +127,6 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf); - wp->api.wal_reader_allocate(&wp->safekeeper[wp->n_safekeepers]); - wp->safekeeper[wp->n_safekeepers].flushWrite = false; wp->safekeeper[wp->n_safekeepers].startStreamingAt = InvalidXLogRecPtr; wp->safekeeper[wp->n_safekeepers].streamingAt = InvalidXLogRecPtr; wp->n_safekeepers += 1; @@ -277,7 +275,7 @@ WalProposerPoll(WalProposer *wp) wp->config->safekeeper_connection_timeout)) { walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", - sk->host, sk->port, FormatSafekeeperState(sk->state), 
wp->config->safekeeper_connection_timeout); + sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); ShutdownConnection(sk); } } @@ -305,58 +303,20 @@ WalProposerLoop(WalProposer *wp) WalProposerPoll(wp); } -/* - * Hack: provides a way to remove the event corresponding to an individual walproposer from the set. - * - * Note: Internally, this completely reconstructs the event set. It should be avoided if possible. - */ -static void -HackyRemoveWalProposerEvent(Safekeeper *to_remove) -{ - WalProposer *wp = to_remove->wp; - - /* Remove the existing event set, assign sk->eventPos = -1 */ - wp->api.free_event_set(wp); - /* Re-initialize it without adding any safekeeper events */ - wp->api.init_event_set(wp); - - /* - * loop through the existing safekeepers. If they aren't the one we're - * removing, and if they have a socket we can use, re-add the applicable - * events. - */ - for (int i = 0; i < wp->n_safekeepers; i++) - { - uint32 desired_events = WL_NO_EVENTS; - Safekeeper *sk = &wp->safekeeper[i]; - - if (sk == to_remove) - continue; - - /* If this safekeeper isn't offline, add an event for it! */ - if (sk->state != SS_OFFLINE) - { - desired_events = SafekeeperStateDesiredEvents(sk->state); - /* will set sk->eventPos */ - wp->api.add_safekeeper_event_set(sk, desired_events); - } - } -} /* Shuts down and cleans up the connection for a safekeeper. Sets its state to SS_OFFLINE */ static void ShutdownConnection(Safekeeper *sk) { - sk->wp->api.conn_finish(sk); sk->state = SS_OFFLINE; - sk->flushWrite = false; sk->streamingAt = InvalidXLogRecPtr; if (sk->voteResponse.termHistory.entries) pfree(sk->voteResponse.termHistory.entries); sk->voteResponse.termHistory.entries = NULL; - HackyRemoveWalProposerEvent(sk); + sk->wp->api.conn_finish(sk); + sk->wp->api.rm_safekeeper_event_set(sk); } /* @@ -474,7 +434,9 @@ ReconnectSafekeepers(WalProposer *wp) static void AdvancePollState(Safekeeper *sk, uint32 events) { +#ifdef WALPROPOSER_LIB /* walprop_log needs wp in lib build */ WalProposer *wp = sk->wp; +#endif /* * Sanity check. We assume further down that the operations don't block @@ -527,7 +489,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ case SS_VOTING: walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -556,7 +518,7 @@ AdvancePollState(Safekeeper *sk, uint32 events) */ case SS_IDLE: walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -622,7 +584,7 @@ HandleConnectionEvent(Safekeeper *sk) * Because PQconnectPoll can change the socket, we have to un-register the * old event and re-register an event on the new socket. */ - HackyRemoveWalProposerEvent(sk); + wp->api.rm_safekeeper_event_set(sk); wp->api.add_safekeeper_event_set(sk, new_events); /* If we successfully connected, send START_WAL_PUSH query */ @@ -847,7 +809,7 @@ RecvVoteResponse(Safekeeper *sk) } else if (wp->n_votes > wp->quorum) { - /* recovery already performed, just start streaming */ + /* already elected, start streaming */ SendProposerElected(sk); } else @@ -873,21 +835,16 @@ HandleElectedProposer(WalProposer *wp) DetermineEpochStartLsn(wp); /* - * Check if not all safekeepers are up-to-date, we need to download WAL - * needed to synchronize them + * Synchronously download WAL from the most advanced safekeeper. 
We do + * that only for logical replication (and switching logical walsenders to + * neon_walreader is a todo.) */ - if (wp->truncateLsn < wp->propEpochStartLsn) + if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) { - walprop_log(LOG, - "start recovery because truncateLsn=%X/%X is not " - "equal to epochStartLsn=%X/%X", - LSN_FORMAT_ARGS(wp->truncateLsn), - LSN_FORMAT_ARGS(wp->propEpochStartLsn)); - /* Perform recovery */ - if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn)) - walprop_log(FATAL, "Failed to recover state"); + walprop_log(FATAL, "failed to download WAL for logical replicaiton"); } - else if (wp->config->syncSafekeepers) + + if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) { /* Sync is not needed: just exit */ wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); @@ -1085,13 +1042,6 @@ DetermineEpochStartLsn(WalProposer *wp) } walprop_shared->mineLastElectedTerm = wp->propTerm; } - - /* - * WalProposer has just elected itself and initialized history, so we can - * call election callback. Usually it updates truncateLsn to fetch WAL for - * logical replication. - */ - wp->api.after_election(wp); } /* @@ -1112,6 +1062,9 @@ SendProposerElected(Safekeeper *sk) term_t lastCommonTerm; int i; + /* Now that we are ready to send it's a good moment to create WAL reader */ + wp->api.wal_reader_allocate(sk); + /* * Determine start LSN by comparing safekeeper's log term switch history * and proposer's, searching for the divergence point. @@ -1231,6 +1184,7 @@ StartStreaming(Safekeeper *sk) * once for a connection. */ sk->state = SS_ACTIVE; + sk->active_state = SS_ACTIVE_SEND; sk->streamingAt = sk->startStreamingAt; /* event set will be updated inside SendMessageToNode */ @@ -1289,9 +1243,13 @@ HandleActiveState(Safekeeper *sk, uint32 events) { WalProposer *wp = sk->wp; - uint32 newEvents = WL_SOCKET_READABLE; - - if (events & WL_SOCKET_WRITEABLE) + /* + * Note: we don't known which socket awoke us (sk or nwr). However, as + * SendAppendRequests always tries to send at least one msg in + * SS_ACTIVE_SEND be careful not to go there if are only after sk + * response, otherwise it'd create busy loop of pings. + */ + if (events & WL_SOCKET_WRITEABLE || sk->active_state == SS_ACTIVE_READ_WAL) if (!SendAppendRequests(sk)) return; @@ -1299,28 +1257,29 @@ HandleActiveState(Safekeeper *sk, uint32 events) if (!RecvAppendResponses(sk)) return; - /* - * We should wait for WL_SOCKET_WRITEABLE event if we have unflushed data - * in the buffer. - * - * LSN comparison checks if we have pending unsent messages. This check - * isn't necessary now, because we always send append messages immediately - * after arrival. But it's good to have it here in case we change this - * behavior in the future. - */ - if (sk->streamingAt != wp->availableLsn || sk->flushWrite) - newEvents |= WL_SOCKET_WRITEABLE; +#if PG_VERSION_NUM >= 150000 + /* expected never to happen, c.f. 
walprop_pg_active_state_update_event_set */ + if (events & WL_SOCKET_CLOSED) + { + walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket", + sk->host, sk->port); + ShutdownConnection(sk); + return; + } +#endif - wp->api.update_event_set(sk, newEvents); + /* configures event set for yield whatever is the substate */ + wp->api.active_state_update_event_set(sk); } /* * Send WAL messages starting from sk->streamingAt until the end or non-writable - * socket, whichever comes first. Caller should take care of updating event set. - * Even if no unsent WAL is available, at least one empty message will be sent - * as a heartbeat, if socket is ready. + * socket or neon_walreader blocks, whichever comes first; active_state is + * updated accordingly. Caller should take care of updating event set. Even if + * no unsent WAL is available, at least one empty message will be sent as a + * heartbeat, if socket is ready. * - * Can change state if Async* functions encounter errors and reset connection. + * Resets state and kills the connections if any error on them is encountered. * Returns false in this case, true otherwise. */ static bool @@ -1328,11 +1287,11 @@ SendAppendRequests(Safekeeper *sk) { WalProposer *wp = sk->wp; XLogRecPtr endLsn; - AppendRequestHeader *req; PGAsyncWriteResult writeResult; bool sentAnything = false; + AppendRequestHeader *req; - if (sk->flushWrite) + if (sk->active_state == SS_ACTIVE_FLUSH) { if (!AsyncFlush(sk)) @@ -1343,76 +1302,101 @@ SendAppendRequests(Safekeeper *sk) return sk->state == SS_ACTIVE; /* Event set will be updated in the end of HandleActiveState */ - sk->flushWrite = false; + sk->active_state = SS_ACTIVE_SEND; } while (sk->streamingAt != wp->availableLsn || !sentAnything) { - sentAnything = true; - - endLsn = sk->streamingAt; - endLsn += MAX_SEND_SIZE; - - /* if we went beyond available WAL, back off */ - if (endLsn > wp->availableLsn) + if (sk->active_state == SS_ACTIVE_SEND) { - endLsn = wp->availableLsn; + sentAnything = true; + + endLsn = sk->streamingAt; + endLsn += MAX_SEND_SIZE; + + /* if we went beyond available WAL, back off */ + if (endLsn > wp->availableLsn) + { + endLsn = wp->availableLsn; + } + + req = &sk->appendRequest; + PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); + + walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); + + resetStringInfo(&sk->outbuf); + + /* write AppendRequest header */ + appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); + enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn); + sk->active_state = SS_ACTIVE_READ_WAL; } - req = &sk->appendRequest; - PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); - - walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); - - resetStringInfo(&sk->outbuf); - - /* write AppendRequest header */ - appendBinaryStringInfo(&sk->outbuf, (char *) req, sizeof(AppendRequestHeader)); - - /* write the WAL itself */ - enlargeStringInfo(&sk->outbuf, req->endLsn - 
req->beginLsn); - /* wal_read will raise error on failure */ - wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn); - sk->outbuf.len += req->endLsn - req->beginLsn; - - writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); - - /* Mark current message as sent, whatever the result is */ - sk->streamingAt = endLsn; - - switch (writeResult) + if (sk->active_state == SS_ACTIVE_READ_WAL) { - case PG_ASYNC_WRITE_SUCCESS: - /* Continue writing the next message */ - break; + char *errmsg; - case PG_ASYNC_WRITE_TRY_FLUSH: + req = &sk->appendRequest; - /* - * * We still need to call PQflush some more to finish the - * job. Caller function will handle this by setting right - * event* set. - */ - sk->flushWrite = true; - return true; + switch (wp->api.wal_read(sk, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req->endLsn - req->beginLsn, + &errmsg)) + { + case NEON_WALREAD_SUCCESS: + break; + case NEON_WALREAD_WOULDBLOCK: + return true; + case NEON_WALREAD_ERROR: + walprop_log(WARNING, "WAL reading for node %s:%s failed: %s", + sk->host, sk->port, errmsg); + ShutdownConnection(sk); + return false; + default: + Assert(false); + } - case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), - wp->api.conn_error_message(sk)); - ShutdownConnection(sk); - return false; - default: - Assert(false); - return false; + sk->outbuf.len += req->endLsn - req->beginLsn; + + writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); + + /* Mark current message as sent, whatever the result is */ + sk->streamingAt = req->endLsn; + + switch (writeResult) + { + case PG_ASYNC_WRITE_SUCCESS: + /* Continue writing the next message */ + sk->active_state = SS_ACTIVE_SEND; + break; + + case PG_ASYNC_WRITE_TRY_FLUSH: + + /* + * We still need to call PQflush some more to finish the + * job. Caller function will handle this by setting right + * event set. + */ + sk->active_state = SS_ACTIVE_FLUSH; + return true; + + case PG_ASYNC_WRITE_FAIL: + walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); + ShutdownConnection(sk); + return false; + default: + Assert(false); + return false; + } } } @@ -1422,7 +1406,7 @@ SendAppendRequests(Safekeeper *sk) /* * Receive and process all available feedback. * - * Can change state if Async* functions encounter errors and reset connection. + * Resets state and kills the connection if any error on it is encountered. * Returns false in this case, true otherwise. * * NB: This function can call SendMessageToNode and produce new messages. @@ -1608,39 +1592,77 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) return responses[wp->n_safekeepers - wp->quorum]; } +/* + * Return safekeeper with active connection from which WAL can be downloaded, or + * none if it doesn't exist. donor_lsn is set to end position of the donor to + * the best of our knowledge. + */ +Safekeeper * +GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) +{ + *donor_lsn = InvalidXLogRecPtr; + Safekeeper *donor = NULL; + int i; + + if (wp->n_votes < wp->quorum) + { + walprop_log(WARNING, "GetDonor called before elections are won"); + return NULL; + } + + /* + * First, consider node which had determined our term start LSN as we know + * about its position immediately after election before any feedbacks are + * sent. 
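+ * Feedback from any SS_ACTIVE safekeeper can then override this choice when its + * reported flushLsn is higher, so the donor may change as append responses + * arrive.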
+ */ + if (wp->safekeeper[wp->donor].state >= SS_IDLE) + { + donor = &wp->safekeeper[wp->donor]; + *donor_lsn = wp->propEpochStartLsn; + } + + /* + * But also check feedbacks from all nodes with live connections and take + * the highest one. Note: if a node sends feedback, it has already processed + * the elected message, so its term is fine. + */ + for (i = 0; i < wp->n_safekeepers; i++) + { + Safekeeper *sk = &wp->safekeeper[i]; + + if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn) + { + donor = sk; + *donor_lsn = sk->appendResponse.flushLsn; + } + } + return donor; +} + static void HandleSafekeeperResponse(WalProposer *wp) { XLogRecPtr minQuorumLsn; - XLogRecPtr minFlushLsn; + XLogRecPtr candidateTruncateLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); wp->api.process_safekeeper_feedback(wp, minQuorumLsn); /* - * Try to advance truncateLsn to minFlushLsn, which is the last record - * flushed to all safekeepers. We must always start streaming from the - * beginning of the record, which simplifies decoding on the far end. + * Try to advance truncateLsn -- the last record flushed to all + * safekeepers. * - * Advanced truncateLsn should be not further than nearest commitLsn. This - * prevents surprising violation of truncateLsn <= commitLsn invariant - * which might occur because 1) truncateLsn can be advanced immediately - * once chunk is broadcast to all safekeepers, and commitLsn generally - * can't be advanced based on feedback from safekeeper who is still in the - * previous epoch (similar to 'leader can't commit entries from previous - * term' in Raft); 2) chunks we read from WAL and send are plain sheets of - * bytes, but safekeepers ack only on record boundaries. + * Advanced truncateLsn should not be higher than commitLsn. This prevents + * surprising violation of the truncateLsn <= commitLsn invariant which might + * occur because commitLsn generally can't be advanced based on feedback + * from a safekeeper who is still in the previous epoch (similar to 'leader + * can't commit entries from previous term' in Raft). */ - minFlushLsn = CalculateMinFlushLsn(wp); - if (minFlushLsn > wp->truncateLsn) + candidateTruncateLsn = CalculateMinFlushLsn(wp); + candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn); + if (candidateTruncateLsn > wp->truncateLsn) { - wp->truncateLsn = minFlushLsn; - - /* - * Advance the replication slot to free up old WAL files. Note that - * slot doesn't exist if we are in syncSafekeepers mode.
- */ - wp->api.confirm_wal_streamed(wp, wp->truncateLsn); + wp->truncateLsn = candidateTruncateLsn; } /* @@ -1713,7 +1735,7 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) case PG_ASYNC_READ_FAIL: walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, - sk->port, FormatSafekeeperState(sk->state), + sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; @@ -1753,7 +1775,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) if (tag != anymsg->tag) { walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk->state)); + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return false; } @@ -1824,12 +1846,13 @@ static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state) { WalProposer *wp = sk->wp; - uint32 events; + uint32 sk_events; + uint32 nwr_events; if (!wp->api.conn_blocking_write(sk, msg, msg_size)) { walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), + sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; @@ -1841,9 +1864,15 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes * If the new state will be waiting for events to happen, update the event * set to wait for those */ - events = SafekeeperStateDesiredEvents(success_state); - if (events) - wp->api.update_event_set(sk, events); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * nwr_events is relevant only during SS_ACTIVE which doesn't use + * BlockingWrite + */ + Assert(!nwr_events); + if (sk_events) + wp->api.update_event_set(sk, sk_events); return true; } @@ -1876,7 +1905,7 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta return false; case PG_ASYNC_WRITE_FAIL: walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), + sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; @@ -1915,7 +1944,7 @@ AsyncFlush(Safekeeper *sk) return false; case -1: walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk->state), + sk->host, sk->port, FormatSafekeeperState(sk), wp->api.conn_error_message(sk)); ResetConnection(sk); return false; @@ -1945,18 +1974,18 @@ CompareLsn(const void *a, const void *b) * * The strings are intended to be used as a prefix to "state", e.g.: * - * walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk->state)); + * walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); * * If this sort of phrasing doesn't fit the message, instead use something like: * - * walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk->state)); + * walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); */ static char * -FormatSafekeeperState(SafekeeperState state) +FormatSafekeeperState(Safekeeper *sk) { char *return_val = NULL; - switch (state) + switch (sk->state) { case SS_OFFLINE: return_val = "offline"; @@ -1984,7 +2013,18 @@ FormatSafekeeperState(SafekeeperState state) return_val = "idle"; break; case SS_ACTIVE: - return_val = "active"; + switch (sk->active_state) + { + case SS_ACTIVE_SEND: + return_val = "active send"; + break; + case 
SS_ACTIVE_READ_WAL: + return_val = "active read WAL"; + break; + case SS_ACTIVE_FLUSH: + return_val = "active flush"; + break; + } break; } @@ -1997,22 +2037,21 @@ FormatSafekeeperState(SafekeeperState state) static void AssertEventsOkForState(uint32 events, Safekeeper *sk) { - WalProposer *wp = sk->wp; - uint32 expected = SafekeeperStateDesiredEvents(sk->state); - - /* - * The events are in-line with what we're expecting, under two conditions: - * (a) if we aren't expecting anything, `events` has no read- or - * write-ready component. (b) if we are expecting something, there's - * overlap (i.e. `events & expected != 0`) - */ + uint32 sk_events; + uint32 nwr_events; + uint32 expected; bool events_ok_for_state; /* long name so the `Assert` is more * clear later */ + WalProposer *wp = sk->wp; - if (expected == WL_NO_EVENTS) - events_ok_for_state = ((events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) == 0); - else - events_ok_for_state = ((events & expected) != 0); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * Without one more level of notify target indirection we have no way to + * distinguish which socket woke up us, so just union expected events. + */ + expected = sk_events | nwr_events; + events_ok_for_state = ((events & expected) != 0); if (!events_ok_for_state) { @@ -2021,36 +2060,39 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk) * and then an assertion that's guaranteed to fail. */ walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state)); + FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); Assert(events_ok_for_state); } } -/* Returns the set of events a safekeeper in this state should be waiting on +/* Returns the set of events for both safekeeper (sk_events) and neon_walreader + * (nwr_events) sockets a safekeeper in this state should be waiting on. * * This will return WL_NO_EVENTS (= 0) for some events. */ -static uint32 -SafekeeperStateDesiredEvents(SafekeeperState state) +void +SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events) { - uint32 result = WL_NO_EVENTS; + WalProposer *wp = sk->wp; + + *nwr_events = 0; /* nwr_events is empty for most states */ /* If the state doesn't have a modifier, we can check the base state */ - switch (state) + switch (sk->state) { /* Connecting states say what they want in the name */ case SS_CONNECTING_READ: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; case SS_CONNECTING_WRITE: - result = WL_SOCKET_WRITEABLE; - break; + *sk_events = WL_SOCKET_WRITEABLE; + return; /* Reading states need the socket to be read-ready to continue */ case SS_WAIT_EXEC_RESULT: case SS_HANDSHAKE_RECV: case SS_WAIT_VERDICT: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; /* * Idle states use read-readiness as a sign that the connection @@ -2058,32 +2100,66 @@ SafekeeperStateDesiredEvents(SafekeeperState state) */ case SS_VOTING: case SS_IDLE: - result = WL_SOCKET_READABLE; - break; + *sk_events = WL_SOCKET_READABLE; + return; - /* - * Flush states require write-ready for flushing. Active state - * does both reading and writing. - * - * TODO: SS_ACTIVE sometimes doesn't need to be write-ready. We - * should check sk->flushWrite here to set WL_SOCKET_WRITEABLE. 
- */ case SS_SEND_ELECTED_FLUSH: + *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; + return; + case SS_ACTIVE: - result = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; - break; + switch (sk->active_state) + { + /* + * Everything is sent; we just wait for sk responses and + * latch. + * + * Note: this assumes we send all available WAL to + * safekeeper in one wakeup (unless it blocks). Otherwise + * we would want WL_SOCKET_WRITEABLE here to finish the + * work. + */ + case SS_ACTIVE_SEND: + *sk_events = WL_SOCKET_READABLE; + /* c.f. walprop_pg_active_state_update_event_set */ +#if PG_VERSION_NUM >= 150000 + if (wp->api.wal_reader_events(sk)) + *nwr_events = WL_SOCKET_CLOSED; +#endif /* on PG 14 nwr_events remains 0 */ + return; + + /* + * Waiting for neon_walreader socket, but we still read + * responses from sk socket. + */ + case SS_ACTIVE_READ_WAL: + *sk_events = WL_SOCKET_READABLE; + *nwr_events = wp->api.wal_reader_events(sk); + return; + + /* + * Need to flush the sk socket, so ignore neon_walreader + * one and set write interest on sk. + */ + case SS_ACTIVE_FLUSH: + *sk_events = WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE; +#if PG_VERSION_NUM >= 150000 + /* c.f. walprop_pg_active_state_update_event_set */ + if (wp->api.wal_reader_events(sk)) + *nwr_events = WL_SOCKET_CLOSED; +#endif /* on PG 14 nwr_events remains 0 */ + return; + } + return; /* The offline state expects no events. */ case SS_OFFLINE: - result = WL_NO_EVENTS; - break; + *sk_events = 0; + return; default: Assert(false); - break; } - - return result; } /* Returns a human-readable string corresponding to the event set diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 615018c58e..6d478076fe 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -1,14 +1,15 @@ #ifndef __NEON_WALPROPOSER_H__ #define __NEON_WALPROPOSER_H__ -#include "postgres.h" -#include "access/xlogdefs.h" -#include "port.h" -#include "access/xlog_internal.h" #include "access/transam.h" +#include "access/xlogdefs.h" +#include "access/xlog_internal.h" #include "nodes/replnodes.h" -#include "utils/uuid.h" #include "replication/walreceiver.h" +#include "utils/uuid.h" + +#include "libpqwalproposer.h" +#include "neon_walreader.h" #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 @@ -22,43 +23,9 @@ */ #define WL_NO_EVENTS 0 -struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */ +struct WalProposerConn; /* Defined in libpqwalproposer.h */ typedef struct WalProposerConn WalProposerConn; -/* Possible return values from ReadPGAsync */ -typedef enum -{ - /* The full read was successful. buf now points to the data */ - PG_ASYNC_READ_SUCCESS, - - /* - * The read is ongoing. Wait until the connection is read-ready, then try - * again. - */ - PG_ASYNC_READ_TRY_AGAIN, - /* Reading failed. Check PQerrorMessage(conn) */ - PG_ASYNC_READ_FAIL, -} PGAsyncReadResult; - -/* Possible return values from WritePGAsync */ -typedef enum -{ - /* The write fully completed */ - PG_ASYNC_WRITE_SUCCESS, - - /* - * The write started, but you'll need to call PQflush some more times to - * finish it off. We just tried, so it's best to wait until the connection - * is read- or write-ready to try again. - * - * If it becomes read-ready, call PQconsumeInput and flush again. If it - * becomes write-ready, just call PQflush. - */ - PG_ASYNC_WRITE_TRY_FLUSH, - /* Writing failed. Check PQerrorMessage(conn) */ - PG_ASYNC_WRITE_FAIL, -} PGAsyncWriteResult; - /* * WAL safekeeper state, which is used to wait for some event. 
* @@ -135,6 +102,40 @@ typedef enum SS_ACTIVE, } SafekeeperState; +/* + * Sending WAL substates of SS_ACTIVE. + */ +typedef enum +{ + /* + * We are ready to send more WAL, waiting for latch set to learn about + * more WAL becoming available (or just a timeout to send heartbeat). + */ + SS_ACTIVE_SEND, + + /* + * Polling neon_walreader to receive chunk of WAL (probably remotely) to + * send to this safekeeper. + * + * Note: socket management is done completely inside walproposer_pg for + * simplicity, and thus simulation doesn't test it. Which is fine as + * simulation is mainly aimed at consensus checks, not waiteventset + * management. + * + * Also, while in this state we don't touch safekeeper socket, so in + * theory it might close connection as inactive. This can be addressed if + * needed; however, while fetching WAL we should regularly send it, so the + * problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle + * walreader socket), but similarly shouldn't be a problem. + */ + SS_ACTIVE_READ_WAL, + + /* + * Waiting for write readiness to flush the socket. + */ + SS_ACTIVE_FLUSH, +} SafekeeperActiveState; + /* Consensus logical timestamp. */ typedef uint64 term_t; @@ -343,12 +344,11 @@ typedef struct Safekeeper */ XLogRecPtr startStreamingAt; - bool flushWrite; /* set to true if we need to call AsyncFlush,* - * to flush pending messages */ XLogRecPtr streamingAt; /* current streaming position */ AppendRequestHeader appendRequest; /* request for sending to safekeeper */ SafekeeperState state; /* safekeeper state machine state */ + SafekeeperActiveState active_state; TimestampTz latestMsgReceivedAt; /* when latest msg is received */ AcceptorGreeting greetResponse; /* acceptor greeting */ VoteResponse voteResponse; /* the vote */ @@ -369,12 +369,27 @@ typedef struct Safekeeper /* * WAL reader, allocated for each safekeeper. */ - XLogReaderState *xlogreader; + NeonWALReader *xlogreader; /* * Position in wait event set. Equal to -1 if no event */ int eventPos; + + /* + * Neon WAL reader position in wait event set, or -1 if no socket. Note + * that event must be removed not only on error/failure, but also on + * successful *local* read, as next read might again be remote, but with + * different socket. + */ + int nwrEventPos; + + /* + * Per libpq docs, during connection establishment socket might change, + * remember here if it is stable to avoid readding to the event set if + * possible. Must be reset whenever nwr event is deleted. + */ + bool nwrConnEstablished; #endif @@ -403,31 +418,6 @@ typedef enum */ } WalProposerConnectPollStatusType; -/* Re-exported and modified ExecStatusType */ -typedef enum -{ - /* We received a single CopyBoth result */ - WP_EXEC_SUCCESS_COPYBOTH, - - /* - * Any success result other than a single CopyBoth was received. The - * specifics of the result were already logged, but it may be useful to - * provide an error message indicating which safekeeper messed up. - * - * Do not expect PQerrorMessage to be appropriately set. - */ - WP_EXEC_UNEXPECTED_SUCCESS, - - /* - * No result available at this time. Wait until read-ready, then call - * again. Internally, this is returned when PQisBusy indicates that - * PQgetResult would block. - */ - WP_EXEC_NEEDS_INPUT, - /* Catch-all failure. Check PQerrorMessage. */ - WP_EXEC_FAILED, -} WalProposerExecStatusType; - /* Re-exported ConnStatusType */ typedef enum { @@ -488,7 +478,7 @@ typedef struct walproposer_api /* Flush buffer to the network, aka PQflush. 
*/ int (*conn_flush) (Safekeeper *sk); - /* Close the connection, aka PQfinish. */ + /* Reset sk state: close pq connection, deallocate xlogreader. */ void (*conn_finish) (Safekeeper *sk); /* @@ -505,17 +495,20 @@ typedef struct walproposer_api /* Blocking CopyData write, aka PQputCopyData + PQflush. */ bool (*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size); - /* Download WAL from startpos to endpos and make it available locally. */ - bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos); - - /* Read WAL from disk to buf. */ - void (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count); + /* + * Download WAL before basebackup for logical walsenders from sk, if + * needed + */ + bool (*recovery_download) (WalProposer *wp, Safekeeper *sk); /* Allocate WAL reader. */ void (*wal_reader_allocate) (Safekeeper *sk); - /* Deallocate event set. */ - void (*free_event_set) (WalProposer *wp); + /* Read WAL from disk to buf. */ + NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg); + + /* Returns events to be awaited on WAL reader, if any. */ + uint32 (*wal_reader_events) (Safekeeper *sk); /* Initialize event set. */ void (*init_event_set) (WalProposer *wp); @@ -523,9 +516,15 @@ typedef struct walproposer_api /* Update events for an existing safekeeper connection. */ void (*update_event_set) (Safekeeper *sk, uint32 events); + /* Configure wait event set for yield in SS_ACTIVE. */ + void (*active_state_update_event_set) (Safekeeper *sk); + /* Add a new safekeeper connection to the event set. */ void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events); + /* Remove safekeeper connection from event set */ + void (*rm_safekeeper_event_set) (Safekeeper *sk); + /* * Wait until some event happens: - timeout is reached - socket event for * safekeeper connection - new WAL is available @@ -558,26 +557,12 @@ typedef struct walproposer_api */ void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn); - /* - * Called on peer_horizon_lsn updates. Used to advance replication slot - * and to free up disk space by deleting unnecessary WAL. - */ - void (*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn); - /* * Write a log message to the internal log processor. This is used only * when walproposer is compiled as a library. Otherwise, all logging is * handled by elog(). */ void (*log_internal) (WalProposer *wp, int level, const char *line); - - /* - * Called right after the proposer was elected, but before it started - * recovery and sent ProposerElected message to the safekeepers. - * - * Used by logical replication to update truncateLsn. - */ - void (*after_election) (WalProposer *wp); } walproposer_api; /* @@ -711,6 +696,13 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt extern void WalProposerPoll(WalProposer *wp); extern void WalProposerFree(WalProposer *wp); +/* + * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to + * recreate set from scratch, hence the export. 
+ */ +extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events); +extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); + #define WPEVENT 1337 /* special log level for walproposer internal * events */ diff --git a/pgxn/neon/walproposer_compat.c b/pgxn/neon/walproposer_compat.c index 04b519ab15..35d984c52e 100644 --- a/pgxn/neon/walproposer_compat.c +++ b/pgxn/neon/walproposer_compat.c @@ -3,11 +3,13 @@ * This is needed to avoid linking to full postgres server installation. This file * is compiled as a part of libwalproposer static library. */ +#include "postgres.h" #include -#include "walproposer.h" -#include "utils/datetime.h" + #include "miscadmin.h" +#include "utils/datetime.h" +#include "walproposer.h" void ExceptionalCondition(const char *conditionName, diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 551d56d416..7773aabfab 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -12,6 +12,7 @@ #include #include #include "access/xact.h" +#include "access/xlog.h" #include "access/xlogdefs.h" #include "access/xlogutils.h" #include "access/xloginsert.h" @@ -43,14 +44,19 @@ #include "utils/ps_status.h" #include "utils/timestamp.h" -#include "neon.h" -#include "walproposer.h" #include "libpq-fe.h" +#include "libpqwalproposer.h" +#include "neon.h" +#include "neon_walreader.h" +#include "walproposer.h" + #define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */ #define XLOG_HDR_START_POS 1 /* offset of start position in wal sender* * message header */ +#define MB ((XLogRecPtr)1024 * 1024) + #define WAL_PROPOSER_SLOT_NAME "wal_proposer_slot" char *wal_acceptors_list = ""; @@ -91,6 +97,12 @@ static void XLogBroadcastWalProposer(WalProposer *wp); static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr); static void XLogWalPropClose(XLogRecPtr recptr); +static void add_nwr_event_set(Safekeeper *sk, uint32 events); +static void update_nwr_event_set(Safekeeper *sk, uint32 events); +static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); + +static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp); + static void init_walprop_config(bool syncSafekeepers) { @@ -214,7 +226,6 @@ backpressure_lag_impl(void) XLogRecPtr myFlushLsn = GetFlushRecPtr(); #endif replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); -#define MB ((XLogRecPtr)1024 * 1024) elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), @@ -541,14 +552,6 @@ walprop_pg_load_libpqwalreceiver(void) elog(ERROR, "libpqwalreceiver didn't initialize correctly"); } -/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */ -struct WalProposerConn -{ - PGconn *pg_conn; - bool is_nonblocking; /* whether the connection is non-blocking */ - char *recvbuf; /* last received data from walprop_async_read */ -}; - /* Helper function */ static bool ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) @@ -586,16 +589,17 @@ walprop_status(Safekeeper *sk) } } -static void -walprop_connect_start(Safekeeper *sk) +WalProposerConn * +libpqwp_connect_start(char *conninfo) { + PGconn *pg_conn; + WalProposerConn *conn; const char *keywords[3]; const char *values[3]; int n; char *password = neon_auth_token; - Assert(sk->conn == NULL); /* * Connect using the given connection string. 
If the NEON_AUTH_TOKEN @@ -614,7 +618,7 @@ walprop_connect_start(Safekeeper *sk) n++; } keywords[n] = "dbname"; - values[n] = sk->conninfo; + values[n] = conninfo; n++; keywords[n] = NULL; values[n] = NULL; @@ -635,11 +639,20 @@ walprop_connect_start(Safekeeper *sk) * palloc will exit on failure though, so there's not much we could do if * it *did* fail. */ - sk->conn = palloc(sizeof(WalProposerConn)); - sk->conn->pg_conn = pg_conn; - sk->conn->is_nonblocking = false; /* connections always start in - * blocking mode */ - sk->conn->recvbuf = NULL; + conn = palloc(sizeof(WalProposerConn)); + conn->pg_conn = pg_conn; + conn->is_nonblocking = false; /* connections always start in blocking + * mode */ + conn->recvbuf = NULL; + return conn; +} + +static void +walprop_connect_start(Safekeeper *sk) +{ + Assert(sk->conn == NULL); + sk->conn = libpqwp_connect_start(sk->conninfo); + } static WalProposerConnectPollStatusType @@ -683,26 +696,33 @@ walprop_connect_poll(Safekeeper *sk) return return_val; } -static bool -walprop_send_query(Safekeeper *sk, char *query) +extern bool +libpqwp_send_query(WalProposerConn *conn, char *query) { /* * We need to be in blocking mode for sending the query to run without * requiring a call to PQflush */ - if (!ensure_nonblocking_status(sk->conn, false)) + if (!ensure_nonblocking_status(conn, false)) return false; /* PQsendQuery returns 1 on success, 0 on failure */ - if (!PQsendQuery(sk->conn->pg_conn, query)) + if (!PQsendQuery(conn->pg_conn, query)) return false; return true; } -static WalProposerExecStatusType -walprop_get_query_result(Safekeeper *sk) +static bool +walprop_send_query(Safekeeper *sk, char *query) { + return libpqwp_send_query(sk->conn, query); +} + +WalProposerExecStatusType +libpqwp_get_query_result(WalProposerConn *conn) +{ + PGresult *result; WalProposerExecStatusType return_val; @@ -710,14 +730,14 @@ walprop_get_query_result(Safekeeper *sk) char *unexpected_success = NULL; /* Consume any input that we might be missing */ - if (!PQconsumeInput(sk->conn->pg_conn)) + if (!PQconsumeInput(conn->pg_conn)) return WP_EXEC_FAILED; - if (PQisBusy(sk->conn->pg_conn)) + if (PQisBusy(conn->pg_conn)) return WP_EXEC_NEEDS_INPUT; - result = PQgetResult(sk->conn->pg_conn); + result = PQgetResult(conn->pg_conn); /* * PQgetResult returns NULL only if getting the result was successful & @@ -778,6 +798,12 @@ walprop_get_query_result(Safekeeper *sk) return return_val; } +static WalProposerExecStatusType +walprop_get_query_result(Safekeeper *sk) +{ + return libpqwp_get_query_result(sk->conn); +} + static pgsocket walprop_socket(Safekeeper *sk) { @@ -790,42 +816,31 @@ walprop_flush(Safekeeper *sk) return (PQflush(sk->conn->pg_conn)); } -static void -walprop_finish(Safekeeper *sk) +/* Like libpqrcv_receive. *buf is valid until the next call. */ +PGAsyncReadResult +libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) { - if (!sk->conn) - return; + int rawlen; - if (sk->conn->recvbuf != NULL) - PQfreemem(sk->conn->recvbuf); - PQfinish(sk->conn->pg_conn); - pfree(sk->conn); - sk->conn = NULL; -} - -/* - * Receive a message from the safekeeper. - * - * On success, the data is placed in *buf. It is valid until the next call - * to this function. 
- */ -static PGAsyncReadResult -walprop_async_read(Safekeeper *sk, char **buf, int *amount) -{ - int result; - - if (sk->conn->recvbuf != NULL) + if (conn->recvbuf != NULL) { - PQfreemem(sk->conn->recvbuf); - sk->conn->recvbuf = NULL; + PQfreemem(conn->recvbuf); + conn->recvbuf = NULL; } - /* Call PQconsumeInput so that we have the data we need */ - if (!PQconsumeInput(sk->conn->pg_conn)) + /* Try to receive a CopyData message */ + rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); + if (rawlen == 0) { - *amount = 0; - *buf = NULL; - return PG_ASYNC_READ_FAIL; + /* Try consuming some data. */ + if (!PQconsumeInput(conn->pg_conn)) + { + *amount = 0; + *buf = NULL; + return PG_ASYNC_READ_FAIL; + } + /* Now that we've consumed some input, try again */ + rawlen = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true); } /* @@ -839,7 +854,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) * sometimes be triggered by the server returning an ErrorResponse (which * also happens to have the effect that the copy is done). */ - switch (result = PQgetCopyData(sk->conn->pg_conn, &sk->conn->recvbuf, true)) + switch (rawlen) { case 0: *amount = 0; @@ -854,7 +869,7 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) * We can check PQgetResult to make sure that the server * failed; it'll always result in PGRES_FATAL_ERROR */ - ExecStatusType status = PQresultStatus(PQgetResult(sk->conn->pg_conn)); + ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); if (status != PGRES_FATAL_ERROR) elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); @@ -874,12 +889,24 @@ walprop_async_read(Safekeeper *sk, char **buf, int *amount) return PG_ASYNC_READ_FAIL; default: /* Positive values indicate the size of the returned result */ - *amount = result; - *buf = sk->conn->recvbuf; + *amount = rawlen; + *buf = conn->recvbuf; return PG_ASYNC_READ_SUCCESS; } } +/* + * Receive a message from the safekeeper. + * + * On success, the data is placed in *buf. It is valid until the next call + * to this function. + */ +static PGAsyncReadResult +walprop_async_read(Safekeeper *sk, char **buf, int *amount) +{ + return libpqwp_async_read(sk->conn, buf, amount); +} + static PGAsyncWriteResult walprop_async_write(Safekeeper *sk, void const *buf, size_t size) { @@ -962,6 +989,33 @@ walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size) return true; } +void +libpqwp_disconnect(WalProposerConn *conn) +{ + if (conn->recvbuf != NULL) + PQfreemem(conn->recvbuf); + PQfinish(conn->pg_conn); + pfree(conn); +} + +static void +walprop_finish(Safekeeper *sk) +{ + if (sk->conn) + { + libpqwp_disconnect(sk->conn); + sk->conn = NULL; + } + + /* free xlogreader */ + if (sk->xlogreader) + { + NeonWALReaderFree(sk->xlogreader); + sk->xlogreader = NULL; + } + rm_safekeeper_event_set(sk, false); +} + /* * Subscribe for new WAL and stream it in the loop to safekeepers. 
* @@ -1165,16 +1219,38 @@ XLogBroadcastWalProposer(WalProposer *wp) } } -/* - * Receive WAL from most advanced safekeeper - */ +/* Download WAL before basebackup for logical walsenders from sk, if needed */ static bool -WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos) +WalProposerRecovery(WalProposer *wp, Safekeeper *sk) { char *err; WalReceiverConn *wrconn; WalRcvStreamOptions options; char conninfo[MAXCONNINFO]; + TimeLineID timeline; + XLogRecPtr startpos; + XLogRecPtr endpos; + uint64 download_range_mb; + + startpos = GetLogRepRestartLSN(wp); + if (startpos == InvalidXLogRecPtr) + return true; /* recovery not needed */ + endpos = wp->propEpochStartLsn; + + /* + * If we need to download more than a max_slot_wal_keep_size, cap to it to + * avoid risk of exploding pg_wal. Logical replication won't work until + * recreated, but at least compute would start; this also follows + * max_slot_wal_keep_size semantics. + */ + download_range_mb = (endpos - startpos) / 1024 / 1024; + if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) + { + startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024; + walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB", + LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb); + } + timeline = wp->greetRequest.timeline; if (!neon_auth_token) { @@ -1204,7 +1280,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL return false; } elog(LOG, - "start recovery from %s:%s starting from %X/%08X till %X/%08X timeline " + "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " "%d", sk->host, sk->port, (uint32) (startpos >> 32), (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); @@ -1400,30 +1476,56 @@ XLogWalPropClose(XLogRecPtr recptr) walpropFile = -1; } -static void -walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count) -{ - WALReadError errinfo; - - if (!WALRead(sk->xlogreader, - buf, - startptr, - count, - walprop_pg_get_timeline_id(), - &errinfo)) - { - WALReadRaiseError(&errinfo); - } -} - static void walprop_pg_wal_reader_allocate(Safekeeper *sk) { - sk->xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL); + char log_prefix[64]; + + snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port); + Assert(!sk->xlogreader); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); if (sk->xlogreader == NULL) elog(FATAL, "Failed to allocate xlog reader"); } +static NeonWALReadResult +walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, char **errmsg) +{ + NeonWALReadResult res; + + res = NeonWALRead(sk->xlogreader, + buf, + startptr, + count, + walprop_pg_get_timeline_id()); + + if (res == NEON_WALREAD_SUCCESS) + { + /* + * If we have the socket subscribed, but walreader doesn't need any + * events, it must mean that remote connection just closed hoping to + * do next read locally. Remove the socket then. It is important to do + * as otherwise next read might open another connection and we won't + * be able to distinguish whether we have correct socket added in wait + * event set. 
+ */ + if (NeonWALReaderEvents(sk->xlogreader) == 0) + rm_safekeeper_event_set(sk, false); + } + else if (res == NEON_WALREAD_ERROR) + { + *errmsg = NeonWALReaderErrMsg(sk->xlogreader); + } + + return res; +} + +static uint32 +walprop_pg_wal_reader_events(Safekeeper *sk) +{ + return NeonWALReaderEvents(sk->xlogreader); +} + static WaitEventSet *waitEvents; static void @@ -1438,6 +1540,8 @@ walprop_pg_free_event_set(WalProposer *wp) for (int i = 0; i < wp->n_safekeepers; i++) { wp->safekeeper[i].eventPos = -1; + wp->safekeeper[i].nwrEventPos = -1; + wp->safekeeper[i].nwrConnEstablished = false; } } @@ -1447,11 +1551,37 @@ walprop_pg_init_event_set(WalProposer *wp) if (waitEvents) elog(FATAL, "double-initialization of event set"); - waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + wp->n_safekeepers); + /* for each sk, we have socket plus potentially socket for neon walreader */ + waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL); AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL); + + for (int i = 0; i < wp->n_safekeepers; i++) + { + wp->safekeeper[i].eventPos = -1; + wp->safekeeper[i].nwrEventPos = -1; + wp->safekeeper[i].nwrConnEstablished = false; + } +} + +/* add safekeeper socket to wait event set */ +static void +walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) +{ + Assert(sk->eventPos == -1); + sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk); +} + +/* add neon wal reader socket to wait event set */ +static void +add_nwr_event_set(Safekeeper *sk, uint32 events) +{ + Assert(sk->nwrEventPos == -1); + sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk); + sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader); + elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); } static void @@ -1463,10 +1593,144 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events) ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL); } +/* + * Update neon_walreader event. + * Can be called when nwr socket doesn't exist, does nothing in this case. + */ static void -walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events) +update_nwr_event_set(Safekeeper *sk, uint32 events) { - sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk); + /* eventPos = -1 when we don't have an event */ + if (sk->nwrEventPos != -1) + ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL); +} + + +static void +walprop_pg_active_state_update_event_set(Safekeeper *sk) +{ + uint32 sk_events; + uint32 nwr_events; + + Assert(sk->state == SS_ACTIVE); + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + /* + * If we need to wait for neon_walreader, ensure we have up to date socket + * in the wait event set. + */ + if (sk->active_state == SS_ACTIVE_READ_WAL) + { + /* + * If conn is established and socket is thus stable, update the event + * directly; otherwise re-add it. + */ + if (sk->nwrConnEstablished) + { + Assert(sk->nwrEventPos != -1); + update_nwr_event_set(sk, nwr_events); + } + else + { + rm_safekeeper_event_set(sk, false); + add_nwr_event_set(sk, nwr_events); + } + } + else + { + /* + * Hack: we should always set 0 here, but for random reasons + * WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least + * some event. 
Since there is also no way to remove socket except + * reconstructing the whole set, SafekeeperStateDesiredEvents instead + * gives WL_SOCKET_CLOSED if socket exists. We never expect it to + * trigger. + * + * On PG 14 which doesn't have WL_SOCKET_CLOSED resort to event + * removal. + */ +#if PG_VERSION_NUM >= 150000 + Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0); + update_nwr_event_set(sk, WL_SOCKET_CLOSED); +#else /* pg 14 */ + rm_safekeeper_event_set(sk, false); +#endif + } + walprop_pg_update_event_set(sk, sk_events); +} + +static void +walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove) +{ + rm_safekeeper_event_set(to_remove, true); +} + +/* + * A hacky way to remove single event from the event set. Can be called if event + * doesn't exist, does nothing in this case. + * + * Note: Internally, this completely reconstructs the event set. It should be + * avoided if possible. + * + * If is_sk is true, socket of connection to safekeeper is removed; otherwise + * socket of neon_walreader. + */ +static void +rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk) +{ + WalProposer *wp = to_remove->wp; + + elog(DEBUG5, "sk %s:%s: removing event, is_sk %d", + to_remove->host, to_remove->port, is_sk); + + /* + * Shortpath for exiting if have nothing to do. We never call this + * function with safekeeper socket not existing, but do that with neon + * walreader socket. + */ + if ((is_sk && to_remove->eventPos == -1) || + (!is_sk && to_remove->nwrEventPos == -1)) + { + return; + } + + /* Remove the existing event set, assign sk->eventPos = -1 */ + walprop_pg_free_event_set(wp); + + /* Re-initialize it without adding any safekeeper events */ + wp->api.init_event_set(wp); + + /* + * loop through the existing safekeepers. If they aren't the one we're + * removing, and if they have a socket we can use, re-add the applicable + * events. + */ + for (int i = 0; i < wp->n_safekeepers; i++) + { + Safekeeper *sk = &wp->safekeeper[i]; + + /* + * If this safekeeper isn't offline, add events for it, except for the + * event requested to remove. + */ + if (sk->state != SS_OFFLINE) + { + uint32 sk_events; + uint32 nwr_events; + + SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events); + + if (sk != to_remove || !is_sk) + { + /* will set sk->eventPos */ + wp->api.add_safekeeper_event_set(sk, sk_events); + } + if ((sk != to_remove || is_sk) && nwr_events) + { + add_nwr_event_set(sk, nwr_events); + } + } + } } static int @@ -1482,6 +1746,21 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 #if PG_MAJORVERSION_NUM >= 16 if (WalSndCtl != NULL) ConditionVariablePrepareToSleep(&WalSndCtl->wal_flush_cv); + + /* + * Now that we prepared the condvar, check flush ptr again -- it might + * have changed before we subscribed to cv so we missed the wakeup. + * + * Do that only when we're interested in new WAL: without sync-safekeepers + * and if election already passed. + */ + if (!wp->config->syncSafekeepers && wp->availableLsn != InvalidXLogRecPtr && GetFlushRecPtr(NULL) > wp->availableLsn) + { + ConditionVariableCancelSleep(); + ResetLatch(MyLatch); + *events = WL_LATCH_SET; + return 1; + } #endif /* @@ -1533,7 +1812,7 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn) } /* - * Get PageserverFeedback fields from the most advanced safekeeper + * Choose most advanced PageserverFeedback and set it to *rf. 
*/ static void GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) @@ -1563,8 +1842,6 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) LSN_FORMAT_ARGS(rf->disk_consistent_lsn), LSN_FORMAT_ARGS(rf->remote_consistent_lsn), rf->replytime); - - replication_feedback_set(rf); } /* @@ -1604,63 +1881,69 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) hs->catalog_xmin = InvalidFullTransactionId; } +/* + * Based on commitLsn and safekeeper responses including pageserver feedback, + * 1) Propagate cluster size received from ps to ensure the limit. + * 2) Propagate pageserver LSN positions to ensure backpressure limits. + * 3) Advance walproposer slot to commitLsn (releasing WAL & waking up waiters). + * 4) Propagate hot standby feedback. + * + * None of that is functional in sync-safekeepers. + */ static void walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) { HotStandbyFeedback hsFeedback; - XLogRecPtr diskConsistentLsn; + XLogRecPtr oldDiskConsistentLsn; - diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; + if (wp->config->syncSafekeepers) + return; - if (!wp->config->syncSafekeepers) + oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; + + /* Get PageserverFeedback fields from the most advanced safekeeper */ + GetLatestNeonFeedback(&quorumFeedback.rf, wp); + replication_feedback_set(&quorumFeedback.rf); + SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); + + if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) { - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf, wp); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - } - - if (commitLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) - { - if (commitLsn > quorumFeedback.flushLsn) quorumFeedback.flushLsn = commitLsn; - /* advance the replication slot */ - if (!wp->config->syncSafekeepers) - ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, + /* + * Advance the replication slot to commitLsn. WAL before it is + * hardened and will be fetched from one of safekeepers by + * neon_walreader if needed. + * + * Also wakes up syncrep waiters. + */ + ProcessStandbyReply( + /* write_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, + /* flush_lsn - This is what durably stored in WAL service. */ + quorumFeedback.flushLsn, - /* - * apply_lsn - This is what processed and durably saved at* - * pageserver. - */ - quorumFeedback.rf.disk_consistent_lsn, - walprop_pg_get_current_timestamp(wp), false); + /* + * apply_lsn - This is what processed and durably saved at* + * pageserver. 
+ */ + quorumFeedback.rf.disk_consistent_lsn, + walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) { quorumFeedback.hs = hsFeedback; - if (!wp->config->syncSafekeepers) - ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + ProcessStandbyHSFeedback(hsFeedback.ts, + XidFromFullTransactionId(hsFeedback.xmin), + EpochFromFullTransactionId(hsFeedback.xmin), + XidFromFullTransactionId(hsFeedback.catalog_xmin), + EpochFromFullTransactionId(hsFeedback.catalog_xmin)); } } -static void -walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn) -{ - if (MyReplicationSlot) - PhysicalConfirmReceivedLocation(lsn); -} - static XLogRecPtr walprop_pg_get_redo_start_lsn(WalProposer *wp) { @@ -1679,15 +1962,15 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line) elog(FATAL, "unexpected log_internal message at level %d: %s", level, line); } -static void -walprop_pg_after_election(WalProposer *wp) +static XLogRecPtr +GetLogRepRestartLSN(WalProposer *wp) { FILE *f; - XLogRecPtr lrRestartLsn; + XLogRecPtr lrRestartLsn = InvalidXLogRecPtr; /* We don't need to do anything in syncSafekeepers mode. */ if (wp->config->syncSafekeepers) - return; + return InvalidXLogRecPtr; /* * If there are active logical replication subscription we need to provide @@ -1695,22 +1978,40 @@ walprop_pg_after_election(WalProposer *wp) * replication slots. */ f = fopen("restart.lsn", "rb"); - if (f != NULL && !wp->config->syncSafekeepers) + if (f != NULL) { - fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); + size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); + fclose(f); - if (lrRestartLsn != InvalidXLogRecPtr) + if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr) { - elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + uint64 download_range_mb; + + elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + + /* + * If we need to download more than a max_slot_wal_keep_size, + * don't do it to avoid risk of exploding pg_wal. Logical + * replication won't work until recreated, but at least compute + * would start; this also follows max_slot_wal_keep_size + * semantics. 
+ */ + download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB; + if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) + { + walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", + LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); + return InvalidXLogRecPtr; + } /* * start from the beginning of the segment to fetch page headers * verifed by XLogReader */ lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size); - wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn); } } + return lrRestartLsn; } static const walproposer_api walprop_pg = { @@ -1730,18 +2031,18 @@ static const walproposer_api walprop_pg = { .conn_async_write = walprop_async_write, .conn_blocking_write = walprop_blocking_write, .recovery_download = WalProposerRecovery, - .wal_read = walprop_pg_wal_read, .wal_reader_allocate = walprop_pg_wal_reader_allocate, - .free_event_set = walprop_pg_free_event_set, + .wal_read = walprop_pg_wal_read, + .wal_reader_events = walprop_pg_wal_reader_events, .init_event_set = walprop_pg_init_event_set, .update_event_set = walprop_pg_update_event_set, + .active_state_update_event_set = walprop_pg_active_state_update_event_set, .add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set, + .rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set, .wait_event_set = walprop_pg_wait_event_set, .strong_random = walprop_pg_strong_random, .get_redo_start_lsn = walprop_pg_get_redo_start_lsn, .finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers, .process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback, - .confirm_wal_streamed = walprop_pg_confirm_wal_streamed, .log_internal = walprop_pg_log_internal, - .after_election = walprop_pg_after_election, }; diff --git a/poetry.lock b/poetry.lock index 8583a71f85..76dfd6d37d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2092,51 +2092,61 @@ files = [ [[package]] name = "pyyaml" -version = "6.0" +version = "6.0.1" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.6" files = [ - {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, - {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, - {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, - {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, - {file = 
"PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, - {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, - {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, - {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, - {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, - {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, - {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, - {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, - {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, - {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, - {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, - {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, - 
{file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, - {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, - {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, - {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, + {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, + {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, + {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, + {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, + {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, + {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, + {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, + {file = 
"PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, + {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, + {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, + {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, + {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] [[package]] @@ -2553,85 +2563,101 @@ files = [ [[package]] name = "yarl" -version = "1.8.2" +version = "1.9.4" description = "Yet another URL library" optional = false python-versions = ">=3.7" files = [ - {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"}, - {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"}, - {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"}, - {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = 
"sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"}, - {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"}, - {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"}, - {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"}, - {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"}, - {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"}, - {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"}, - {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"}, - {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"}, - {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"}, - {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"}, - {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"}, - {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"}, - {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"}, - {file = "yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"}, - {file = "yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"}, - {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"}, - {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"}, - {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash 
= "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"}, - {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"}, - {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"}, - {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"}, - {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"}, - {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"}, - {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a8c1df72eb746f4136fe9a2e72b0c9dc1da1cbd23b5372f94b5820ff8ae30e0e"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a3a6ed1d525bfb91b3fc9b690c5a21bb52de28c018530ad85093cc488bee2dd2"}, + {file = "yarl-1.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c38c9ddb6103ceae4e4498f9c08fac9b590c5c71b0370f98714768e22ac6fa66"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d9e09c9d74f4566e905a0b8fa668c58109f7624db96a2171f21747abc7524234"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8477c1ee4bd47c57d49621a062121c3023609f7a13b8a46953eb6c9716ca392"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5ff2c858f5f6a42c2a8e751100f237c5e869cbde669a724f2062d4c4ef93551"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:357495293086c5b6d34ca9616a43d329317feab7917518bc97a08f9e55648455"}, + {file = "yarl-1.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:54525ae423d7b7a8ee81ba189f131054defdb122cde31ff17477951464c1691c"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:801e9264d19643548651b9db361ce3287176671fb0117f96b5ac0ee1c3530d53"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e516dc8baf7b380e6c1c26792610230f37147bb754d6426462ab115a02944385"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7d5aaac37d19b2904bb9dfe12cdb08c8443e7ba7d2852894ad448d4b8f442863"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:54beabb809ffcacbd9d28ac57b0db46e42a6e341a030293fb3185c409e626b8b"}, + {file = "yarl-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bac8d525a8dbc2a1507ec731d2867025d11ceadcb4dd421423a5d42c56818541"}, + {file = "yarl-1.9.4-cp310-cp310-win32.whl", hash = "sha256:7855426dfbddac81896b6e533ebefc0af2f132d4a47340cee6d22cac7190022d"}, + {file = "yarl-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:848cd2a1df56ddbffeb375535fb62c9d1645dde33ca4d51341378b3f5954429b"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:35a2b9396879ce32754bd457d31a51ff0a9d426fd9e0e3c33394bf4b9036b099"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c7d56b293cc071e82532f70adcbd8b61909eec973ae9d2d1f9b233f3d943f2c"}, + {file = "yarl-1.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d8a1c6c0be645c745a081c192e747c5de06e944a0d21245f4cf7c05e457c36e0"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b3c1ffe10069f655ea2d731808e76e0f452fc6c749bea04781daf18e6039525"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:549d19c84c55d11687ddbd47eeb348a89df9cb30e1993f1b128f4685cd0ebbf8"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7409f968456111140c1c95301cadf071bd30a81cbd7ab829169fb9e3d72eae9"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e23a6d84d9d1738dbc6e38167776107e63307dfc8ad108e580548d1f2c587f42"}, + {file = "yarl-1.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8b889777de69897406c9fb0b76cdf2fd0f31267861ae7501d93003d55f54fbe"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:03caa9507d3d3c83bca08650678e25364e1843b484f19986a527630ca376ecce"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e9035df8d0880b2f1c7f5031f33f69e071dfe72ee9310cfc76f7b605958ceb9"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:c0ec0ed476f77db9fb29bca17f0a8fcc7bc97ad4c6c1d8959c507decb22e8572"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:ee04010f26d5102399bd17f8df8bc38dc7ccd7701dc77f4a68c5b8d733406958"}, + {file = "yarl-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49a180c2e0743d5d6e0b4d1a9e5f633c62eca3f8a86ba5dd3c471060e352ca98"}, + {file = "yarl-1.9.4-cp311-cp311-win32.whl", hash = "sha256:81eb57278deb6098a5b62e88ad8281b2ba09f2f1147c4767522353eaa6260b31"}, + {file = "yarl-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d1d2532b340b692880261c15aee4dc94dd22ca5d61b9db9a8a361953d36410b1"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0d2454f0aef65ea81037759be5ca9947539667eecebca092733b2eb43c965a81"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:44d8ffbb9c06e5a7f529f38f53eda23e50d1ed33c6c869e01481d3fafa6b8142"}, + {file = "yarl-1.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aaaea1e536f98754a6e5c56091baa1b6ce2f2700cc4a00b0d49eca8dea471074"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3777ce5536d17989c91696db1d459574e9a9bd37660ea7ee4d3344579bb6f129"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fc5fc1eeb029757349ad26bbc5880557389a03fa6ada41703db5e068881e5f2"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ea65804b5dc88dacd4a40279af0cdadcfe74b3e5b4c897aa0d81cf86927fee78"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa102d6d280a5455ad6a0f9e6d769989638718e938a6a0a2ff3f4a7ff8c62cc4"}, + {file = "yarl-1.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09efe4615ada057ba2d30df871d2f668af661e971dfeedf0c159927d48bbeff0"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:008d3e808d03ef28542372d01057fd09168419cdc8f848efe2804f894ae03e51"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6f5cb257bc2ec58f437da2b37a8cd48f666db96d47b8a3115c29f316313654ff"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:992f18e0ea248ee03b5a6e8b3b4738850ae7dbb172cc41c966462801cbf62cf7"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:0e9d124c191d5b881060a9e5060627694c3bdd1fe24c5eecc8d5d7d0eb6faabc"}, + {file = "yarl-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3986b6f41ad22988e53d5778f91855dc0399b043fc8946d4f2e68af22ee9ff10"}, + {file = "yarl-1.9.4-cp312-cp312-win32.whl", hash = "sha256:4b21516d181cd77ebd06ce160ef8cc2a5e9ad35fb1c5930882baff5ac865eee7"}, + {file = "yarl-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a9bd00dc3bc395a662900f33f74feb3e757429e545d831eef5bb280252631984"}, + {file = "yarl-1.9.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:63b20738b5aac74e239622d2fe30df4fca4942a86e31bf47a81a0e94c14df94f"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d7f7de27b8944f1fee2c26a88b4dabc2409d2fea7a9ed3df79b67277644e17"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c74018551e31269d56fab81a728f683667e7c28c04e807ba08f8c9e3bba32f14"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca06675212f94e7a610e85ca36948bb8fc023e458dd6c63ef71abfd482481aa5"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aef935237d60a51a62b86249839b51345f47564208c6ee615ed2a40878dccdd"}, + {file = "yarl-1.9.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b134fd795e2322b7684155b7855cc99409d10b2e408056db2b93b51a52accc7"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d25039a474c4c72a5ad4b52495056f843a7ff07b632c1b92ea9043a3d9950f6e"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f7d6b36dd2e029b6bcb8a13cf19664c7b8e19ab3a58e0fefbb5b8461447ed5ec"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:957b4774373cf6f709359e5c8c4a0af9f6d7875db657adb0feaf8d6cb3c3964c"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = 
"sha256:d7eeb6d22331e2fd42fce928a81c697c9ee2d51400bd1a28803965883e13cead"}, + {file = "yarl-1.9.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6a962e04b8f91f8c4e5917e518d17958e3bdee71fd1d8b88cdce74dd0ebbf434"}, + {file = "yarl-1.9.4-cp37-cp37m-win32.whl", hash = "sha256:f3bc6af6e2b8f92eced34ef6a96ffb248e863af20ef4fde9448cc8c9b858b749"}, + {file = "yarl-1.9.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4d7a90a92e528aadf4965d685c17dacff3df282db1121136c382dc0b6014d2"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ec61d826d80fc293ed46c9dd26995921e3a82146feacd952ef0757236fc137be"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8be9e837ea9113676e5754b43b940b50cce76d9ed7d2461df1af39a8ee674d9f"}, + {file = "yarl-1.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bef596fdaa8f26e3d66af846bbe77057237cb6e8efff8cd7cc8dff9a62278bbf"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d47552b6e52c3319fede1b60b3de120fe83bde9b7bddad11a69fb0af7db32f1"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84fc30f71689d7fc9168b92788abc977dc8cefa806909565fc2951d02f6b7d57"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4aa9741085f635934f3a2583e16fcf62ba835719a8b2b28fb2917bb0537c1dfa"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:206a55215e6d05dbc6c98ce598a59e6fbd0c493e2de4ea6cc2f4934d5a18d130"}, + {file = "yarl-1.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07574b007ee20e5c375a8fe4a0789fad26db905f9813be0f9fef5a68080de559"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5a2e2433eb9344a163aced6a5f6c9222c0786e5a9e9cac2c89f0b28433f56e23"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6ad6d10ed9b67a382b45f29ea028f92d25bc0bc1daf6c5b801b90b5aa70fb9ec"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:6fe79f998a4052d79e1c30eeb7d6c1c1056ad33300f682465e1b4e9b5a188b78"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a825ec844298c791fd28ed14ed1bffc56a98d15b8c58a20e0e08c1f5f2bea1be"}, + {file = "yarl-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8619d6915b3b0b34420cf9b2bb6d81ef59d984cb0fde7544e9ece32b4b3043c3"}, + {file = "yarl-1.9.4-cp38-cp38-win32.whl", hash = "sha256:686a0c2f85f83463272ddffd4deb5e591c98aac1897d65e92319f729c320eece"}, + {file = "yarl-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:a00862fb23195b6b8322f7d781b0dc1d82cb3bcac346d1e38689370cc1cc398b"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:604f31d97fa493083ea21bd9b92c419012531c4e17ea6da0f65cacdcf5d0bd27"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8a854227cf581330ffa2c4824d96e52ee621dd571078a252c25e3a3b3d94a1b1"}, + {file = "yarl-1.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ba6f52cbc7809cd8d74604cce9c14868306ae4aa0282016b641c661f981a6e91"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6327976c7c2f4ee6816eff196e25385ccc02cb81427952414a64811037bbc8b"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8397a3817d7dcdd14bb266283cd1d6fc7264a48c186b986f32e86d86d35fbac5"}, + {file = 
"yarl-1.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e0381b4ce23ff92f8170080c97678040fc5b08da85e9e292292aba67fdac6c34"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23d32a2594cb5d565d358a92e151315d1b2268bc10f4610d098f96b147370136"}, + {file = "yarl-1.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb2a5c08a4eaaba605340fdee8fc08e406c56617566d9643ad8bf6852778fc7"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26a1dc6285e03f3cc9e839a2da83bcbf31dcb0d004c72d0730e755b33466c30e"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:18580f672e44ce1238b82f7fb87d727c4a131f3a9d33a5e0e82b793362bf18b4"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:29e0f83f37610f173eb7e7b5562dd71467993495e568e708d99e9d1944f561ec"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:1f23e4fe1e8794f74b6027d7cf19dc25f8b63af1483d91d595d4a07eca1fb26c"}, + {file = "yarl-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:db8e58b9d79200c76956cefd14d5c90af54416ff5353c5bfd7cbe58818e26ef0"}, + {file = "yarl-1.9.4-cp39-cp39-win32.whl", hash = "sha256:c7224cab95645c7ab53791022ae77a4509472613e839dab722a72abe5a684575"}, + {file = "yarl-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:824d6c50492add5da9374875ce72db7a0733b29c2394890aef23d533106e2b15"}, + {file = "yarl-1.9.4-py3-none-any.whl", hash = "sha256:928cecb0ef9d5a7946eb6ff58417ad2fe9375762382f1bf5c55e61645f2c43ad"}, + {file = "yarl-1.9.4.tar.gz", hash = "sha256:566db86717cf8080b99b58b083b773a908ae40f06681e87e589a976faf8246bf"}, ] [package.dependencies] diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index eadb9abd43..64ef108e11 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -87,6 +87,10 @@ impl AuthError { pub fn too_many_connections() -> Self { AuthErrorImpl::TooManyConnections.into() } + + pub fn is_auth_failed(&self) -> bool { + matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) + } } impl> From for AuthError { diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 3b09e05bd2..923bd02560 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -9,7 +9,6 @@ use tokio_postgres::config::AuthKeys; use crate::auth::credentials::check_peer_addr_is_in_list; use crate::auth::validate_password_and_exchange; use crate::console::errors::GetAuthInfoError; -use crate::console::provider::AuthInfo; use crate::console::AuthSecret; use crate::proxy::connect_compute::handle_try_wake; use crate::proxy::retry::retry_after; @@ -187,24 +186,52 @@ async fn auth_quirks( }; info!("fetching user's authentication info"); - // TODO(anna): this will slow down both "hacks" below; we probably need a cache. - let AuthInfo { - secret, - allowed_ips, - } = api.get_auth_info(extra, &info).await?; + let allowed_ips = api.get_allowed_ips(extra, &info).await?; // check allowed list if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) { return Err(auth::AuthError::ip_address_not_allowed()); } - let secret = secret.unwrap_or_else(|| { + let cached_secret = api.get_role_secret(extra, &info).await?; + + let secret = cached_secret.clone().unwrap_or_else(|| { // If we don't have an authentication secret, we mock one to // prevent malicious probing (possible due to missing protocol steps). // This mocked secret will never lead to successful authentication. 
info!("authentication info not found, mocking it"); AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random())) }); + match authenticate_with_secret( + secret, + info, + client, + unauthenticated_password, + allow_cleartext, + config, + latency_timer, + ) + .await + { + Ok(keys) => Ok(keys), + Err(e) => { + if e.is_auth_failed() { + // The password could have been changed, so we invalidate the cache. + cached_secret.invalidate(); + } + Err(e) + } + } +} +async fn authenticate_with_secret( + secret: AuthSecret, + info: ComputeUserInfo, + client: &mut stream::PqStream>, + unauthenticated_password: Option>, + allow_cleartext: bool, + config: &'static AuthenticationConfig, + latency_timer: &mut LatencyTimer, +) -> auth::Result> { if let Some(password) = unauthenticated_password { let auth_outcome = validate_password_and_exchange(&password, secret)?; let keys = match auth_outcome { diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index ae4c42bcb1..5bc2d377a6 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -6,10 +6,12 @@ use proxy::config::HttpConfig; use proxy::console; use proxy::console::provider::AllowedIpsCache; use proxy::console::provider::NodeInfoCache; +use proxy::console::provider::RoleSecretCache; use proxy::http; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; +use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; use anyhow::bail; @@ -86,7 +88,7 @@ struct ProxyCliArgs { #[clap(long)] metric_collection_interval: Option, /// cache for `wake_compute` api method (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] @@ -94,12 +96,8 @@ struct ProxyCliArgs { /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, - /// timeout for http connections - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - sql_over_http_timeout: tokio::time::Duration, - /// Whether the SQL over http pool is opt-in - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - sql_over_http_pool_opt_in: bool, + #[clap(flatten)] + sql_over_http: SqlOverHttpArgs, /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, @@ -127,13 +125,46 @@ struct ProxyCliArgs { #[clap(flatten)] aimd_config: proxy::rate_limiter::AimdConfig, /// cache for `allowed_ips` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)] + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, + /// cache for `role_secret` (use `size=0` to disable) + #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] + role_secret_cache: String, /// disable ip check for http requests. If it is too time consuming, it could be turned off. 
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_ip_check_for_http: bool, } +#[derive(clap::Args, Clone, Copy, Debug)] +struct SqlOverHttpArgs { + /// timeout for http connection requests + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + sql_over_http_timeout: tokio::time::Duration, + + /// Whether the SQL over http pool is opt-in + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + sql_over_http_pool_opt_in: bool, + + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20)] + sql_over_http_pool_max_conns_per_endpoint: usize, + + /// How long pooled connections should remain idle for before closing + #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] + sql_over_http_idle_timeout: tokio::time::Duration, + + /// Duration each shard will wait on average before a GC sweep. + /// A longer time will cause sweeps to take longer but will interfere less frequently. + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + sql_over_http_pool_gc_epoch: tokio::time::Duration, + + /// How many shards should the global pool have. Must be a power of two. + /// More shards will introduce less contention for pool operations, but can + /// increase memory used by the pool + #[clap(long, default_value_t = 128)] + sql_over_http_pool_shards: usize, +} + #[tokio::main] async fn main() -> anyhow::Result<()> { let _logging_guard = proxy::logging::init().await?; @@ -266,9 +297,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { AuthBackend::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?; + let role_secret_cache_config: CacheOptions = args.role_secret_cache.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}"); + info!("Using RoleSecretCache (wake_compute) with options={role_secret_cache_config:?}"); let caches = Box::leak(Box::new(console::caches::ApiCaches { node_info: NodeInfoCache::new( "node_info_cache", @@ -282,6 +315,12 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { allowed_ips_cache_config.ttl, false, ), + role_secret: RoleSecretCache::new( + "role_secret_cache", + role_secret_cache_config.size, + role_secret_cache_config.ttl, + false, + ), })); let config::WakeComputeLockOptions { @@ -315,8 +354,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } }; let http_config = HttpConfig { - timeout: args.sql_over_http_timeout, - pool_opt_in: args.sql_over_http_pool_opt_in, + request_timeout: args.sql_over_http.sql_over_http_timeout, + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, + gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, + pool_shards: args.sql_over_http.sql_over_http_pool_shards, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + }, }; let authentication_config = AuthenticationConfig { scram_protocol_timeout: args.scram_protocol_timeout, diff --git a/proxy/src/config.rs
b/proxy/src/config.rs index f932df4058..610bf7e424 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,4 +1,4 @@ -use crate::{auth, rate_limiter::RateBucketInfo}; +use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; use anyhow::{bail, ensure, Context, Ok}; use rustls::{sign, Certificate, PrivateKey}; use sha2::{Digest, Sha256}; @@ -36,8 +36,8 @@ pub struct TlsConfig { } pub struct HttpConfig { - pub timeout: tokio::time::Duration, - pub pool_opt_in: bool, + pub request_timeout: tokio::time::Duration, + pub pool_options: GlobalConnPoolOptions, } pub struct AuthenticationConfig { @@ -310,10 +310,10 @@ pub struct CacheOptions { impl CacheOptions { /// Default options for [`crate::console::provider::NodeInfoCache`]. - pub const DEFAULT_OPTIONS_NODE_INFO: &'static str = "size=4000,ttl=4m"; + pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=4000,ttl=4m"; /// Parse cache options passed via cmdline. - /// Example: [`Self::DEFAULT_OPTIONS_NODE_INFO`]. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. fn parse(options: &str) -> anyhow::Result { let mut size = None; let mut ttl = None; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 8d399f26ea..e4cf1e8c8e 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -10,6 +10,7 @@ use crate::{ }; use async_trait::async_trait; use dashmap::DashMap; +use smol_str::SmolStr; use std::{sync::Arc, time::Duration}; use tokio::{ sync::{OwnedSemaphorePermit, Semaphore}, @@ -216,6 +217,7 @@ impl ConsoleReqExtra { } /// Auth secret which is managed by the cloud. +#[derive(Clone)] pub enum AuthSecret { #[cfg(feature = "testing")] /// Md5 hash of user's password. @@ -250,18 +252,20 @@ pub struct NodeInfo { pub type NodeInfoCache = TimedLru, NodeInfo>; pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>; -pub type AllowedIpsCache = TimedLru, Arc>>; +pub type AllowedIpsCache = TimedLru>>; +pub type RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option>; +pub type CachedRoleSecret = timed_lru::Cached<&'static RoleSecretCache>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. #[async_trait] pub trait Api { /// Get the client's auth secret for authentication. - async fn get_auth_info( + async fn get_role_secret( &self, extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result; + ) -> Result; async fn get_allowed_ips( &self, @@ -282,7 +286,9 @@ pub struct ApiCaches { /// Cache for the `wake_compute` API method. pub node_info: NodeInfoCache, /// Cache for the `get_allowed_ips`. TODO(anna): use notifications listener instead. - pub allowed_ips: TimedLru, Arc>>, + pub allowed_ips: AllowedIpsCache, + /// Cache for the `get_role_secret`. TODO(anna): use notifications listener instead. + pub role_secret: RoleSecretCache, } /// Various caches for [`console`](super). 
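Note on the proxy/src/console/provider.rs hunk above: the single `get_auth_info` call is split into `get_role_secret` and `get_allowed_ips`, with role secrets cached in a `TimedLru` keyed by `(endpoint, role)` and invalidated when authentication fails (see `cached_secret.invalidate()` in auth/backend.rs). A minimal standalone sketch of that caching pattern, using a plain `HashMap` with a TTL rather than the actual `TimedLru` type; the endpoint and role names below are purely hypothetical:

```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

// Simplified stand-in for the proxy's AuthSecret (the real type wraps a SCRAM server secret).
#[derive(Clone)]
struct Secret(String);

// Hypothetical TTL cache keyed by (endpoint, role), loosely mirroring
// `RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option<AuthSecret>>` from the hunk above.
struct RoleSecretCache {
    ttl: Duration,
    entries: HashMap<(String, String), (Instant, Option<Secret>)>,
}

impl RoleSecretCache {
    fn new(ttl: Duration) -> Self {
        Self { ttl, entries: HashMap::new() }
    }

    // A hit returns the cached value; the inner None models a cached "role has no secret".
    fn get(&self, key: &(String, String)) -> Option<Option<Secret>> {
        match self.entries.get(key) {
            Some((inserted, secret)) if inserted.elapsed() < self.ttl => Some(secret.clone()),
            _ => None,
        }
    }

    fn insert(&mut self, key: (String, String), secret: Option<Secret>) {
        self.entries.insert(key, (Instant::now(), secret));
    }

    // Counterpart of `cached_secret.invalidate()` in auth_quirks: drop the entry
    // when authentication fails, e.g. because the password was changed.
    fn invalidate(&mut self, key: &(String, String)) {
        self.entries.remove(key);
    }
}

fn main() {
    // ttl mirrors the "size=4000,ttl=4m" default cache options (size bounding omitted here).
    let mut cache = RoleSecretCache::new(Duration::from_secs(4 * 60));
    // Hypothetical endpoint and role, for illustration only.
    let key = ("ep-example-123456".to_string(), "app_user".to_string());
    cache.insert(key.clone(), Some(Secret("SCRAM-SHA-256 server secret".into())));
    assert!(cache.get(&key).is_some()); // served from cache until the TTL expires
    cache.invalidate(&key); // e.g. after an AuthFailed error
    assert!(cache.get(&key).is_none()); // next lookup falls through to the console API
}
```

The real cache additionally bounds its size and hands out a `Cached` wrapper whose `invalidate()` drops the entry, which is what lets a changed password fall through to the console on the next attempt.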
diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index c464b4daf2..dba5e5863f 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -6,6 +6,7 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, }; +use crate::console::provider::CachedRoleSecret; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use async_trait::async_trait; use futures::TryFutureExt; @@ -142,12 +143,14 @@ async fn get_execute_postgres_query( #[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] - async fn get_auth_info( + async fn get_role_secret( &self, _extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result { - self.do_get_auth_info(creds).await + ) -> Result { + Ok(CachedRoleSecret::new_uncached( + self.do_get_auth_info(creds).await?.secret, + )) } async fn get_allowed_ips( diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index f748c9a41f..5bf7b0f986 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -3,14 +3,15 @@ use super::{ super::messages::{ConsoleError, GetRoleSecret, WakeCompute}, errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, CachedRoleSecret, ConsoleReqExtra, + NodeInfo, }; use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}; use crate::{auth::backend::ComputeUserInfo, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; use itertools::Itertools; -use std::{net::SocketAddr, sync::Arc}; +use std::sync::Arc; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; @@ -140,7 +141,7 @@ impl Api { // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). let mut config = compute::ConnCfg::new(); - config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. 
let node = NodeInfo { config, @@ -159,12 +160,25 @@ impl Api { #[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] - async fn get_auth_info( + async fn get_role_secret( &self, extra: &ConsoleReqExtra, creds: &ComputeUserInfo, - ) -> Result { - self.do_get_auth_info(extra, creds).await + ) -> Result { + let ep = creds.endpoint.clone(); + let user = creds.inner.user.clone(); + if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) { + return Ok(role_secret); + } + let auth_info = self.do_get_auth_info(extra, creds).await?; + let (_, secret) = self + .caches + .role_secret + .insert((ep.clone(), user), auth_info.secret.clone()); + self.caches + .allowed_ips + .insert(ep, Arc::new(auth_info.allowed_ips)); + Ok(secret) } async fn get_allowed_ips( @@ -172,8 +186,7 @@ impl super::Api for Api { extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result>, GetAuthInfoError> { - let key: &str = &creds.endpoint; - if let Some(allowed_ips) = self.caches.allowed_ips.get(key) { + if let Some(allowed_ips) = self.caches.allowed_ips.get(&creds.endpoint) { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["hit"]) .inc(); @@ -182,10 +195,14 @@ impl super::Api for Api { ALLOWED_IPS_BY_CACHE_OUTCOME .with_label_values(&["miss"]) .inc(); - let allowed_ips = Arc::new(self.do_get_auth_info(extra, creds).await?.allowed_ips); + let auth_info = self.do_get_auth_info(extra, creds).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let ep = creds.endpoint.clone(); + let user = creds.inner.user.clone(); self.caches - .allowed_ips - .insert(key.into(), allowed_ips.clone()); + .role_secret + .insert((ep.clone(), user), auth_info.secret); + self.caches.allowed_ips.insert(ep, allowed_ips.clone()); Ok(allowed_ips) } @@ -252,9 +269,10 @@ async fn parse_body serde::Deserialize<'a>>( Err(ApiError::Console { status, text }) } -fn parse_host_port(input: &str) -> Option<(String, u16)> { - let parsed: SocketAddr = input.parse().ok()?; - Some((parsed.ip().to_string(), parsed.port())) +fn parse_host_port(input: &str) -> Option<(&str, u16)> { + let (host, port) = input.rsplit_once(':')?; + let ipv6_brackets: &[_] = &['[', ']']; + Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) } #[cfg(test)] @@ -262,9 +280,24 @@ mod tests { use super::*; #[test] - fn test_parse_host_port() { + fn test_parse_host_port_v4() { let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); assert_eq!(host, "127.0.0.1"); assert_eq!(port, 5432); } + + #[test] + fn test_parse_host_port_v6() { + let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); + assert_eq!(host, "2001:db8::1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_url() { + let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") + .expect("failed to parse"); + assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); + assert_eq!(port, 5432); + } } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index e9c65fcef3..bd93fb2b70 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -6,7 +6,7 @@ pub const SCRAM_KEY_LEN: usize = 32; /// One of the keys derived from the [password](super::password::SaltedPassword). /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. 
-#[derive(Default, PartialEq, Eq)] +#[derive(Clone, Default, PartialEq, Eq)] #[repr(transparent)] pub struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 424beccec9..9e74e07af1 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -5,6 +5,7 @@ use super::key::ScramKey; /// Server secret is produced from [password](super::password::SaltedPassword) /// and is used throughout the authentication process. +#[derive(Clone)] pub struct ServerSecret { /// Number of iterations for `PBKDF2` function. pub iterations: u32, diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index e358a0712f..07825da8dc 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -6,9 +6,13 @@ mod conn_pool; mod sql_over_http; mod websocket; +pub use conn_pool::GlobalConnPoolOptions; + use anyhow::bail; use hyper::StatusCode; use metrics::IntCounterPairGuard; +use rand::rngs::StdRng; +use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio_util::task::TaskTracker; @@ -47,6 +51,11 @@ pub async fn task_main( let conn_pool = conn_pool::GlobalConnPool::new(config); + let conn_pool2 = Arc::clone(&conn_pool); + tokio::spawn(async move { + conn_pool2.gc_worker(StdRng::from_entropy()).await; + }); + // shutdown the connection pool tokio::spawn({ let cancellation_token = cancellation_token.clone(); diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index ab8903418b..c476560215 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,15 +1,19 @@ use anyhow::{anyhow, Context}; use async_trait::async_trait; use dashmap::DashMap; -use futures::future::poll_fn; +use futures::{future::poll_fn, Future}; +use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard}; +use once_cell::sync::Lazy; use parking_lot::RwLock; use pbkdf2::{ password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString}, Params, Pbkdf2, }; use pq_proto::StartupMessageParams; +use prometheus::{exponential_buckets, register_histogram, Histogram}; +use rand::Rng; use smol_str::SmolStr; -use std::{collections::HashMap, net::IpAddr, sync::Arc}; +use std::{collections::HashMap, net::IpAddr, pin::pin, sync::Arc, sync::Weak, time::Duration}; use std::{ fmt, task::{ready, Poll}, @@ -18,7 +22,7 @@ use std::{ ops::Deref, sync::atomic::{self, AtomicUsize}, }; -use tokio::time; +use tokio::time::{self, Instant}; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; use crate::{ @@ -30,11 +34,10 @@ use crate::{ }; use crate::{compute, config}; -use tracing::{error, warn, Span}; +use tracing::{debug, error, warn, Span}; use tracing::{info, info_span, Instrument}; pub const APP_NAME: &str = "/sql_over_http"; -const MAX_CONNS_PER_ENDPOINT: usize = 20; #[derive(Debug, Clone)] pub struct ConnInfo { @@ -69,6 +72,77 @@ struct ConnPoolEntry { pub struct EndpointConnPool { pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>, total_conns: usize, + max_conns: usize, + _guard: IntCounterPairGuard, +} + +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option { + let Self { + pools, total_conns, .. 
+ } = self; + pools + .get_mut(&db_user) + .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns)) + } + + fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool { + let Self { + pools, total_conns, .. + } = self; + if let Some(pool) = pools.get_mut(&db_user) { + let old_len = pool.conns.len(); + pool.conns.retain(|conn| conn.conn.conn_id != conn_id); + let new_len = pool.conns.len(); + let removed = old_len - new_len; + *total_conns -= removed; + removed > 0 + } else { + false + } + } + + fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { + let conn_id = client.conn_id; + + if client.inner.is_closed() { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); + return Ok(()); + } + + // return connection to the pool + let mut returned = false; + let mut per_db_size = 0; + let total_conns = { + let mut pool = pool.write(); + + if pool.total_conns < pool.max_conns { + // we create this db-user entry in get, so it should not be None + if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); + + returned = true; + per_db_size = pool_entries.conns.len(); + + pool.total_conns += 1; + } + } + + pool.total_conns + }; + + // do logging outside of the mutex + if returned { + info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + } else { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); + } + + Ok(()) + } } /// 4096 is the number of rounds that SCRAM-SHA-256 recommends. @@ -87,6 +161,27 @@ pub struct DbUserConnPool { password_hash: Option, } +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) { + let old_len = self.conns.len(); + + self.conns.retain(|conn| !conn.conn.inner.is_closed()); + + let new_len = self.conns.len(); + let removed = old_len - new_len; + *conns -= removed; + } + + fn get_conn_entry(&mut self, conns: &mut usize) -> Option { + self.clear_closed_clients(conns); + let conn = self.conns.pop(); + if conn.is_some() { + *conns -= 1; + } + conn + } +} + pub struct GlobalConnPool { // endpoint -> per-endpoint connection pool // @@ -94,52 +189,127 @@ pub struct GlobalConnPool { // pool as early as possible and release the lock. global_pool: DashMap>>, + /// Number of endpoint-connection pools + /// /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. /// That seems like far too much effort, so we're using a relaxed increment counter instead. /// It's only used for diagnostics. global_pool_size: AtomicUsize, + proxy_config: &'static crate::config::ProxyConfig, +} + +#[derive(Debug, Clone, Copy)] +pub struct GlobalConnPoolOptions { // Maximum number of connections per one endpoint. // Can mix different (dbname, username) connections. // When running out of free slots for a particular endpoint, // falls back to opening a new connection for each request. - max_conns_per_endpoint: usize, + pub max_conns_per_endpoint: usize, - proxy_config: &'static crate::config::ProxyConfig, + pub gc_epoch: Duration, - // Using a lock to remove any race conditions. 
- // Eg cleaning up connections while a new connection is returned - closed: RwLock, + pub pool_shards: usize, + + pub idle_timeout: Duration, + + pub opt_in: bool, } +pub static GC_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_http_pool_reclaimation_lag_seconds", + "Time it takes to reclaim unused connection pools", + // 1us -> 65ms + exponential_buckets(1e-6, 2.0, 16).unwrap(), + ) + .unwrap() +}); + +pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { + register_int_counter_pair!( + "proxy_http_pool_endpoints_registered_total", + "Number of endpoints we have registered pools for", + "proxy_http_pool_endpoints_unregistered_total", + "Number of endpoints we have unregistered pools for", + ) + .unwrap() +}); + impl GlobalConnPool { pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { + let shards = config.http_config.pool_options.pool_shards; Arc::new(Self { - global_pool: DashMap::new(), + global_pool: DashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), - max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT, proxy_config: config, - closed: RwLock::new(false), }) } pub fn shutdown(&self) { - *self.closed.write() = true; + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } - self.global_pool.retain(|_, endpoint_pool| { - let mut pool = endpoint_pool.write(); - // by clearing this hashmap, we remove the slots that a connection can be returned to. - // when returning, it drops the connection if the slot doesn't exist - pool.pools.clear(); - pool.total_conns = 0; + pub async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.proxy_config.http_config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; - false + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = GC_LATENCY.start_timer(); + let current_len = shard.len(); + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. + if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { + pools, total_conns, .. + } = pool.get_mut(); + + // ensure that closed clients are removed + pools + .iter_mut() + .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns)); + + // we only remove this pool if it has no active connections + if *total_conns == 0 { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true }); + let new_len = shard.len(); + drop(shard); + timer.observe_duration(); + + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. 
size now {global_pool_size}"); + } } pub async fn get( self: &Arc, - conn_info: &ConnInfo, + conn_info: ConnInfo, force_new: bool, session_id: uuid::Uuid, peer_addr: IpAddr, @@ -147,15 +317,11 @@ impl GlobalConnPool { let mut client: Option = None; let mut latency_timer = LatencyTimer::new("http"); - let pool = if force_new { - None - } else { - Some((conn_info.clone(), self.clone())) - }; - let mut hash_valid = false; + let mut endpoint_pool = Weak::new(); if !force_new { let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); + endpoint_pool = Arc::downgrade(&pool); let mut hash = None; // find a pool entry by (dbname, username) if exists @@ -180,12 +346,8 @@ impl GlobalConnPool { // we will continue with the regular connection flow if validate.is_ok() { hash_valid = true; - let mut pool = pool.write(); - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - if let Some(entry) = pool_entries.conns.pop() { - client = Some(entry.conn); - pool.total_conns -= 1; - } + if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) { + client = Some(entry.conn) } } } @@ -198,11 +360,12 @@ impl GlobalConnPool { info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one"); connect_to_compute( self.proxy_config, - conn_info, + &conn_info, conn_id, session_id, latency_timer, peer_addr, + endpoint_pool.clone(), ) .await } else { @@ -214,18 +377,19 @@ impl GlobalConnPool { ); latency_timer.pool_hit(); latency_timer.success(); - return Ok(Client::new(client, pool).await); + return Ok(Client::new(client, conn_info, endpoint_pool).await); } } else { let conn_id = uuid::Uuid::new_v4(); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); connect_to_compute( self.proxy_config, - conn_info, + &conn_info, conn_id, session_id, latency_timer, peer_addr, + endpoint_pool.clone(), ) .await }; @@ -269,59 +433,7 @@ impl GlobalConnPool { _ => {} } let new_client = new_client?; - Ok(Client::new(new_client, pool).await) - } - - fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { - let conn_id = client.conn_id; - - // We want to hold this open while we return. This ensures that the pool can't close - // while we are in the middle of returning the connection. 
- let closed = self.closed.read(); - if *closed { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed"); - return Ok(()); - } - - if client.inner.is_closed() { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return Ok(()); - } - - let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); - - // return connection to the pool - let mut returned = false; - let mut per_db_size = 0; - let total_conns = { - let mut pool = pool.write(); - - if pool.total_conns < self.max_conns_per_endpoint { - // we create this db-user entry in get, so it should not be None - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); - - returned = true; - per_db_size = pool_entries.conns.len(); - - pool.total_conns += 1; - } - } - - pool.total_conns - }; - - // do logging outside of the mutex - if returned { - info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); - } else { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); - } - - Ok(()) + Ok(Client::new(new_client, conn_info, endpoint_pool).await) } fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc> { @@ -334,6 +446,12 @@ impl GlobalConnPool { let new_pool = Arc::new(RwLock::new(EndpointConnPool { pools: HashMap::new(), total_conns: 0, + max_conns: self + .proxy_config + .http_config + .pool_options + .max_conns_per_endpoint, + _guard: ENDPOINT_POOLS.guard(), })); // find or create a pool for this endpoint @@ -363,9 +481,11 @@ impl GlobalConnPool { } struct TokioMechanism<'a> { + pool: Weak>, conn_info: &'a ConnInfo, session_id: uuid::Uuid, conn_id: uuid::Uuid, + idle: Duration, } #[async_trait] @@ -385,6 +505,8 @@ impl ConnectMechanism for TokioMechanism<'_> { timeout, self.conn_id, self.session_id, + self.pool.clone(), + self.idle, ) .await } @@ -403,6 +525,7 @@ async fn connect_to_compute( session_id: uuid::Uuid, latency_timer: LatencyTimer, peer_addr: IpAddr, + pool: Weak>, ) -> anyhow::Result { let tls = config.tls_config.as_ref(); let common_names = tls.and_then(|tls| tls.common_names.clone()); @@ -431,7 +554,6 @@ async fn connect_to_compute( application_name: APP_NAME.to_string(), options: console_options, }; - // TODO(anna): this is a bit hacky way, consider using console notification listener. 
if !config.disable_ip_check_for_http { let allowed_ips = backend.get_allowed_ips(&extra).await?; if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) { @@ -448,6 +570,8 @@ async fn connect_to_compute( conn_id, conn_info, session_id, + pool, + idle: config.http_config.pool_options.idle_timeout, }, node_info, &extra, @@ -463,6 +587,8 @@ async fn connect_to_compute_once( timeout: time::Duration, conn_id: uuid::Uuid, mut session: uuid::Uuid, + pool: Weak>, + idle: Duration, ) -> Result { let mut config = (*node_info.config).clone(); @@ -491,13 +617,29 @@ async fn connect_to_compute_once( branch_id: node_info.aux.branch_id.clone(), }; + let db_user = conn_info.db_and_user(); tokio::spawn( async move { let _conn_gauge = conn_gauge; + let mut idle_timeout = pin!(tokio::time::sleep(idle)); poll_fn(move |cx| { if matches!(rx.has_changed(), Ok(true)) { session = *rx.borrow_and_update(); info!(%session, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + + // 5 minute idle connection timeout + if idle_timeout.as_mut().poll(cx).is_ready() { + idle_timeout.as_mut().reset(Instant::now() + idle); + info!("connection idle"); + if let Some(pool) = pool.clone().upgrade() { + // remove client from pool - should close the connection if it's idle. + // does nothing if the client is currently checked-out and in-use + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("idle connection removed"); + } + } } loop { @@ -515,15 +657,25 @@ async fn connect_to_compute_once( } Some(Err(e)) => { error!(%session, "connection error: {}", e); - return Poll::Ready(()) + break } None => { info!("connection closed"); - return Poll::Ready(()) + break } } } - }).await + + // remove from connection pool + if let Some(pool) = pool.clone().upgrade() { + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("closed connection removed"); + } + } + + Poll::Ready(()) + }).await; + } .instrument(span) ); @@ -553,23 +705,27 @@ pub struct Client { conn_id: uuid::Uuid, span: Span, inner: Option, - pool: Option<(ConnInfo, Arc)>, + conn_info: ConnInfo, + pool: Weak>, } pub struct Discard<'a> { conn_id: uuid::Uuid, - pool: &'a mut Option<(ConnInfo, Arc)>, + conn_info: &'a ConnInfo, + pool: &'a mut Weak>, } impl Client { pub(self) async fn new( inner: ClientInner, - pool: Option<(ConnInfo, Arc)>, + conn_info: ConnInfo, + pool: Weak>, ) -> Self { Self { conn_id: inner.conn_id, inner: Some(inner), span: Span::current(), + conn_info, pool, } } @@ -578,6 +734,7 @@ impl Client { inner, pool, conn_id, + conn_info, span: _, } = self; ( @@ -587,6 +744,7 @@ impl Client { .inner, Discard { pool, + conn_info, conn_id: *conn_id, }, ) @@ -602,14 +760,14 @@ impl Client { impl Discard<'_> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { - if status != ReadyForQueryStatus::Idle { - if let Some((conn_info, _)) = self.pool.take() { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") - } + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") } } pub fn discard(&mut self) { - if let Some((conn_info, _)) = self.pool.take() { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") } } @@ -629,16 
+787,17 @@ impl Deref for Client { impl Drop for Client { fn drop(&mut self) { + let conn_info = self.conn_info.clone(); let client = self .inner .take() .expect("client inner should not be removed"); - if let Some((conn_info, conn_pool)) = self.pool.take() { + if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { let current_span = self.span.clone(); // return connection to the pool tokio::task::spawn_blocking(move || { let _span = current_span.enter(); - let _ = conn_pool.put(&conn_info, client); + let _ = EndpointConnPool::put(&conn_pool, &conn_info, client); }); } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 307b085ce0..2e9d8526d3 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -206,7 +206,7 @@ pub async fn handle( config: &'static HttpConfig, ) -> Result, ApiError> { let result = tokio::time::timeout( - config.timeout, + config.request_timeout, handle_inner( config, request, @@ -278,7 +278,7 @@ pub async fn handle( Err(_) => { let message = format!( "HTTP-Connection timed out, execution time exeeded {} seconds", - config.timeout.as_secs() + config.request_timeout.as_secs() ); error!(message); json_response( @@ -320,7 +320,8 @@ async fn handle_inner( // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in - let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + let allow_pool = + !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); // isolation level, read only and deferrable @@ -359,7 +360,7 @@ async fn handle_inner( let payload: Payload = serde_json::from_slice(&body)?; let mut client = conn_pool - .get(&conn_info, !allow_pool, session_id, peer_addr) + .get(conn_info, !allow_pool, session_id, peer_addr) .await?; let mut response = Response::builder() diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 2acbb2352b..7b9f96dce3 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -1,9 +1,12 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use anyhow::Context; use aws_sdk_s3::{types::ObjectIdentifier, Client}; +use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; use utils::generation::Generation; +use utils::id::TimelineId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; @@ -40,7 +43,7 @@ impl TimelineAnalysis { pub(crate) fn branch_cleanup_and_check_errors( id: &TenantShardTimelineId, - s3_root: &RootTarget, + tenant_objects: &mut TenantObjectListing, s3_active_branch: Option<&BranchData>, console_branch: Option, s3_data: Option, @@ -72,8 +75,8 @@ pub(crate) fn branch_cleanup_and_check_errors( match s3_data.blob_data { BlobDataParseResult::Parsed { index_part, - index_part_generation, - mut s3_layers, + index_part_generation: _index_part_generation, + s3_layers: _s3_layers, } => { if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) { result.errors.push(format!( @@ -111,65 +114,19 @@ pub(crate) fn branch_cleanup_and_check_errors( )) } - let layer_map_key = (layer, metadata.generation); - if !s3_layers.remove(&layer_map_key) { + if !tenant_objects.check_ref(id.timeline_id, &layer, &metadata) { // FIXME: this will emit false positives if an index was // uploaded concurrently with our scan. 
To make this check // correct, we need to try sending a HEAD request for the // layer we think is missing. result.errors.push(format!( - "index_part.json contains a layer {}{} that is not present in remote storage", - layer_map_key.0.file_name(), - layer_map_key.1.get_suffix() + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", + layer.file_name(), + metadata.generation.get_suffix(), + metadata.shard )) } } - - let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers - .into_iter() - .filter(|(_layer_name, gen)| - // A layer is only considered orphaned if it has a generation below - // the index. If the generation is >= the index, then the layer may - // be an upload from a running pageserver, or even an upload from - // a new generation that didn't upload an index yet. - // - // Even so, a layer that is not referenced by the index could just - // be something enqueued for deletion, so while this check is valid - // for indicating that a layer is garbage, it is not an indicator - // of a problem. - gen < &index_part_generation) - .collect(); - - if !orphan_layers.is_empty() { - // An orphan layer is not an error: it's arguably not even a warning, but it is helpful to report - // these as a hint that there is something worth cleaning up here. - result.warnings.push(format!( - "index_part.json does not contain layers from S3: {:?}", - orphan_layers - .iter() - .map(|(layer_name, gen)| format!( - "{}{}", - layer_name.file_name(), - gen.get_suffix() - )) - .collect::>(), - )); - result.garbage_keys.extend(orphan_layers.iter().map( - |(layer_name, layer_gen)| { - let mut key = s3_root.timeline_root(id).prefix_in_bucket; - let delimiter = s3_root.delimiter(); - if !key.ends_with(delimiter) { - key.push_str(delimiter); - } - key.push_str(&format!( - "{}{}", - &layer_name.file_name(), - layer_gen.get_suffix() - )); - key - }, - )); - } } BlobDataParseResult::Relic => {} BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( @@ -204,6 +161,83 @@ pub(crate) fn branch_cleanup_and_check_errors( result } +#[derive(Default)] +pub(crate) struct LayerRef { + ref_count: usize, +} + +/// Top-level index of objects in a tenant. This may be used by any shard-timeline within +/// the tenant to query whether an object exists. +#[derive(Default)] +pub(crate) struct TenantObjectListing { + shard_timelines: + HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>, +} + +impl TenantObjectListing { + /// Having done an S3 listing of the keys within a timeline prefix, merge them into the overall + /// list of layer keys for the Tenant. + pub(crate) fn push( + &mut self, + ttid: TenantShardTimelineId, + layers: HashSet<(LayerFileName, Generation)>, + ) { + let shard_index = ShardIndex::new( + ttid.tenant_shard_id.shard_number, + ttid.tenant_shard_id.shard_count, + ); + let replaced = self.shard_timelines.insert( + (shard_index, ttid.timeline_id), + layers + .into_iter() + .map(|l| (l, LayerRef::default())) + .collect(), + ); + + assert!( + replaced.is_none(), + "Built from an S3 object listing, which should never repeat a key" + ); + } + + /// Having loaded a timeline index, check if a layer referenced by the index exists. If it does, + /// the layer's refcount will be incremented. Later, after calling this for all references in all indices + /// in a tenant, orphan layers may be detected by their zero refcounts. 
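The refcounting scheme described here — push the S3 listing for each shard/timeline, bump a counter for every layer an index references, and report zero-count layers as orphan candidates afterwards — can be sketched stand-alone like this (simplified key types only; not the scrubber's real `ShardIndex`/`LayerFileName`/`Generation` types):

```rust
use std::collections::{HashMap, HashSet};

// Illustrative stand-ins for (ShardIndex, TimelineId) and (LayerFileName, Generation).
type ShardTimeline = (u8, u64);
type LayerKey = (String, u32);

#[derive(Default)]
struct ObjectListing {
    // shard/timeline -> layer key -> number of indices that reference it
    shard_timelines: HashMap<ShardTimeline, HashMap<LayerKey, usize>>,
}

impl ObjectListing {
    /// Record the layers found by listing one timeline prefix, all starting at refcount 0.
    fn push(&mut self, ttid: ShardTimeline, layers: HashSet<LayerKey>) {
        let entry = layers.into_iter().map(|l| (l, 0)).collect();
        assert!(self.shard_timelines.insert(ttid, entry).is_none());
    }

    /// Called for every layer referenced by an index. Returns true if the listing has it.
    fn check_ref(&mut self, ttid: ShardTimeline, layer: &LayerKey) -> bool {
        match self.shard_timelines.get_mut(&ttid).and_then(|m| m.get_mut(layer)) {
            Some(refcount) => {
                *refcount += 1;
                true
            }
            None => false,
        }
    }

    /// After all indices were applied, anything still at refcount 0 is an orphan candidate.
    fn orphans(&self) -> Vec<(ShardTimeline, LayerKey)> {
        let mut result = Vec::new();
        for (ttid, layers) in &self.shard_timelines {
            for (layer, refcount) in layers {
                if *refcount == 0 {
                    result.push((*ttid, layer.clone()));
                }
            }
        }
        result
    }
}

fn main() {
    let mut listing = ObjectListing::default();
    listing.push(
        (0, 1),
        HashSet::from([("a".to_string(), 1), ("b".to_string(), 1)]),
    );

    assert!(listing.check_ref((0, 1), &("a".to_string(), 1))); // referenced by an index
    assert!(!listing.check_ref((0, 1), &("missing".to_string(), 1))); // index points at a missing layer

    // "b" was never referenced, so it shows up as an orphan candidate.
    assert_eq!(listing.orphans(), vec![((0, 1), ("b".to_string(), 1))]);
}
```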
+ /// + /// Returns true if the layer exists + pub(crate) fn check_ref( + &mut self, + timeline_id: TimelineId, + layer_file: &LayerFileName, + metadata: &IndexLayerMetadata, + ) -> bool { + let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { + return false; + }; + + let Some(layer_ref) = shard_tl.get_mut(&(layer_file.clone(), metadata.generation)) else { + return false; + }; + + layer_ref.ref_count += 1; + + true + } + + pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> { + let mut result = Vec::new(); + for ((shard_index, timeline_id), layers) in &self.shard_timelines { + for ((layer_file, generation), layer_ref) in layers { + if layer_ref.ref_count == 0 { + result.push((*shard_index, *timeline_id, layer_file.clone(), *generation)) + } + } + } + + result + } +} + #[derive(Debug)] pub(crate) struct S3TimelineBlobData { pub(crate) blob_data: BlobDataParseResult, diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index d2338c21e5..8fb1346c8e 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -15,6 +15,7 @@ use anyhow::Context; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; use aws_config::sso::SsoCredentialsProvider; use aws_config::BehaviorVersion; use aws_sdk_s3::config::Region; @@ -255,6 +256,11 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie let chain = CredentialsProviderChain::first_try( "env", EnvironmentVariableCredentialsProvider::new(), + ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder().build(), ); // Use SSO if we were given an account ID @@ -265,7 +271,7 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie .account_id(sso_account) .role_name("PowerUserAccess") .start_url("https://neondb.awsapps.com/start") - .region(Region::from_static("eu-central-1")) + .region(bucket_region.clone()) .build(), ), None => chain, diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index 91347ca21b..bcc4d2e618 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -2,22 +2,25 @@ use std::collections::{HashMap, HashSet}; use crate::checks::{ branch_cleanup_and_check_errors, list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData, - TimelineAnalysis, + TenantObjectListing, TimelineAnalysis, }; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; use futures_util::{pin_mut, StreamExt, TryStreamExt}; use histogram::Histogram; +use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; use serde::Serialize; +use utils::id::TenantId; #[derive(Serialize)] pub struct MetadataSummary { count: usize, with_errors: HashSet, with_warnings: HashSet, - with_garbage: HashSet, + with_orphans: HashSet, indices_by_version: HashMap, layer_count: MinMaxHisto, @@ -87,7 +90,7 @@ impl MetadataSummary { count: 0, with_errors: HashSet::new(), with_warnings: HashSet::new(), - with_garbage: HashSet::new(), + with_orphans: HashSet::new(), indices_by_version: HashMap::new(), layer_count: MinMaxHisto::new(), 
timeline_size_bytes: MinMaxHisto::new(), @@ -141,6 +144,10 @@ impl MetadataSummary { } } + fn notify_timeline_orphan(&mut self, ttid: &TenantShardTimelineId) { + self.with_orphans.insert(*ttid); + } + /// Long-form output for printing at end of a scan pub fn summary_string(&self) -> String { let version_summary: String = itertools::join( @@ -154,7 +161,7 @@ impl MetadataSummary { "Timelines: {0} With errors: {1} With warnings: {2} -With garbage: {3} +With orphan layers: {3} Index versions: {version_summary} Timeline size bytes: {4} Layer size bytes: {5} @@ -163,7 +170,7 @@ Timeline layer count: {6} self.count, self.with_errors.len(), self.with_warnings.len(), - self.with_garbage.len(), + self.with_orphans.len(), self.timeline_size_bytes.oneline(), self.layer_size_bytes.oneline(), self.layer_count.oneline(), @@ -191,7 +198,7 @@ pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result anyhow::ResultS3TimelineBlobData for each tenant, because different + // shards in the same tenant might refer to one anothers' keys if a shard split has happened. + + let mut tenant_id = None; + let mut tenant_objects = TenantObjectListing::default(); + let mut tenant_timeline_results = Vec::new(); + + fn analyze_tenant( + tenant_id: TenantId, + summary: &mut MetadataSummary, + mut tenant_objects: TenantObjectListing, + timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>, + ) { + let mut timeline_generations = HashMap::new(); + for (ttid, data) in timelines { + // Stash the generation of each timeline, for later use identifying orphan layers + if let BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation, + s3_layers: _s3_layers, + } = &data.blob_data + { + timeline_generations.insert(ttid, *index_part_generation); + } + + // Apply checks to this timeline shard's metadata, and in the process update `tenant_objects` + // reference counts for layers across the tenant. + let analysis = + branch_cleanup_and_check_errors(&ttid, &mut tenant_objects, None, None, Some(data)); + summary.update_analysis(&ttid, &analysis); + } + + // Identifying orphan layers must be done on a tenant-wide basis, because individual + // shards' layers may be referenced by other shards. + // + // Orphan layers are not a corruption, and not an indication of a problem. They are just + // consuming some space in remote storage, and may be cleaned up at leisure. + for (shard_index, timeline_id, layer_file, generation) in tenant_objects.get_orphans() { + let ttid = TenantShardTimelineId { + tenant_shard_id: TenantShardId { + tenant_id, + shard_count: shard_index.shard_count, + shard_number: shard_index.shard_number, + }, + timeline_id, + }; + + if let Some(timeline_generation) = timeline_generations.get(&ttid) { + if &generation >= timeline_generation { + // Candidate orphan layer is in the current or future generation relative + // to the index we read for this timeline shard, so its absence from the index + // doesn't make it an orphan: more likely, it is a case where the layer was + // uploaded, but the index referencing the layer wasn't written yet. + continue; + } + } + + let orphan_path = remote_layer_path( + &tenant_id, + &timeline_id, + shard_index, + &layer_file, + generation, + ); + + tracing::info!("Orphan layer detected: {orphan_path}"); + + summary.notify_timeline_orphan(&ttid); + } + } + + // Iterate through all the timeline results. These are in key-order, so + // all results for the same tenant will be adjacent. 
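The scan loop that follows relies on the listing being key-ordered: results for one tenant are buffered until the tenant id changes, then analyzed in one pass, with a final flush once the stream ends. A minimal sketch of that accumulate-and-flush-on-key-change pattern (plain strings and tuples stand in for the real tenant/timeline types):

```rust
// "Analyze" one tenant once all of its timelines have been collected.
fn analyze_tenant(tenant: &str, timelines: &[(String, u32)]) {
    println!("tenant {tenant}: {} timeline shard(s)", timelines.len());
}

fn scan(results: impl IntoIterator<Item = (String, (String, u32))>) {
    let mut current: Option<String> = None;
    let mut buffered: Vec<(String, u32)> = Vec::new();

    for (tenant, timeline) in results {
        // When the tenant id changes, flush everything accumulated so far.
        if let Some(prev) = &current {
            if *prev != tenant {
                analyze_tenant(prev, &buffered);
                buffered.clear();
            }
        }
        current = Some(tenant);
        buffered.push(timeline);
    }

    // Don't forget the last tenant in the stream.
    if let Some(tenant) = current {
        analyze_tenant(&tenant, &buffered);
    }
}

fn main() {
    // The listing is key-ordered, so all entries of a tenant are adjacent.
    scan([
        ("tenant-a".to_string(), ("tl-1".to_string(), 3)),
        ("tenant-a".to_string(), ("tl-2".to_string(), 5)),
        ("tenant-b".to_string(), ("tl-1".to_string(), 2)),
    ]);
}
```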
We accumulate these, + // and then call `analyze_tenant` to flush, when we see the next tenant ID. let mut summary = MetadataSummary::new(); pin_mut!(timelines); while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); - let analysis = branch_cleanup_and_check_errors(&ttid, &target, None, None, Some(data)); + match tenant_id { + None => tenant_id = Some(ttid.tenant_shard_id.tenant_id), + Some(prev_tenant_id) => { + if prev_tenant_id != ttid.tenant_shard_id.tenant_id { + let tenant_objects = std::mem::take(&mut tenant_objects); + let timelines = std::mem::take(&mut tenant_timeline_results); + analyze_tenant(prev_tenant_id, &mut summary, tenant_objects, timelines); + tenant_id = Some(ttid.tenant_shard_id.tenant_id); + } + } + } - summary.update_analysis(&ttid, &analysis); + if let BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation: _index_part_generation, + s3_layers, + } = &data.blob_data + { + tenant_objects.push(ttid, s3_layers.clone()); + } + tenant_timeline_results.push((ttid, data)); + } + + if !tenant_timeline_results.is_empty() { + analyze_tenant( + tenant_id.expect("Must be set if results are present"), + &mut summary, + tenant_objects, + tenant_timeline_results, + ); } Ok(summary) diff --git a/scripts/sk_collect_dumps/.gitignore b/scripts/sk_collect_dumps/.gitignore index d9d4d0296a..cdf99aefd7 100644 --- a/scripts/sk_collect_dumps/.gitignore +++ b/scripts/sk_collect_dumps/.gitignore @@ -1,2 +1,4 @@ result *.json +hosts +poetry.lock diff --git a/scripts/sk_collect_dumps/ansible.cfg b/scripts/sk_collect_dumps/ansible.cfg new file mode 100644 index 0000000000..150986ab79 --- /dev/null +++ b/scripts/sk_collect_dumps/ansible.cfg @@ -0,0 +1,11 @@ +[defaults] +host_key_checking = False +inventory=./hosts +remote_tmp=/tmp +remote_user=developer +callbacks_enabled = profile_tasks + +[ssh_connection] +scp_if_ssh = True +ssh_args = -F ./ssh.cfg +pipelining = True diff --git a/scripts/sk_collect_dumps/pyproject.toml b/scripts/sk_collect_dumps/pyproject.toml new file mode 100644 index 0000000000..c6f6adafe2 --- /dev/null +++ b/scripts/sk_collect_dumps/pyproject.toml @@ -0,0 +1,16 @@ +[tool.poetry] +name = "sk-collect-dumps" +version = "0.1.0" +description = "" +authors = ["Arseny Sher "] +readme = "README.md" +packages = [{include = "sk_collect_dumps"}] + +[tool.poetry.dependencies] +python = "^3.11" +ansible = "^9.1.0" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md index 52b73e9495..7494a6cb78 100644 --- a/scripts/sk_collect_dumps/readme.md +++ b/scripts/sk_collect_dumps/readme.md @@ -1,25 +1,43 @@ # Collect /v1/debug_dump from all safekeeper nodes -1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory. -2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database. - -## How to use ansible (staging) - +3. 
Issue admin token (add/remove .stage from url for staging/prod and setting proper API key): ``` -AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +# staging: +AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +# prod: +AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') +# check +echo $AUTH_TOKEN +``` +2. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory. -AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +There are two ways to do that, with ssm or tsh. ssm: +``` +# in aws repo, cd .github/ansible and run e.g. (adjusting profile and region in vars and limit): +AWS_DEFAULT_PROFILE=dev ansible-playbook -i inventory_aws_ec2.yaml -i staging.us-east-2.vars.yaml -e @ssm_config -l 'safekeeper:&us_east_2' -e "auth_token=${AUTH_TOKEN}" ~/neon/neon/scripts/sk_collect_dumps/remote.yaml +``` +It will put the results to .results directory *near the playbook*. + +tsh: + +Update the inventory, if needed, selecting .build/.tech and optionally region: +``` +rm -f hosts && echo '[safekeeper]' >> hosts +# staging: +tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.build" | grep us-east-2 >> hosts +# prod: +tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.tech" | grep us-east-2 >> hosts ``` -## How to use ansible (prod) - +Test ansible connection: ``` -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml - -AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +ansible all -m ping -v ``` +Download the dumps: +``` +mkdir -p result && rm -f result/* +ansible-playbook -e "auth_token=${AUTH_TOKEN}" remote.yaml +``` + +3. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database. 
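For reference, the "Dump file" task in the playbook boils down to a single authenticated HTTP GET per safekeeper. A rough stand-alone equivalent of that call (the port, path, and query string are taken from `remote.yaml`; `reqwest` is an assumption for illustration and not part of the repository's tooling):

```rust
// Cargo.toml (assumed): reqwest = { version = "0.11", features = ["blocking"] }
use std::{env, error::Error, fs};

fn main() -> Result<(), Box<dyn Error>> {
    // Same inputs the playbook uses: a safekeeper host and the admin-scoped JWT.
    let host = env::args().nth(1).expect("usage: dump <safekeeper-host>");
    let token = env::var("AUTH_TOKEN")?;

    let url = format!("http://{host}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false");
    let body = reqwest::blocking::Client::new()
        .get(url)
        .bearer_auth(token)
        .send()?
        .error_for_status()?
        .text()?;

    fs::create_dir_all("result")?;
    fs::write(format!("result/{host}-dump.json"), body)?;
    Ok(())
}
```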
diff --git a/scripts/sk_collect_dumps/remote.yaml b/scripts/sk_collect_dumps/remote.yaml index 29ce83efde..f214d0ae2c 100644 --- a/scripts/sk_collect_dumps/remote.yaml +++ b/scripts/sk_collect_dumps/remote.yaml @@ -1,18 +1,37 @@ - name: Fetch state dumps from safekeepers - hosts: safekeepers + hosts: safekeeper gather_facts: False - remote_user: "{{ remote_user }}" tasks: - - name: Download file + - name: Dump file get_url: url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false" - dest: "/tmp/{{ inventory_hostname }}.json" + dest: "/tmp/{{ inventory_hostname }}-dump.json" + headers: + Authorization: "Bearer {{ auth_token }}" - - name: Fetch file from remote hosts + - name: install rsync + ansible.builtin.apt: + name: rsync + update_cache: yes + become: yes + ignore_errors: true # it can be already installed and we don't always have sudo + + - name: Fetch file from remote hosts (works only with ssm) fetch: - src: "/tmp/{{ inventory_hostname }}.json" - dest: "./result/{{ inventory_hostname }}.json" + src: "/tmp/{{ inventory_hostname }}-dump.json" + dest: "./result/{{ inventory_hostname }}-dump.json" flat: yes fail_on_missing: no + when: ansible_connection == "aws_ssm" + # xxx not sure how to make ansible 'synchronize' work with tsh + - name: Fetch file from remote hosts + shell: rsync -e 'tsh ssh' -azvP "developer@{{ inventory_hostname }}:/tmp/{{ inventory_hostname }}-dump.json" "./result/{{ inventory_hostname }}-dump.json" + delegate_to: localhost + when: ansible_connection != "aws_ssm" + + - name: remove remote dumps + ansible.builtin.file: + path: "/tmp/{{ inventory_hostname }}-dump.json" + state: absent diff --git a/scripts/sk_collect_dumps/ssh.cfg b/scripts/sk_collect_dumps/ssh.cfg new file mode 100644 index 0000000000..827c5d9286 --- /dev/null +++ b/scripts/sk_collect_dumps/ssh.cfg @@ -0,0 +1,13 @@ +# Begin generated Teleport configuration for teleport.aws.neon.tech by tsh + +# Common flags for all teleport.aws.neon.tech hosts +Host * + HostKeyAlgorithms rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-rsa-cert-v01@openssh.com + +# Flags for all teleport.aws.neon.tech hosts except the proxy +Host * !teleport.aws.neon.tech + Port 3022 + ProxyCommand "/usr/local/bin/tsh" proxy ssh --cluster=teleport.aws.neon.tech --proxy=teleport.aws.neon.tech:443 %r@%h:%p + User developer + +# End generated Teleport configuration \ No newline at end of file diff --git a/scripts/sk_collect_dumps/upload.sh b/scripts/sk_collect_dumps/upload.sh index 2e54ecba1c..5189883fcb 100755 --- a/scripts/sk_collect_dumps/upload.sh +++ b/scripts/sk_collect_dumps/upload.sh @@ -31,22 +31,22 @@ SELECT (data->>'tenant_id') AS tenant_id, (data->>'timeline_id') AS timeline_id, (data->'memory'->>'active')::bool AS active, - (data->'memory'->>'flush_lsn')::bigint AS flush_lsn, - (data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn, - (data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn, - (data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn, - (data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn, - (data->'memory'->>'write_lsn')::bigint AS write_lsn, + (data->'memory'->>'flush_lsn')::pg_lsn AS flush_lsn, + (data->'memory'->'mem_state'->>'backup_lsn')::pg_lsn AS backup_lsn, + (data->'memory'->'mem_state'->>'commit_lsn')::pg_lsn AS commit_lsn, + (data->'memory'->'mem_state'->>'peer_horizon_lsn')::pg_lsn AS peer_horizon_lsn, + 
(data->'memory'->'mem_state'->>'remote_consistent_lsn')::pg_lsn AS remote_consistent_lsn, + (data->'memory'->>'write_lsn')::pg_lsn AS write_lsn, (data->'memory'->>'num_computes')::bigint AS num_computes, - (data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn, + (data->'memory'->>'epoch_start_lsn')::pg_lsn AS epoch_start_lsn, (data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno, (data->'memory'->>'is_cancelled')::bool AS is_cancelled, - (data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn, - (data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn, + (data->'control_file'->>'backup_lsn')::pg_lsn AS disk_backup_lsn, + (data->'control_file'->>'commit_lsn')::pg_lsn AS disk_commit_lsn, (data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term, - (data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn, - (data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn, - (data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn, - (data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn + (data->'control_file'->>'local_start_lsn')::pg_lsn AS local_start_lsn, + (data->'control_file'->>'peer_horizon_lsn')::pg_lsn AS disk_peer_horizon_lsn, + (data->'control_file'->>'timeline_start_lsn')::pg_lsn AS timeline_start_lsn, + (data->'control_file'->>'remote_consistent_lsn')::pg_lsn AS disk_remote_consistent_lsn FROM tmp_json EOF diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index a0c8e1f749..d66cbefa45 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -3,9 +3,12 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use clap::Parser; -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; -use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; + +use storage_broker::proto::SafekeeperTimelineInfo; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SubscribeByFilterRequest, + TenantTimelineId as ProtoTenantTimelineId, TypeSubscription, TypedMessage, +}; use storage_broker::{BrokerClientChannel, DEFAULT_ENDPOINT}; use tokio::time; @@ -91,15 +94,23 @@ async fn subscribe(client: Option, counter: Arc, None => storage_broker::connect(DEFAULT_ENDPOINT, Duration::from_secs(5)).unwrap(), }; - let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { + let ttid = ProtoTenantTimelineId { tenant_id: vec![0xFF; 16], timeline_id: tli_from_u64(i), - }); - let request = SubscribeSafekeeperInfoRequest { - subscription_key: Some(key), }; - let mut stream = client - .subscribe_safekeeper_info(request) + + let request = SubscribeByFilterRequest { + types: vec![TypeSubscription { + r#type: MessageType::SafekeeperTimelineInfo.into(), + }], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: true, + tenant_timeline_id: Some(ttid), + }), + }; + + let mut stream: tonic::Streaming = client + .subscribe_by_filter(request) .await .unwrap() .into_inner(); diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index aa9d62a29f..7d1b63d23f 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -10,6 +10,12 @@ service BrokerService { // Publish safekeeper updates. rpc PublishSafekeeperInfo(stream SafekeeperTimelineInfo) returns (google.protobuf.Empty) {}; + + // Subscribe to all messages, limited by a filter. 
+ rpc SubscribeByFilter(SubscribeByFilterRequest) returns (stream TypedMessage) {}; + + // Publish one message. + rpc PublishOne(TypedMessage) returns (google.protobuf.Empty) {}; } message SubscribeSafekeeperInfoRequest { @@ -48,3 +54,55 @@ message TenantTimelineId { bytes tenant_id = 1; bytes timeline_id = 2; } + +message FilterTenantTimelineId { + // If true, only messages related to `tenant_timeline_id` will be emitted. + // Otherwise, messages for all timelines will be emitted. + bool enabled = 1; + TenantTimelineId tenant_timeline_id = 2; +} + +message TypeSubscription { + MessageType type = 1; +} + +message SubscribeByFilterRequest { + // Subscription will emit messages only of the specified types. You need to specify + // at least one type to receive any messages. + repeated TypeSubscription types = 1; + + // If set and enabled, subscription will emit messages only for the specified tenant/timeline. + optional FilterTenantTimelineId tenant_timeline_id = 2; +} + +enum MessageType { + UNKNOWN = 0; + SAFEKEEPER_TIMELINE_INFO = 2; + SAFEKEEPER_DISCOVERY_REQUEST = 3; + SAFEKEEPER_DISCOVERY_RESPONSE = 4; +} + +// A message with a type. +message TypedMessage { + MessageType type = 1; + + optional SafekeeperTimelineInfo safekeeper_timeline_info = 2; + optional SafekeeperDiscoveryRequest safekeeper_discovery_request = 3; + optional SafekeeperDiscoveryResponse safekeeper_discovery_response = 4; +} + +message SafekeeperDiscoveryRequest { + TenantTimelineId tenant_timeline_id = 1; +} + +// Shorter version of SafekeeperTimelineInfo, contains only necessary fields. +message SafekeeperDiscoveryResponse { + uint64 safekeeper_id = 1; + TenantTimelineId tenant_timeline_id = 2; + // WAL available to download. + uint64 commit_lsn = 3; + // A connection string to use for WAL downloading. + string safekeeper_connstr = 4; + // Availability zone of a safekeeper. + optional string availability_zone = 5; +} diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 9f81ac6cac..4e5f8ed724 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -35,10 +35,16 @@ use tracing::*; use utils::signals::ShutdownSignals; use metrics::{Encoder, TextEncoder}; -use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE}; +use storage_broker::metrics::{ + BROADCASTED_MESSAGES_TOTAL, BROADCAST_DROPPED_MESSAGES_TOTAL, NUM_PUBS, NUM_SUBS_ALL, + NUM_SUBS_TIMELINE, PROCESSED_MESSAGES_TOTAL, PUBLISHED_ONEOFF_MESSAGES_TOTAL, +}; use storage_broker::proto::broker_service_server::{BrokerService, BrokerServiceServer}; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; -use storage_broker::proto::{SafekeeperTimelineInfo, SubscribeSafekeeperInfoRequest}; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SafekeeperTimelineInfo, SubscribeByFilterRequest, SubscribeSafekeeperInfoRequest, TypedMessage, +}; use storage_broker::{ parse_proto_ttid, EitherBody, DEFAULT_KEEPALIVE_INTERVAL, DEFAULT_LISTEN_ADDR, }; @@ -73,8 +79,103 @@ struct Args { log_format: String, } -type PubId = u64; // id of publisher for registering in maps -type SubId = u64; // id of subscriber for registering in maps +/// Id of publisher for registering in maps +type PubId = u64; + +/// Id of subscriber for registering in maps +type SubId = u64; + +/// Single enum type for all messages. 
+#[derive(Clone, Debug, PartialEq)] +#[allow(clippy::enum_variant_names)] +enum Message { + SafekeeperTimelineInfo(SafekeeperTimelineInfo), + SafekeeperDiscoveryRequest(SafekeeperDiscoveryRequest), + SafekeeperDiscoveryResponse(SafekeeperDiscoveryResponse), +} + +impl Message { + /// Convert proto message to internal message. + pub fn from(proto_msg: TypedMessage) -> Result { + match proto_msg.r#type() { + MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo( + proto_msg.safekeeper_timeline_info.ok_or_else(|| { + Status::new(Code::InvalidArgument, "missing safekeeper_timeline_info") + })?, + )), + MessageType::SafekeeperDiscoveryRequest => Ok(Message::SafekeeperDiscoveryRequest( + proto_msg.safekeeper_discovery_request.ok_or_else(|| { + Status::new( + Code::InvalidArgument, + "missing safekeeper_discovery_request", + ) + })?, + )), + MessageType::SafekeeperDiscoveryResponse => Ok(Message::SafekeeperDiscoveryResponse( + proto_msg.safekeeper_discovery_response.ok_or_else(|| { + Status::new( + Code::InvalidArgument, + "missing safekeeper_discovery_response", + ) + })?, + )), + MessageType::Unknown => Err(Status::new( + Code::InvalidArgument, + format!("invalid message type: {:?}", proto_msg.r#type), + )), + } + } + + /// Get the tenant_timeline_id from the message. + pub fn tenant_timeline_id(&self) -> Result, Status> { + match self { + Message::SafekeeperTimelineInfo(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + Message::SafekeeperDiscoveryRequest(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + Message::SafekeeperDiscoveryResponse(msg) => Ok(msg + .tenant_timeline_id + .as_ref() + .map(parse_proto_ttid) + .transpose()?), + } + } + + /// Convert internal message to the protobuf struct. + pub fn as_typed_message(&self) -> TypedMessage { + let mut res = TypedMessage { + r#type: self.message_type() as i32, + ..Default::default() + }; + match self { + Message::SafekeeperTimelineInfo(msg) => { + res.safekeeper_timeline_info = Some(msg.clone()) + } + Message::SafekeeperDiscoveryRequest(msg) => { + res.safekeeper_discovery_request = Some(msg.clone()) + } + Message::SafekeeperDiscoveryResponse(msg) => { + res.safekeeper_discovery_response = Some(msg.clone()) + } + } + res + } + + /// Get the message type. + pub fn message_type(&self) -> MessageType { + match self { + Message::SafekeeperTimelineInfo(_) => MessageType::SafekeeperTimelineInfo, + Message::SafekeeperDiscoveryRequest(_) => MessageType::SafekeeperDiscoveryRequest, + Message::SafekeeperDiscoveryResponse(_) => MessageType::SafekeeperDiscoveryResponse, + } + } +} #[derive(Copy, Clone, Debug)] enum SubscriptionKey { @@ -83,7 +184,7 @@ enum SubscriptionKey { } impl SubscriptionKey { - // Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors). + /// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors). 
pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result { match key { ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All), @@ -92,14 +193,29 @@ impl SubscriptionKey { } } } + + /// Parse from FilterTenantTimelineId + pub fn from_proto_filter_tenant_timeline_id( + f: &FilterTenantTimelineId, + ) -> Result { + if !f.enabled { + return Ok(SubscriptionKey::All); + } + + let ttid = + parse_proto_ttid(f.tenant_timeline_id.as_ref().ok_or_else(|| { + Status::new(Code::InvalidArgument, "missing tenant_timeline_id") + })?)?; + Ok(SubscriptionKey::Timeline(ttid)) + } } -// Channel to timeline subscribers. +/// Channel to timeline subscribers. struct ChanToTimelineSub { - chan: broadcast::Sender, - // Tracked separately to know when delete the shmem entry. receiver_count() - // is unhandy for that as unregistering and dropping the receiver side - // happens at different moments. + chan: broadcast::Sender, + /// Tracked separately to know when delete the shmem entry. receiver_count() + /// is unhandy for that as unregistering and dropping the receiver side + /// happens at different moments. num_subscribers: u64, } @@ -110,7 +226,7 @@ struct SharedState { num_subs_to_timelines: i64, chans_to_timeline_subs: HashMap, num_subs_to_all: i64, - chan_to_all_subs: broadcast::Sender, + chan_to_all_subs: broadcast::Sender, } impl SharedState { @@ -146,7 +262,7 @@ impl SharedState { &mut self, sub_key: SubscriptionKey, timeline_chan_size: usize, - ) -> (SubId, broadcast::Receiver) { + ) -> (SubId, broadcast::Receiver) { let sub_id = self.next_sub_id; self.next_sub_id += 1; let sub_rx = match sub_key { @@ -262,6 +378,29 @@ impl Registry { subscriber.id, subscriber.key, subscriber.remote_addr ); } + + /// Send msg to relevant subscribers. + pub fn send_msg(&self, msg: &Message) -> Result<(), Status> { + PROCESSED_MESSAGES_TOTAL.inc(); + + // send message to subscribers for everything + let shared_state = self.shared_state.read(); + // Err means there is no subscribers, it is fine. + shared_state.chan_to_all_subs.send(msg.clone()).ok(); + + // send message to per timeline subscribers, if there is ttid + let ttid = msg.tenant_timeline_id()?; + if let Some(ttid) = ttid { + if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) { + // Err can't happen here, as tx is destroyed only after removing + // from the map the last subscriber along with tx. + subs.chan + .send(msg.clone()) + .expect("rx is still in the map with zero subscribers"); + } + } + Ok(()) + } } // Private subscriber state. @@ -269,7 +408,7 @@ struct Subscriber { id: SubId, key: SubscriptionKey, // Subscriber receives messages from publishers here. - sub_rx: broadcast::Receiver, + sub_rx: broadcast::Receiver, // to unregister itself from shared state in Drop registry: Registry, // for logging @@ -291,26 +430,9 @@ struct Publisher { } impl Publisher { - // Send msg to relevant subscribers. - pub fn send_msg(&mut self, msg: &SafekeeperTimelineInfo) -> Result<(), Status> { - // send message to subscribers for everything - let shared_state = self.registry.shared_state.read(); - // Err means there is no subscribers, it is fine. 
- shared_state.chan_to_all_subs.send(msg.clone()).ok(); - - // send message to per timeline subscribers - let ttid = - parse_proto_ttid(msg.tenant_timeline_id.as_ref().ok_or_else(|| { - Status::new(Code::InvalidArgument, "missing tenant_timeline_id") - })?)?; - if let Some(subs) = shared_state.chans_to_timeline_subs.get(&ttid) { - // Err can't happen here, as tx is destroyed only after removing - // from the map the last subscriber along with tx. - subs.chan - .send(msg.clone()) - .expect("rx is still in the map with zero subscribers"); - } - Ok(()) + /// Send msg to relevant subscribers. + pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> { + self.registry.send_msg(msg) } } @@ -339,7 +461,7 @@ impl BrokerService for Broker { loop { match stream.next().await { - Some(Ok(msg)) => publisher.send_msg(&msg)?, + Some(Ok(msg)) => publisher.send_msg(&Message::SafekeeperTimelineInfo(msg))?, Some(Err(e)) => return Err(e), // grpc error from the stream None => break, // closed stream } @@ -371,8 +493,15 @@ impl BrokerService for Broker { let mut missed_msgs: u64 = 0; loop { match subscriber.sub_rx.recv().await { - Ok(info) => yield info, + Ok(info) => { + match info { + Message::SafekeeperTimelineInfo(info) => yield info, + _ => {}, + } + BROADCASTED_MESSAGES_TOTAL.inc(); + }, Err(RecvError::Lagged(skipped_msg)) => { + BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg); missed_msgs += skipped_msg; if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() { warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", @@ -392,6 +521,78 @@ impl BrokerService for Broker { Box::pin(output) as Self::SubscribeSafekeeperInfoStream )) } + + type SubscribeByFilterStream = + Pin> + Send + 'static>>; + + /// Subscribe to all messages, limited by a filter. + async fn subscribe_by_filter( + &self, + request: Request, + ) -> std::result::Result, Status> { + let remote_addr = request + .remote_addr() + .expect("TCPConnectInfo inserted by handler"); + let proto_filter = request.into_inner(); + let ttid_filter = proto_filter + .tenant_timeline_id + .as_ref() + .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?; + + let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?; + let types_set = proto_filter + .types + .iter() + .map(|t| t.r#type) + .collect::>(); + + let mut subscriber = self.registry.register_subscriber(sub_key, remote_addr); + + // transform rx into stream with item = Result, as method result demands + let output = async_stream::try_stream! 
{ + let mut warn_interval = time::interval(Duration::from_millis(1000)); + let mut missed_msgs: u64 = 0; + loop { + match subscriber.sub_rx.recv().await { + Ok(msg) => { + let msg_type = msg.message_type() as i32; + if types_set.contains(&msg_type) { + yield msg.as_typed_message(); + BROADCASTED_MESSAGES_TOTAL.inc(); + } + }, + Err(RecvError::Lagged(skipped_msg)) => { + BROADCAST_DROPPED_MESSAGES_TOTAL.inc_by(skipped_msg); + missed_msgs += skipped_msg; + if (futures::poll!(Box::pin(warn_interval.tick()))).is_ready() { + warn!("subscription id={}, key={:?} addr={:?} dropped {} messages, channel is full", + subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); + missed_msgs = 0; + } + } + Err(RecvError::Closed) => { + // can't happen, we never drop the channel while there is a subscriber + Err(Status::new(Code::Internal, "channel unexpectantly closed"))?; + } + } + } + }; + + Ok(Response::new( + Box::pin(output) as Self::SubscribeByFilterStream + )) + } + + /// Publish one message. + async fn publish_one( + &self, + request: Request, + ) -> std::result::Result, Status> { + let msg = Message::from(request.into_inner())?; + PUBLISHED_ONEOFF_MESSAGES_TOTAL.inc(); + self.registry.send_msg(&msg)?; + Ok(Response::new(())) + } } // We serve only metrics and healthcheck through http1. @@ -515,8 +716,8 @@ mod tests { use tokio::sync::broadcast::error::TryRecvError; use utils::id::{TenantId, TimelineId}; - fn msg(timeline_id: Vec) -> SafekeeperTimelineInfo { - SafekeeperTimelineInfo { + fn msg(timeline_id: Vec) -> Message { + Message::SafekeeperTimelineInfo(SafekeeperTimelineInfo { safekeeper_id: 1, tenant_timeline_id: Some(ProtoTenantTimelineId { tenant_id: vec![0x00; 16], @@ -533,7 +734,7 @@ mod tests { http_connstr: "neon-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, - } + }) } fn tli_from_u64(i: u64) -> Vec { diff --git a/storage_broker/src/metrics.rs b/storage_broker/src/metrics.rs index f0649d0f68..1fd3dd5ad6 100644 --- a/storage_broker/src/metrics.rs +++ b/storage_broker/src/metrics.rs @@ -1,6 +1,6 @@ //! Broker metrics. 
-use metrics::{register_int_gauge, IntGauge}; +use metrics::{register_int_counter, register_int_gauge, IntCounter, IntGauge}; use once_cell::sync::Lazy; pub static NUM_PUBS: Lazy = Lazy::new(|| { @@ -23,3 +23,35 @@ pub static NUM_SUBS_ALL: Lazy = Lazy::new(|| { ) .expect("Failed to register metric") }); + +pub static PROCESSED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_processed_messages_total", + "Number of messages received by storage broker, before routing and broadcasting" + ) + .expect("Failed to register metric") +}); + +pub static BROADCASTED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_broadcasted_messages_total", + "Number of messages broadcasted (sent over network) to subscribers" + ) + .expect("Failed to register metric") +}); + +pub static BROADCAST_DROPPED_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_broadcast_dropped_messages_total", + "Number of messages dropped due to channel capacity overflow" + ) + .expect("Failed to register metric") +}); + +pub static PUBLISHED_ONEOFF_MESSAGES_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "storage_broker_published_oneoff_messages_total", + "Number of one-off messages sent via PublishOne method" + ) + .expect("Failed to register metric") +}); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 42e122cefe..597e311e02 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -365,6 +365,12 @@ class PgProtocol: result.append(cur.fetchall()) return result + def safe_psql_scalar(self, query) -> Any: + """ + Execute query returning single row with single column. + """ + return self.safe_psql(query)[0][0] + @dataclass class AuthKeys: @@ -457,7 +463,6 @@ class NeonEnvBuilder: self.preserve_database_files = preserve_database_files self.initial_tenant = initial_tenant or TenantId.generate() self.initial_timeline = initial_timeline or TimelineId.generate() - self.enable_generations = True self.scrub_on_exit = False self.test_output_dir = test_output_dir @@ -677,8 +682,7 @@ class NeonEnvBuilder: pageserver.stop(immediate=True) - if self.env.attachment_service is not None: - self.env.attachment_service.stop(immediate=True) + self.env.attachment_service.stop(immediate=True) cleanup_error = None @@ -772,13 +776,9 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - if config.enable_generations: - attachment_service_port = self.port_distributor.get_port() - self.control_plane_api: Optional[str] = f"http://127.0.0.1:{attachment_service_port}" - self.attachment_service: Optional[NeonAttachmentService] = NeonAttachmentService(self) - else: - self.control_plane_api = None - self.attachment_service = None + attachment_service_port = self.port_distributor.get_port() + self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}" + self.attachment_service: NeonAttachmentService = NeonAttachmentService(self) # Create a config file corresponding to the options cfg: Dict[str, Any] = { @@ -851,8 +851,7 @@ class NeonEnv: # Start up broker, pageserver and all safekeepers self.broker.try_start() - if self.attachment_service is not None: - self.attachment_service.start() + self.attachment_service.start() for pageserver in self.pageservers: pageserver.start() @@ -1834,20 +1833,19 @@ class NeonPageserver(PgProtocol): """ client = self.http_client() return client.tenant_attach( - tenant_id, config, 
config_null, generation=self.maybe_get_generation(tenant_id) + tenant_id, + config, + config_null, + generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id), ) def tenant_detach(self, tenant_id: TenantId): - if self.env.attachment_service is not None: - self.env.attachment_service.attach_hook_drop(tenant_id) + self.env.attachment_service.attach_hook_drop(tenant_id) client = self.http_client() return client.tenant_detach(tenant_id) def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs): - # This API is only for use when generations are enabled - assert self.env.attachment_service is not None - if config["mode"].startswith("Attached") and "generation" not in config: config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) @@ -1873,26 +1871,15 @@ class NeonPageserver(PgProtocol): generation: Optional[int] = None, ) -> TenantId: if generation is None: - generation = self.maybe_get_generation(tenant_id) + generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) client = self.http_client(auth_token=auth_token) return client.tenant_create(tenant_id, conf, generation=generation) def tenant_load(self, tenant_id: TenantId): client = self.http_client() - return client.tenant_load(tenant_id, generation=self.maybe_get_generation(tenant_id)) - - def maybe_get_generation(self, tenant_id: TenantId): - """ - For tests that would like to use an HTTP client directly instead of using - the `tenant_attach` and `tenant_create` helpers here: issue a generation - number for a tenant. - - Returns None if the attachment service is not enabled (legacy mode) - """ - if self.env.attachment_service is not None: - return self.env.attachment_service.attach_hook_issue(tenant_id, self.id) - else: - return None + return client.tenant_load( + tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + ) def append_pageserver_param_overrides( @@ -2752,6 +2739,13 @@ class Endpoint(PgProtocol): ): self.stop() + # Checkpoints running endpoint and returns pg_wal size in MB. + def get_pg_wal_size(self): + log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') + self.safe_psql("checkpoint") + assert self.pgdata_dir is not None # please mypy + return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024 + class EndpointFactory: """An object representing multiple compute endpoints.""" @@ -2950,6 +2944,13 @@ class Safekeeper: return segments +# Walreceiver as returned by sk's timeline status endpoint. 
+@dataclass +class Walreceiver: + conn_id: int + state: str + + @dataclass class SafekeeperTimelineStatus: acceptor_epoch: int @@ -2960,6 +2961,7 @@ class SafekeeperTimelineStatus: backup_lsn: Lsn peer_horizon_lsn: Lsn remote_consistent_lsn: Lsn + walreceivers: List[Walreceiver] @dataclass @@ -3021,6 +3023,7 @@ class SafekeeperHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") res.raise_for_status() resj = res.json() + walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] return SafekeeperTimelineStatus( acceptor_epoch=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], @@ -3030,6 +3033,7 @@ class SafekeeperHttpClient(requests.Session): backup_lsn=Lsn(resj["backup_lsn"]), peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), + walreceivers=walreceivers, ) def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index eda8813c36..add6c4288a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -510,13 +510,21 @@ class PageserverHttpClient(requests.Session): assert res_json is None def timeline_get_lsn_by_timestamp( - self, tenant_id: TenantId, timeline_id: TimelineId, timestamp, version: int + self, + tenant_id: TenantId, + timeline_id: TimelineId, + timestamp, + version: Optional[int] = None, ): log.info( f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" ) + if version is None: + version_str = "" + else: + version_str = f"&version={version}" res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}&version={version}", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}", ) self.verbose_error(res) res_json = res.json() diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index d95368f990..ea648e460d 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/types.py @@ -125,3 +125,51 @@ class TenantId(Id): class TimelineId(Id): def __repr__(self) -> str: return f'TimelineId("{self.id.hex()}")' + + +# Workaround for compat with python 3.9, which does not have `typing.Self` +TTenantShardId = TypeVar("TTenantShardId", bound="TenantShardId") + + +class TenantShardId: + def __init__(self, tenant_id: TenantId, shard_number: int, shard_count: int): + self.tenant_id = tenant_id + self.shard_number = shard_number + self.shard_count = shard_count + assert self.shard_number < self.shard_count or self.shard_count == 0 + + @classmethod + def parse(cls: Type[TTenantShardId], input) -> TTenantShardId: + if len(input) == 32: + return cls( + tenant_id=TenantId(input), + shard_number=0, + shard_count=0, + ) + elif len(input) == 37: + return cls( + tenant_id=TenantId(input[0:32]), + shard_number=int(input[33:35], 16), + shard_count=int(input[35:37], 16), + ) + else: + raise ValueError(f"Invalid TenantShardId '{input}'") + + def __str__(self): + return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + + def _tuple(self) -> tuple[TenantId, int, int]: + return (self.tenant_id, self.shard_number, self.shard_count) + + def __lt__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return 
self._tuple() < other._tuple() + + def __eq__(self, other) -> bool: + if not isinstance(other, type(self)): + return NotImplemented + return self._tuple() == other._tuple() + + def __hash__(self) -> int: + return hash(self._tuple()) diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index a2a1fa11e5..edc23b29ba 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -61,7 +61,6 @@ def measure_recovery_time(env: NeonCompare): # of view, but the same as far as the safekeeper/WAL is concerned. To work around that, # we will explicitly create the tenant in the same generation that it was previously # attached in. - assert env.env.attachment_service is not None attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant) assert attach_status is not None (attach_gen, _) = attach_status diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 0f7615f7ed..1e6e9a0174 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -17,6 +17,27 @@ class LabelledQuery: query: str +# This must run before all tests in this module +# create extension pg_stat_statements if it does not exist +# and TEST_OLAP_COLLECT_PG_STAT_STATEMENTS is set to true (default false) +# Theoretically this could be in a module or session scope fixture, +# however the code depends on other fixtures that have function scope +@pytest.mark.skipif( + os.getenv("TEST_OLAP_COLLECT_PG_STAT_STATEMENTS", "false").lower() == "false", + reason="Skipping - Creating extension pg_stat_statements", +) +@pytest.mark.remote_cluster +def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare): + log.info("Creating extension pg_stat_statements") + query = LabelledQuery( + "Q_CREATE_EXTENSION", r"CREATE EXTENSION IF NOT EXISTS pg_stat_statements;" + ) + run_psql(remote_compare, query, times=1, explain=False) + log.info("Reset pg_stat_statements") + query = LabelledQuery("Q_RESET", r"SELECT pg_stat_statements_reset();") + run_psql(remote_compare, query, times=1, explain=False) + + # A list of queries to run. # Please do not alter the label for the query, as it is used to identify it. # Labels for ClickBench queries match the labels in ClickBench reports @@ -78,6 +99,8 @@ QUERIES: Tuple[LabelledQuery, ...] 
= (
    # fmt: on
)

+EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)"
+

def get_scale() -> List[str]:
    # We parametrize each tpc-h and clickbench test with scale
@@ -88,7 +111,10 @@ def get_scale() -> List[str]:
    return [scale]


-def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> None:
+# run the query `times` times, plus once with EXPLAIN (ANALYZE, VERBOSE, ...) if explain is requested
+def run_psql(
+    env: RemoteCompare, labelled_query: LabelledQuery, times: int, explain: bool = False
+) -> None:
    # prepare connstr:
    # - cut out password from connstr to pass it via env
    # - add options to connstr
@@ -108,6 +134,13 @@ def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> N
            log.info(f"Run {run}/{times}")
            with env.zenbenchmark.record_duration(f"{label}/{run}"):
                env.pg_bin.run_capture(["psql", connstr, "-c", query], env=environ)
+    if explain:
+        log.info(f"Explaining query {label}")
+        run += 1
+        with env.zenbenchmark.record_duration(f"{label}/EXPLAIN"):
+            env.pg_bin.run_capture(
+                ["psql", connstr, "-c", f"{EXPLAIN_STRING} {query}"], env=environ
+            )


@pytest.mark.parametrize("scale", get_scale())
@@ -118,10 +151,13 @@ def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare, scale:
    An OLAP-style ClickHouse benchmark

    Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql
-    The DB prepared manually in advance
+    The DB is prepared manually in advance.
+    Important: after the initial data load, run `VACUUM (DISABLE_PAGE_SKIPPING, FREEZE, ANALYZE) hits;`
+    to ensure that the Postgres optimizer chooses the same plans as RDS and Aurora.
    """
+    explain: bool = os.getenv("TEST_OLAP_COLLECT_EXPLAIN", "false").lower() == "true"

-    run_psql(remote_compare, query, times=3)
+    run_psql(remote_compare, query, times=3, explain=explain)


def tpch_queuies() -> Tuple[ParameterSet, ...]:
@@ -195,3 +231,16 @@ def test_user_examples(remote_compare: RemoteCompare):
        """,
    )
    run_psql(remote_compare, query, times=3)
+
+
+# This must run after all tests in this module
+# Collect pg_stat_statements after running the tests if TEST_OLAP_COLLECT_PG_STAT_STATEMENTS is set to true (default false)
+@pytest.mark.skipif(
+    os.getenv("TEST_OLAP_COLLECT_PG_STAT_STATEMENTS", "false").lower() == "false",
+    reason="Skipping - Collecting pg_stat_statements",
+)
+@pytest.mark.remote_cluster
+def test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare):
+    log.info("Collecting pg_stat_statements")
+    query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;")
+    run_psql(remote_compare, query, times=1, explain=False)
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 352ec13884..32397bbcc1 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -136,10 +136,7 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]):
    ps_http.tenant_detach(tenant_id)
    assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()]

-    body = {}
-    gen = env.pageserver.maybe_get_generation(tenant_id)
-    if gen is not None:
-        body["generation"] = gen
+    body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)}

    ps_http.post(
        f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach",
diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py
index 1b6c982850..adb67a579e
100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -87,7 +87,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # # Since we're dual-attached, need to tip-off attachment service to treat the one we're # about to start as the attached pageserver - assert env.attachment_service is not None env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index f3f3a1ddf3..9fdc4d59f5 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -1,6 +1,7 @@ +import enum import time from dataclasses import dataclass -from typing import Dict, Tuple +from typing import Any, Dict, Tuple import pytest import toml @@ -64,6 +65,23 @@ def test_min_resident_size_override_handling( assert_config(tenant_id, None, config_level_override) +@enum.unique +class EvictionOrder(str, enum.Enum): + ABSOLUTE_ORDER = "absolute" + RELATIVE_ORDER_EQUAL = "relative_equal" + RELATIVE_ORDER_SPARE = "relative_spare" + + def config(self) -> Dict[str, Any]: + if self == EvictionOrder.ABSOLUTE_ORDER: + return {"type": "AbsoluteAccessed"} + elif self == EvictionOrder.RELATIVE_ORDER_EQUAL: + return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}} + elif self == EvictionOrder.RELATIVE_ORDER_SPARE: + return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": True}} + else: + raise RuntimeError(f"not implemented: {self}") + + @dataclass class EvictionEnv: timelines: list[Tuple[TenantId, TimelineId]] @@ -108,13 +126,14 @@ class EvictionEnv: _avg = cur.fetchone() def pageserver_start_with_disk_usage_eviction( - self, period, max_usage_pct, min_avail_bytes, mock_behavior + self, period, max_usage_pct, min_avail_bytes, mock_behavior, eviction_order: EvictionOrder ): disk_usage_config = { "period": period, "max_usage_pct": max_usage_pct, "min_avail_bytes": min_avail_bytes, "mock_statvfs": mock_behavior, + "eviction_order": eviction_order.config(), } enc = toml.TomlEncoder() @@ -270,7 +289,13 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) -def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_evicts_until_pressure_is_relieved( + eviction_env: EvictionEnv, order: EvictionOrder +): """ Basic test to ensure that we evict enough to relieve pressure. 
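    The order parameter is one of the EvictionOrder variants defined above; its
    config() payload is passed through to the pageserver as-is. For reference, a
    sketch of the request this test issues (mirroring the call in the body below):

        response = pageserver_http.disk_usage_eviction_run(
            {"evict_bytes": target, "eviction_order": order.config()}
        )
        # e.g. EvictionOrder.RELATIVE_ORDER_EQUAL.config() ==
        # {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}}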
""" @@ -281,7 +306,9 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) target = total_on_disk // 2 - response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + response = pageserver_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -296,7 +323,13 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" -def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_respects_overridden_resident_size( + eviction_env: EvictionEnv, order: EvictionOrder +): """ Override tenant min resident and ensure that it will be respected by eviction. """ @@ -336,7 +369,9 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) env.warm_up_tenant(large_tenant[0]) # do one run - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") time.sleep(1) # give log time to flush @@ -365,7 +400,11 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target -def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ If we can't relieve pressure using tenant_min_resident_size-respecting eviction, we should continue to evict layers following global LRU. @@ -376,7 +415,9 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() target = total_on_disk - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -389,7 +430,15 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) -def test_partial_evict_tenant(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [ + EvictionOrder.ABSOLUTE_ORDER, + EvictionOrder.RELATIVE_ORDER_EQUAL, + EvictionOrder.RELATIVE_ORDER_SPARE, + ], +) +def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): """ Warm up a tenant, then build up pressure to cause in evictions in both. We expect @@ -402,7 +451,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() du_by_timeline = env.du_by_timeline() - # pick any tenant + # pick smaller or greater (iteration order is insertion order of scale=4 and scale=6) [warm, cold] = list(du_by_timeline.keys()) (tenant_id, timeline_id) = warm @@ -413,7 +462,9 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): # but not enough to fall into global LRU. 
# So, set target to all occupied space, except 2*env.layer_size per tenant target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -428,28 +479,32 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): ), "all tenants should have lost some layers" warm_size = later_du_by_timeline[warm] - - # bounds for warmed_size - warm_lower = 0.5 * du_by_timeline[warm] - - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. - warm_upper = warm_lower + 3 * env.layer_size - cold_size = later_du_by_timeline[cold] - cold_upper = 2 * env.layer_size - log.info( - f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" - ) - log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") + if order == EvictionOrder.ABSOLUTE_ORDER: + # bounds for warmed_size + warm_lower = 0.5 * du_by_timeline[warm] - assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" - assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" + # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. + # So, check for up to 3 here. + warm_upper = warm_lower + 3 * env.layer_size - assert ( - cold_size < cold_upper - ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" + cold_upper = 2 * env.layer_size + log.info(f"tenants: warm={warm[0]}, cold={cold[0]}") + log.info( + f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + ) + log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") + + assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" + assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" + + assert ( + cold_size < cold_upper + ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" + else: + # just go with the space was freed, find proper limits later + pass def poor_mans_du( @@ -501,6 +556,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): "type": "Failure", "mocked_error": "EIO", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") @@ -533,6 +589,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) def relieved_log_message(): @@ -573,6 +630,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. 
"name_filter": ".*__.*", }, + eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) def relieved_log_message(): diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index ef2b2185c3..340188c1ae 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -157,7 +157,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites def get_generation_number(): - assert env.attachment_service is not None attachment = env.attachment_service.inspect(tenant_id) assert attachment is not None return attachment[0] diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index f79c1c347c..65d6d7a9fd 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -8,71 +8,6 @@ from fixtures.types import Lsn from fixtures.utils import query_scalar -# -# Test pageserver get_lsn_by_timestamp API -# -def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start() - - new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping") - endpoint_main = env.endpoints.create_start("test_lsn_mapping") - log.info("postgres is running on 'test_lsn_mapping' branch") - - cur = endpoint_main.connect().cursor() - # Create table, and insert rows, each in a separate transaction - # Disable synchronous_commit to make this initialization go faster. - # - # Each row contains current insert LSN and the current timestamp, when - # the row was inserted. - cur.execute("SET synchronous_commit=off") - cur.execute("CREATE TABLE foo (x integer)") - tbl = [] - for i in range(1000): - cur.execute("INSERT INTO foo VALUES(%s)", (i,)) - # Get the timestamp at UTC - after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None) - tbl.append([i, after_timestamp]) - - # Execute one more transaction with synchronous_commit enabled, to flush - # all the previous transactions - cur.execute("SET synchronous_commit=on") - cur.execute("INSERT INTO foo VALUES (-1)") - - # Wait until WAL is received by pageserver - wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) - - with env.pageserver.http_client() as client: - # Check edge cases: timestamp in the future - probe_timestamp = tbl[-1][1] + timedelta(hours=1) - result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - assert result == "future" - - # timestamp too the far history - probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - assert result == "past" - - # Probe a bunch of timestamps in the valid range - for i in range(1, len(tbl), 100): - probe_timestamp = tbl[i][1] - lsn = client.timeline_get_lsn_by_timestamp( - env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1 - ) - # Call get_lsn_by_timestamp to get the LSN - # Launch a new read-only node at that LSN, and check that only the rows - # that were supposed to be committed at that point in time are visible. 
- endpoint_here = env.endpoints.create_start( - branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn - ) - assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i - - endpoint_here.stop_and_destroy() - - # # Test pageserver get_lsn_by_timestamp API # @@ -130,7 +65,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] == "future" # make sure that we return a well advanced lsn here @@ -139,7 +74,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] == "past" # make sure that we return the minimum lsn here at the start of the range @@ -149,7 +84,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): for i in range(1, len(tbl), 100): probe_timestamp = tbl[i][1] result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2 + tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" ) assert result["kind"] not in ["past", "nodata"] lsn = result["lsn"] diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 64e41a2dd5..573d2139ce 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -72,7 +72,9 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): # create new tenant and check it is also there tenant_id = TenantId.generate() - client.tenant_create(tenant_id, generation=env.pageserver.maybe_get_generation(tenant_id)) + client.tenant_create( + tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id) + ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} timelines = client.timeline_list(tenant_id) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 4488be31c5..9c2f5786d4 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -187,7 +187,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): - After upgrade, the bucket should contain a mixture. - In both cases, postgres I/O should work. 
""" - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -196,7 +195,6 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.broker.try_start() for sk in env.safekeepers: sk.start() - assert env.attachment_service is not None env.attachment_service.start() env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) @@ -262,12 +260,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None some_other_pageserver = 1234 ps_http = env.pageserver.http_client() @@ -341,7 +337,6 @@ def test_deletion_queue_recovery( :param validate_before: whether to wait for deletions to be validated before restart. This makes them elegible to be executed after restart, if the same node keeps the attachment. """ - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -405,7 +400,6 @@ def test_deletion_queue_recovery( if keep_attachment == KeepAttachment.LOSE: some_other_pageserver = 101010 - assert env.attachment_service is not None env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) env.pageserver.start() @@ -453,7 +447,6 @@ def test_deletion_queue_recovery( def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -473,7 +466,6 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) # Simulate a major incident: the control plane goes offline - assert env.attachment_service is not None env.attachment_service.stop() # Remember how many validations had happened before the control plane went offline @@ -545,7 +537,6 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): and must be constructed using the proper generation for the layer, which may not be the same generation that the tenant is running in. """ - neon_env_builder.enable_generations = True neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) @@ -575,7 +566,6 @@ def test_multi_attach( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, ): - neon_env_builder.enable_generations = True neon_env_builder.num_pageservers = 3 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 3cac32b790..c4499196b5 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -9,9 +9,7 @@ from fixtures.utils import wait_until # Test restarting page server, while safekeeper and compute node keep # running. 
-@pytest.mark.parametrize("generations", [True, False]) -def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool): - neon_env_builder.enable_generations = generations +def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 64ade346aa..8ae4297983 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -57,13 +57,11 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): states are valid, so that we may test it in this way: the API should always work as long as the tenant exists. """ - neon_env_builder.enable_generations = True neon_env_builder.num_pageservers = 3 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None pageservers = env.pageservers list([p.http_client() for p in pageservers]) @@ -210,13 +208,11 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): """ Test the sequence of location states that are used in a live migration. """ - neon_env_builder.enable_generations = True neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None tenant_id = env.initial_tenant timeline_id = env.initial_timeline diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 3004d69f50..2fda56d0f4 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -60,8 +60,6 @@ def test_remote_storage_backup_and_restore( neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - neon_env_builder.enable_generations = generations - # Exercise retry code path by making all uploads and downloads fail for the # first time. The retries print INFO-messages to the log; we will check # that they are present after the test. 
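The changes above drop the `enable_generations` switch and the `attachment_service is not None` asserts because the attachment service is now always part of the test environment. A minimal sketch of the generation workflow the tests rely on, assuming only the `attach_hook_issue`/`inspect`/`tenant_create` calls that appear elsewhere in this patch:

tenant_id = TenantId.generate()
# the attachment service issues the generation a pageserver should attach in
generation = env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)
env.pageserver.http_client().tenant_create(tenant_id, generation=generation)
# later, recover the generation the tenant was last attached in
attach_status = env.attachment_service.inspect(tenant_id=tenant_id)
assert attach_status is not None
(attach_gen, _) = attach_status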
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index c6d578a7a2..82ffcb1177 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -263,15 +263,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http, env.initial_tenant, timeline_id, iterations=iterations ) - if failpoint == "timeline-delete-after-index-delete": - m = ps_http.get_metrics() - assert ( - m.query_one( - "remote_storage_s3_request_seconds_count", - filter={"request_type": "get_object", "result": "ok"}, - ).value - == 1 # index part for initial timeline - ) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed # this also checks that delete can be retried even when timeline is in Broken state diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 6e510b2eba..11685d1d48 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,3 +1,4 @@ +import concurrent.futures import math import queue import random @@ -24,6 +25,7 @@ from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, wait_for_upload_queue_empty, + wait_tenant_status_404, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion @@ -776,6 +778,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): def get_tenant_states(): states = {} + log.info(f"Tenant ids: {tenant_ids}") for tenant_id in tenant_ids: tenant = pageserver_http.tenant_status(tenant_id=tenant_id) states[tenant_id] = tenant["state"]["slug"] @@ -872,3 +875,51 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants ) assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants + + # Check that tenant deletion proactively wakes tenants: this is done separately to the main + # body of the test because it will disrupt tenant counts + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} + ) + + wait_until(10, 1, at_least_one_active) + delete_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[0][0] + + # Deleting a stuck tenant should prompt it to go active + with concurrent.futures.ThreadPoolExecutor() as executor: + log.info("Starting background delete") + + def delete_tenant(): + env.pageserver.http_client().tenant_delete(delete_tenant_id) + + background_delete = executor.submit(delete_tenant) + + # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating + # logical size is paused in a failpoint. 
So instead we will use a log observation to check that + # on-demand activation was triggered by the tenant deletion + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*" + + def activated_on_demand(): + assert env.pageserver.log_contains(log_match) is not None + + log.info(f"Waiting for activation message '{log_match}'") + try: + wait_until(10, 1, activated_on_demand) + finally: + log.info("Clearing failpoint") + pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) + + # Deletion should complete successfully now that failpoint is unblocked + log.info("Joining background delete") + background_delete.result(timeout=10) + + # Poll for deletion to complete + wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) + tenant_ids.remove(delete_tenant_id) + + # Check that all the stuck tenants proceed to active (apart from the one that deletes) + wait_until(10, 1, all_active) + assert len(get_tenant_states()) == n_tenants - 1 diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 3c40a9cb3e..cf8df389c8 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -419,7 +419,8 @@ def wait(f, desc, timeout=30, wait_f=None): try: if f(): break - except Exception: + except Exception as e: + log.info(f"got exception while waiting for {desc}: {e}") pass elapsed = time.time() - started_at if elapsed > timeout: @@ -1001,8 +1002,40 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder): endpoint.start() +# Context manager which logs passed time on exit. +class DurationLogger: + def __init__(self, desc): + self.desc = desc + + def __enter__(self): + self.ts_before = time.time() + + def __exit__(self, *exc): + log.info(f"{self.desc} finished in {time.time() - self.ts_before}s") + + +# Context manager which logs WAL position change on exit. +class WalChangeLogger: + def __init__(self, ep, desc_before): + self.ep = ep + self.desc_before = desc_before + + def __enter__(self): + self.ts_before = time.time() + self.lsn_before = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()")) + log.info(f"{self.desc_before}, lsn_before={self.lsn_before}") + + def __exit__(self, *exc): + lsn_after = Lsn(self.ep.safe_psql_scalar("select pg_current_wal_lsn()")) + log.info( + f"inserted {((lsn_after - self.lsn_before) / 1024 / 1024):.3f} MB of WAL in {(time.time() - self.ts_before):.3f}s" + ) + + # Test that we can create timeline with one safekeeper down and initialize it -# later when some data already had been written. +# later when some data already had been written. It is strictly weaker than +# test_lagging_sk, but also is the simplest test to trigger WAL sk -> compute +# download (recovery) and as such useful for development/testing. def test_late_init(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -1010,12 +1043,13 @@ def test_late_init(neon_env_builder: NeonEnvBuilder): sk1 = env.safekeepers[0] sk1.stop() - # create and insert smth while safekeeper is down... - env.neon_cli.create_branch("test_late_init") + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_late_init") endpoint = env.endpoints.create_start("test_late_init") + # create and insert smth while safekeeper is down... 
endpoint.safe_psql("create table t(key int, value text)") - endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'") - log.info("insert with safekeeper down done") + with WalChangeLogger(endpoint, "doing insert with sk1 down"): + endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'") endpoint.stop() # stop compute # stop another safekeeper, and start one which missed timeline creation @@ -1024,28 +1058,213 @@ def test_late_init(neon_env_builder: NeonEnvBuilder): sk1.start() # insert some more - endpoint = env.endpoints.create_start("test_late_init") + with DurationLogger("recovery"): + endpoint = env.endpoints.create_start("test_late_init") endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'") + wait_flush_lsn_align_by_ep( + env, "test_late_init", tenant_id, timeline_id, endpoint, [sk1, env.safekeepers[2]] + ) + # Check that WALs are the same. + cmp_sk_wal([sk1, env.safekeepers[2]], tenant_id, timeline_id) + # is timeline flush_lsn equal on provided safekeepers? -def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id): - status1 = sk1_http_cli.timeline_status(tenant_id, timeline_id) - status2 = sk2_http_cli.timeline_status(tenant_id, timeline_id) - log.info( - f"waiting for flush_lsn alignment, sk1.flush_lsn={status1.flush_lsn}, sk2.flush_lsn={status2.flush_lsn}" +def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): + flush_lsns = [ + sk_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn + for sk_http_cli in sk_http_clis + ] + log.info(f"waiting for flush_lsn alignment, flush_lsns={flush_lsns}") + return all([flush_lsns[0] == flsn for flsn in flush_lsns]) + + +def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId): + status = sk_http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") + return len(status.walreceivers) == 0 + + +# Assert by xxd that WAL on given safekeepers is identical. No compute must be +# running for this to be reliable. +def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): + assert len(sks) >= 2, "cmp_sk_wal makes sense with >= 2 safekeepers passed" + sk_http_clis = [sk.http_client() for sk in sks] + + # First check that term / flush_lsn are the same: it is easier to + # report/understand if WALs are different due to that. + statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] + term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses] + for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): + assert ( + term_flush_lsns[0] == tfl + ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + + # check that WALs are identic. 
+    segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks]
+    for cmp_segs, sk in zip(segs[1:], sks[1:]):
+        assert (
+            segs[0] == cmp_segs
+        ), f"lists of segments on sks {sks[0].id} and {sk.id} are not identical: {segs[0]} and {cmp_segs}"
+    log.info(f"comparing segs {segs[0]}")
+
+    sk0 = sks[0]
+    for sk in sks[1:]:
+        (_, mismatch, not_regular) = filecmp.cmpfiles(
+            sk0.timeline_dir(tenant_id, timeline_id),
+            sk.timeline_dir(tenant_id, timeline_id),
+            segs[0],
+            shallow=False,
+        )
+        log.info(
+            f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
+        )
+
+        for f in mismatch:
+            f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f)
+            f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f)
+            stdout_filename = "{}.filediff".format(f2)
+
+            with open(stdout_filename, "w") as stdout_f:
+                subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
+                subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
+
+                cmd = "diff {}.hex {}.hex".format(f1, f2)
+                subprocess.run([cmd], stdout=stdout_f, shell=True)
+
+        assert (mismatch, not_regular) == (
+            [],
+            [],
+        ), f"WAL segs {f1} and {f2} on sks {sks[0].id} and {sk.id} are not identical"
+
+
+# Wait until flush_lsn on given sks becomes equal, assuming endpoint ep is
+# running. ep is stopped by this function. This is used in tests which check
+# binary equality of WAL segments on safekeepers, which is inherently racy as
+# shutting down the endpoint might write some WAL which can get to only one
+# safekeeper. So here we recheck flush_lsn again after ep shutdown and retry if
+# it has changed.
+def wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks):
+    sk_http_clis = [sk.http_client() for sk in sks]
+    # First wait for the alignment.
+    wait(
+        partial(is_flush_lsn_aligned, sk_http_clis, tenant_id, timeline_id),
+        "flush_lsn to get aligned",
    )
-    return status1.flush_lsn == status2.flush_lsn
+    ep.stop()  # then stop endpoint
+    # Even if there is no compute, there might be some in flight data; ensure
+    # all walreceivers die before rechecking.
+    for sk_http_cli in sk_http_clis:
+        wait(
+            partial(are_walreceivers_absent, sk_http_cli, tenant_id, timeline_id),
+            "walreceivers to be gone",
+        )
+    # Now recheck flush_lsn again and exit if it is good
+    if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id):
+        return
+    # Otherwise repeat.
+    log.info("flush_lsn changed during endpoint shutdown; retrying alignment")
+    ep = env.endpoints.create_start(branch)


-# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that
-# 1) walproposer can't recover node if it misses WAL written by previous computes, but
-# still starts up and functions normally if two other sks are ok.
-# 2) walproposer doesn't keep WAL after some threshold (pg_wal bloat is limited), but functions
-# normally if two other sks are ok.
-# 3) Lagged safekeeper can still recover by peer recovery.
-def test_one_sk_down(neon_env_builder: NeonEnvBuilder):
-    pass
+# Test behaviour with one safekeeper down and missing a lot of WAL, exercising
+# neon_walreader and checking that pg_wal never bloats. Namely, ensures that
+# compute doesn't keep much WAL for a lagging sk, but still can recover it with
+# neon_walreader, in two scenarios: a) WAL never existed on compute (it started
+# on basebackup LSN later than lagging sk position) though segment file exists
+# b) WAL had been recycled on it and segment file doesn't exist.
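+# (Sizing note for the pg_wal assertions below: WAL segments are 16 MB by default,
+# and the endpoints whose pg_wal size is checked run with min_wal_size = max_wal_size
+# = 32 MB, i.e. two segments, and wal_keep_size = 0, so pg_wal is expected to stay
+# around two segments plus a partial one, hence the bound
+#     assert ep.get_pg_wal_size() < 16 * 2.5  # ~40 MB
+# used after each batch of inserts.)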
+# +# Also checks along the way that whenever there are two sks alive, compute +# should be able to commit. +def test_lagging_sk(neon_env_builder: NeonEnvBuilder): + # inserts ~20MB of WAL, a bit more than a segment. + def fill_segment(ep): + ep.safe_psql("insert into t select generate_series(1, 180000), 'payload'") + + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + (sk1, sk2, sk3) = env.safekeepers + + # create and insert smth while safekeeper is down... + sk1.stop() + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_lagging_sk") + ep = env.endpoints.create_start("test_lagging_sk") + ep.safe_psql("create table t(key int, value text)") + # make small insert to be on the same segment + ep.safe_psql("insert into t select generate_series(1, 1000), 'payload'") + log.info("insert with safekeeper down done") + ep.stop() # stop compute + + # Stop another safekeeper, and start one which missed timeline creation. + sk2.stop() + sk1.start() + + # Start new ep and insert some more. neon_walreader should download WAL for + # sk1 because it should be filled since the horizon (initial LSN) which is + # earlier than basebackup LSN. + ep = env.endpoints.create_start("test_lagging_sk") + ep.safe_psql("insert into t select generate_series(1,100), 'payload'") + # stop ep and ensure WAL is identical after recovery. + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3]) + # Check that WALs are the same. + cmp_sk_wal([sk1, sk3], tenant_id, timeline_id) + + # Now repeat insertion with sk1 down, but with inserting more data to check + # that WAL on compute is removed. + sk1.stop() + sk2.start() + + # min_wal_size must be at least 2x segment size. + min_wal_config = [ + "min_wal_size=32MB", + "max_wal_size=32MB", + "wal_keep_size=0", + "log_checkpoints=on", + ] + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=min_wal_config, + ) + with WalChangeLogger(ep, "doing large insert with sk1 down"): + for _ in range(0, 5): + fill_segment(ep) + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert ep.get_pg_wal_size() < 16 * 2.5 + + sk2.stop() # stop another sk to ensure sk1 and sk3 can work + sk1.start() + with DurationLogger("recovery"): + ep.safe_psql("insert into t select generate_series(1,100), 'payload'") # forces recovery + # stop ep and ensure WAL is identical after recovery. + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk1, sk3]) + # Check that WALs are the same. + cmp_sk_wal([sk1, sk3], tenant_id, timeline_id) + + # Now do the same with different safekeeper sk2 down, and restarting ep + # before recovery (again scenario when recovery starts below basebackup_lsn, + # but multi segment now). + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], + ) + with WalChangeLogger(ep, "doing large insert with sk2 down"): + for _ in range(0, 5): + fill_segment(ep) + # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) + assert ep.get_pg_wal_size() < 16 * 2.5 + + ep.stop() + ep = env.endpoints.create_start( + "test_lagging_sk", + config_lines=min_wal_config, + ) + sk2.start() + with DurationLogger("recovery"): + wait_flush_lsn_align_by_ep(env, "test_lagging_sk", tenant_id, timeline_id, ep, [sk2, sk3]) + # Check that WALs are the same. 
+ cmp_sk_wal([sk1, sk2, sk3], tenant_id, timeline_id) # Smaller version of test_one_sk_down testing peer recovery in isolation: that @@ -1065,7 +1284,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): sk2_http_cli = sk2.http_client() # ensure tli gets created on sk1, peer recovery won't do that wait( - partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id), + partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id), "flush_lsn to get aligned", ) @@ -1087,7 +1306,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024 # wait a bit, lsns shouldn't change - # time.sleep(5) + time.sleep(2) sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id) sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id) log.info( @@ -1098,37 +1317,11 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): # now restart safekeeper with peer recovery enabled and wait for recovery sk1.stop().start(extra_opts=["--peer-recovery=true"]) wait( - partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id), + partial(is_flush_lsn_aligned, [sk1_http_cli, sk2_http_cli], tenant_id, timeline_id), "flush_lsn to get aligned", ) - # check that WALs are identic after recovery - segs = sk1.list_segments(tenant_id, timeline_id) - log.info(f"segs are {segs}") - - (_, mismatch, not_regular) = filecmp.cmpfiles( - sk1.timeline_dir(tenant_id, timeline_id), - sk2.timeline_dir(tenant_id, timeline_id), - segs, - shallow=False, - ) - log.info( - f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}" - ) - - for f in mismatch: - f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f) - f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f) - stdout_filename = "{}.filediff".format(f2) - - with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) - - cmd = "diff {}.hex {}.hex".format(f1, f2) - subprocess.run([cmd], stdout=stdout_f, shell=True) - - assert (mismatch, not_regular) == ([], []) + cmp_sk_wal([sk1, sk2], tenant_id, timeline_id) # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit env.safekeepers[2].stop() @@ -1364,60 +1557,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) -# We have `wal_keep_size=0`, so postgres should trim WAL once it's broadcasted -# to all safekeepers. This test checks that compute WAL can fit into small number -# of WAL segments. 
-def test_wal_deleted_after_broadcast(neon_env_builder: NeonEnvBuilder): - # used to calculate delta in collect_stats - last_lsn = Lsn(0) - - # returns pg_wal size in MB - def collect_stats(endpoint: Endpoint, cur, enable_logs=True): - nonlocal last_lsn - assert endpoint.pgdata_dir is not None - - log.info("executing INSERT to generate WAL") - current_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - pg_wal_size_mb = get_dir_size(os.path.join(endpoint.pgdata_dir, "pg_wal")) / 1024 / 1024 - if enable_logs: - lsn_delta_mb = (current_lsn - last_lsn) / 1024 / 1024 - log.info(f"LSN delta: {lsn_delta_mb} MB, current WAL size: {pg_wal_size_mb} MB") - last_lsn = current_lsn - return pg_wal_size_mb - - # generates about ~20MB of WAL, to create at least one new segment - def generate_wal(cur): - cur.execute("INSERT INTO t SELECT generate_series(1,300000), 'payload'") - - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - - env.neon_cli.create_branch("test_wal_deleted_after_broadcast") - # Adjust checkpoint config to prevent keeping old WAL segments - endpoint = env.endpoints.create_start( - "test_wal_deleted_after_broadcast", - config_lines=["min_wal_size=32MB", "max_wal_size=32MB", "log_checkpoints=on"], - ) - - pg_conn = endpoint.connect() - cur = pg_conn.cursor() - cur.execute("CREATE TABLE t(key int, value text)") - - collect_stats(endpoint, cur) - - # generate WAL to simulate normal workload - for _ in range(5): - generate_wal(cur) - collect_stats(endpoint, cur) - - log.info("executing checkpoint") - cur.execute("CHECKPOINT") - wal_size_after_checkpoint = collect_stats(endpoint, cur) - - # there shouldn't be more than 2 WAL segments (but dir may have archive_status files) - assert wal_size_after_checkpoint < 16 * 2.5 - - @pytest.mark.parametrize("auth_enabled", [False, True]) def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 0bb356aa0c..03358bb0b5 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 0bb356aa0cd1582112926fbcf0b5370222c2db6d +Subproject commit 03358bb0b5e0d33c238710139e768db9e75cfcc8 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 24333abb81..a2dc225ddf 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 24333abb81a9ecae4541019478f0bf7d0b289df7 +Subproject commit a2dc225ddfc8cae1849aa2316f435c58f0333d8c diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 863b71572b..225071f482 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 863b71572bc441581efb3bbee2ad18af037be1bb +Subproject commit 225071f482774943854c2eec4540757e01171557 diff --git a/vendor/revisions.json b/vendor/revisions.json index a9575a2cb7..def4eab069 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "863b71572bc441581efb3bbee2ad18af037be1bb", - "postgres-v15": "24333abb81a9ecae4541019478f0bf7d0b289df7", - "postgres-v14": "0bb356aa0cd1582112926fbcf0b5370222c2db6d" + "postgres-v16": "225071f482774943854c2eec4540757e01171557", + "postgres-v15": "a2dc225ddfc8cae1849aa2316f435c58f0333d8c", + "postgres-v14": "03358bb0b5e0d33c238710139e768db9e75cfcc8" } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 804405293f..68be0b3617 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -36,6 +36,7 @@ files: max_client_conn=10000 default_pool_size=64 
max_prepared_statements=0 + admin_users=cloud_admin - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 4621a75c0b..4f13064088 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -39,6 +39,7 @@ futures-executor = { version = "0.3" } futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } +getrandom = { version = "0.2", default-features = false, features = ["std"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } @@ -50,13 +51,14 @@ nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128"] } +once_cell = { version = "1" } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } -ring = { version = "0.16", features = ["std"] } +ring = { version = "0.16" } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } @@ -75,8 +77,8 @@ tracing-core = { version = "0.1" } tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4"] } -zstd = { version = "0.12" } -zstd-safe = { version = "6", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } +zstd = { version = "0.13" } +zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } [build-dependencies] @@ -84,11 +86,13 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } either = { version = "1" } +getrandom = { version = "0.2", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } +once_cell = { version = "1" } prost = { version = "0.11" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
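The vendor submodule bumps above are mirrored in vendor/revisions.json. A hypothetical consistency check (not taken from this patch) that a checkout with initialized submodules could run to keep the two in sync might look like:

import json
import subprocess


def check_vendor_revisions(repo_root: str = ".") -> None:
    # vendor/revisions.json maps e.g. "postgres-v16" to the expected submodule commit
    with open(f"{repo_root}/vendor/revisions.json") as f:
        expected = json.load(f)
    for name, want in expected.items():
        got = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=f"{repo_root}/vendor/{name}", text=True
        ).strip()
        assert got == want, f"vendor/{name} is at {got}, but revisions.json expects {want}"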