pageserver: work around #9185 in layer visibility calculation

pageserver: stabilize & refine controller scale test (#8971 )
## Problem We were seeing timeouts on migrations in this test. The test unfortunately tends to saturate local storage, which is shared between the pageservers and the control plane database, which makes the test kind of unrealistic. We will also want to increase the scale of this test, so it's worth fixing that. ## Summary of changes - Instead of randomly creating timelines at the same time as the other background operations, explicitly identify a subset of tenants which will have timelines, and create them at the start. This avoids pageservers putting a lot of load on the test node during the main body of the test. - Adjust the tenants created to create some number of 8 shard tenants and the rest 1 shard tenants, instead of just creating a lot of 2 shard tenants. - Use archival_config to exercise tenant-mutating operations, instead of using timeline creation for this. - Adjust reconcile_until_idle calls to avoid waiting 5 seconds between calls, which causes timeouts with large shard count tenants. - Fix a pageserver bug where calls to archival_config during activation get 404
2026-05-19 06:00:38 +00:00 · 2024-10-15 15:18:31 +00:00 · 2024-10-15 09:31:18 +01:00 · 2024-10-14 21:12:43 +01:00 · 2024-10-14 20:30:21 +02:00 · 2024-10-14 17:54:03 +02:00
138 changed files with 4223 additions and 1878 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -183,7 +183,7 @@ runs:
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry/virtualenvs
-        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
+        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

    - name: Store Allure test stat in the DB (new)
      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -88,7 +88,7 @@ runs:
      uses: actions/cache@v4
      with:
        path: ~/.cache/pypoetry/virtualenvs
-        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
+        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

    - name: Install Python deps
      shell: bash -euxo pipefail {0}
@@ -218,6 +218,9 @@ runs:
        name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
        # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
        path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
+        # The lack of compatibility snapshot shouldn't fail the job
+        # (for example if we didn't run the test for non build-and-test workflow)
+        skip-if-does-not-exist: true

    - name: Upload test results
      if: ${{ !cancelled() }}
--- a/.github/actions/upload/action.yml
+++ b/.github/actions/upload/action.yml
@@ -7,6 +7,10 @@ inputs:
  path:
    description: "A directory or file to upload"
    required: true
+  skip-if-does-not-exist:
+    description: "Allow to skip if path doesn't exist, fail otherwise"
+    default: false
+    required: false
  prefix:
    description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
    required: false
@@ -15,10 +19,12 @@ runs:
  using: "composite"
  steps:
    - name: Prepare artifact
+      id: prepare-artifact
      shell: bash -euxo pipefail {0}
      env:
        SOURCE: ${{ inputs.path }}
        ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
+        SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }}
      run: |
        mkdir -p $(dirname $ARCHIVE)

@@ -33,14 +39,22 @@ runs:
        elif [ -f ${SOURCE} ]; then
          time tar -cf ${ARCHIVE} --zstd ${SOURCE}
        elif ! ls ${SOURCE} > /dev/null 2>&1; then
-          echo >&2 "${SOURCE} does not exist"
-          exit 2
+          if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
+            echo 'SKIPPED=true' >> $GITHUB_OUTPUT
+            exit 0
+          else
+            echo >&2 "${SOURCE} does not exist"
+            exit 2
+          fi
        else
          echo >&2 "${SOURCE} is neither a directory nor a file, do not know how to handle it"
          exit 3
        fi

+        echo 'SKIPPED=false' >> $GITHUB_OUTPUT
+
    - name: Upload artifact
+      if: ${{ steps.prepare-artifact.outputs.SKIPPED == 'false' }}
      shell: bash -euxo pipefail {0}
      env:
        SOURCE: ${{ inputs.path }}
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -124,28 +124,28 @@ jobs:
        uses: actions/cache@v4
        with:
          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
        uses: actions/cache@v4
        with:
          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Cache postgres v16 build
        id: cache_pg_16
        uses: actions/cache@v4
        with:
          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Cache postgres v17 build
        id: cache_pg_17
        uses: actions/cache@v4
        with:
          path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -19,9 +19,16 @@ defaults:
  run:
    shell: bash -euo pipefail {0}

-concurrency:
-  group: build-build-tools-image-${{ inputs.image-tag }}
-  cancel-in-progress: false
+# The initial idea was to prevent the waste of resources by not re-building the `build-tools` image
+# for the same tag in parallel workflow runs, and queue them to be skipped once we have
+# the first image pushed to Docker registry, but GitHub's concurrency mechanism is not working as expected.
+# GitHub can't have more than 1 job in a queue and removes the previous one, which causes failures in the dependent jobs.
+#
+# Ref https://github.com/orgs/community/discussions/41518
+#
+# concurrency:
+#   group: build-build-tools-image-${{ inputs.image-tag }}
+#   cancel-in-progress: false

 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
@@ -36,6 +43,7 @@ jobs:

    strategy:
      matrix:
+        debian-version: [ bullseye, bookworm ]
        arch: [ x64, arm64 ]

    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -74,22 +82,22 @@ jobs:

      - uses: docker/build-push-action@v6
        with:
+          file: Dockerfile.build-tools
          context: .
          provenance: false
          push: true
          pull: true
-          file: Dockerfile.build-tools
-          cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
-          tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
+          build-args: |
+            DEBIAN_VERSION=${{ matrix.debian-version }}
+          cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian-version }}-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian-version, matrix.arch) || '' }}
+          tags: |
+            neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.debian-version }}-${{ matrix.arch }}

  merge-images:
    needs: [ build-image ]
    runs-on: ubuntu-22.04

-    env:
-      IMAGE_TAG: ${{ inputs.image-tag }}
-
    steps:
      - uses: docker/login-action@v3
        with:
@@ -97,7 +105,17 @@ jobs:
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

      - name: Create multi-arch image
+        env:
+          DEFAULT_DEBIAN_VERSION: bullseye
+          IMAGE_TAG: ${{ inputs.image-tag }}
        run: |
-          docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \
-                                             neondatabase/build-tools:${IMAGE_TAG}-x64 \
-                                             neondatabase/build-tools:${IMAGE_TAG}-arm64
+          for debian_version in bullseye bookworm; do
+            tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian_version}")
+            if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then
+              tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}")
+            fi
+
+            docker buildx imagetools create "${tags[@]}" \
+                                              neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-x64 \
+                                              neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-arm64
+          done
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -92,7 +92,7 @@ jobs:
    needs: [ check-permissions, build-build-tools-image ]
    runs-on: [ self-hosted, small ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -106,7 +106,7 @@ jobs:
        uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
-          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
+          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

      - name: Install Python deps
        run: ./scripts/pysync
@@ -181,7 +181,7 @@ jobs:
    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}

    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -193,16 +193,15 @@ jobs:
        with:
          submodules: true

-#      Disabled for now
-#      - name: Restore cargo deps cache
-#        id: cache_cargo
-#        uses: actions/cache@v4
-#        with:
-#          path: |
-#            !~/.cargo/registry/src
-#            ~/.cargo/git/
-#            target/
-#          key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
+      - name: Cache cargo deps
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            !~/.cargo/registry/src
+            ~/.cargo/git
+            target
+          key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust

      # Some of our rust modules use FFI and need those to be checked
      - name: Get postgres headers
@@ -262,7 +261,7 @@ jobs:
    uses: ./.github/workflows/_build-and-test-locally.yml
    with:
      arch: ${{ matrix.arch }}
-      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
@@ -277,7 +276,7 @@ jobs:
    needs: [ check-permissions, build-build-tools-image ]
    runs-on: [ self-hosted, small ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -290,7 +289,7 @@ jobs:
        uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
-          key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
+          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

      - name: Install Python deps
        run: ./scripts/pysync
@@ -310,7 +309,7 @@ jobs:
    needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
    runs-on: [ self-hosted, small ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -368,7 +367,7 @@ jobs:

    runs-on: [ self-hosted, small ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -416,7 +415,7 @@ jobs:
    needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
    runs-on: [ self-hosted, small ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -560,15 +559,16 @@ jobs:
            ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }}
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm
+            DEBIAN_VERSION=bookworm
          provenance: false
          push: true
          pull: true
          file: Dockerfile
-          cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }}
+          cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }}
          tags: |
-            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }}

  neon-image:
    needs: [ neon-image-arch, tag ]
@@ -583,8 +583,9 @@ jobs:
      - name: Create multi-arch image
        run: |
          docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64
+                                          -t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \
+                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \
+                                             neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64

      - uses: docker/login-action@v3
        with:
@@ -605,17 +606,16 @@ jobs:
        version:
          # Much data was already generated on old PG versions with bullseye's
          # libraries, the locales of which can cause data incompatibilities.
-          # However, new PG versions should check if they can be built on newer
-          # images, as that reduces the support burden of old and ancient
-          # distros.
+          # However, new PG versions should be built on newer images,
+          # as that reduces the support burden of old and ancient distros.
          - pg: v14
-            debian: bullseye-slim
+            debian: bullseye
          - pg: v15
-            debian: bullseye-slim
+            debian: bullseye
          - pg: v16
-            debian: bullseye-slim
+            debian: bullseye
          - pg: v17
-            debian: bookworm-slim
+            debian: bookworm
        arch: [ x64, arm64 ]

    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -660,16 +660,16 @@ jobs:
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            PG_VERSION=${{ matrix.version.pg }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
-            DEBIAN_FLAVOR=${{ matrix.version.debian }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
+            DEBIAN_VERSION=${{ matrix.version.debian }}
          provenance: false
          push: true
          pull: true
          file: compute/Dockerfile.compute-node
-          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }}
+          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
          tags: |
-            neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}

      - name: Build neon extensions test image
        if: matrix.version.pg == 'v16'
@@ -680,17 +680,17 @@ jobs:
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            PG_VERSION=${{ matrix.version.pg }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
-            DEBIAN_FLAVOR=${{ matrix.version.debian }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
+            DEBIAN_VERSION=${{ matrix.version.debian }}
          provenance: false
          push: true
          pull: true
          file: compute/Dockerfile.compute-node
          target: neon-pg-ext-test
-          cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }}
+          cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
          tags: |
-            neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
+            neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}

      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
@@ -705,14 +705,16 @@ jobs:
          build-args: |
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
-            DEBIAN_FLAVOR=${{ matrix.version.debian }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
+            DEBIAN_VERSION=${{ matrix.version.debian }}
          provenance: false
          push: true
          pull: true
          file: compute/Dockerfile.compute-node
+          cache-from: type=registry,ref=cache.neon.build/compute-tools-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
          tags: |
-            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}

  compute-node-image:
    needs: [ compute-node-image-arch, tag ]
@@ -720,7 +722,16 @@ jobs:

    strategy:
      matrix:
-        version: [ v14, v15, v16, v17 ]
+        version:
+          # see the comment for `compute-node-image-arch` job
+          - pg: v14
+            debian: bullseye
+          - pg: v15
+            debian: bullseye
+          - pg: v16
+            debian: bullseye
+          - pg: v17
+            debian: bookworm

    steps:
      - uses: docker/login-action@v3
@@ -730,23 +741,26 @@ jobs:

      - name: Create multi-arch compute-node image
        run: |
-          docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
+          docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
+                                          -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
+                                             neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
+                                             neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

      - name: Create multi-arch neon-test-extensions image
-        if: matrix.version == 'v16'
+        if: matrix.version.pg == 'v16'
        run: |
-          docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
+          docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
+                                          -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
+                                             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
+                                             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

      - name: Create multi-arch compute-tools image
-        if: matrix.version == 'v17'
+        if: matrix.version.pg == 'v16'
        run: |
          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \
-                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64
+                                          -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

      - uses: docker/login-action@v3
        with:
@@ -754,13 +768,13 @@ jobs:
          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          password: ${{ secrets.AWS_SECRET_KEY_DEV }}

-      - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR
+      - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR
        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-                                                                                neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}

      - name: Push multi-arch compute-tools image to ECR
-        if: matrix.version == 'v17'
+        if: matrix.version.pg == 'v16'
        run: |
          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
@@ -771,7 +785,16 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        version: [ v14, v15, v16, v17 ]
+        version:
+          # see the comment for `compute-node-image-arch` job
+          - pg: v14
+            debian: bullseye
+          - pg: v15
+            debian: bullseye
+          - pg: v16
+            debian: bullseye
+          - pg: v17
+            debian: bookworm
    env:
      VM_BUILDER_VERSION: v0.35.0

@@ -793,18 +816,18 @@ jobs:
      # it won't have the proper authentication (written at v0.6.0)
      - name: Pulling compute-node image
        run: |
-          docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+          docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}

      - name: Build vm image
        run: |
          ./vm-builder \
-            -spec=compute/vm-image-spec.yaml \
-            -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
-            -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+            -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
+            -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
+            -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}

      - name: Pushing vm-compute-node image
        run: |
-          docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+          docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}

  test-images:
    needs: [ check-permissions, tag, neon-image, compute-node-image ]
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -155,7 +155,7 @@ jobs:
      github.ref_name == 'main'
    runs-on: [ self-hosted, large ]
    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -55,7 +55,7 @@ jobs:
    runs-on: ubuntu-22.04

    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -150,7 +150,7 @@ jobs:
    runs-on: ubuntu-22.04

    container:
-      image: ${{ needs.build-build-tools-image.outputs.image }}
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -71,7 +71,6 @@ jobs:

    steps:
      - uses: docker/login-action@v3
-
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
@@ -94,8 +93,22 @@ jobs:
          az acr login --name=neoneastus2

      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
+        env:
+          DEFAULT_DEBIAN_VERSION: bullseye
        run: |
-          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
-                                          -t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
-                                          -t neondatabase/build-tools:${TO_TAG} \
-                                             neondatabase/build-tools:${FROM_TAG}
+          for debian_version in bullseye bookworm; do
+            tags=()
+
+            tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}")
+            tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}")
+            tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}")
+
+            if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then
+              tags+=("-t" "neondatabase/build-tools:${TO_TAG}")
+              tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}")
+              tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}")
+            fi
+
+            docker buildx imagetools create "${tags[@]}" \
+                                              neondatabase/build-tools:${FROM_TAG}-${debian_version}
+          done
--- a/.github/workflows/report-workflow-stats.yml
+++ b/.github/workflows/report-workflow-stats.yml
@@ -0,0 +1,41 @@
+name: Report Workflow Stats
+
+on:
+  workflow_run:
+    workflows:
+    - Add `external` label to issues and PRs created by external users
+    - Benchmarking
+    - Build and Test
+    - Build and Test Locally
+    - Build build-tools image
+    - Check Permissions
+    - Check build-tools image
+    - Check neon with extra platform builds
+    - Cloud Regression Test
+    - Create Release Branch
+    - Handle `approved-for-ci-run` label
+    - Lint GitHub Workflows
+    - Notify Slack channel about upcoming release
+    - Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
+    - Pin build-tools image
+    - Prepare benchmarking databases by restoring dumps
+    - Push images to ACR
+    - Test Postgres client libraries
+    - Trigger E2E Tests
+    - cleanup caches by a branch
+    types: [completed]
+
+jobs:
+  gh-workflow-stats:
+    name: Github Workflow Stats
+    runs-on: ubuntu-22.04
+    permissions:
+      actions: read
+    steps:
+    - name: Export GH Workflow Stats
+      uses: neondatabase/gh-workflow-stats-action@v0.1.4
+      with:
+        DB_URI: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
+        DB_TABLE: "gh_workflow_stats_neon"
+        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        GH_RUN_ID: ${{ github.event.workflow_run.id }}
--- a/1
+++ b/1
@@ -1,5 +1,6 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /storage_controller @neondatabase/storage
+/storage_scrubber @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
 /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
 /libs/remote_storage/ @neondatabase/storage
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1820,6 +1820,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47"
 dependencies = [
 "base16ct 0.2.0",
+ "base64ct",
 "crypto-bigint 0.5.5",
 "digest",
 "ff 0.13.0",
@@ -1829,6 +1830,8 @@ dependencies = [
 "pkcs8 0.10.2",
 "rand_core 0.6.4",
 "sec1 0.7.3",
+ "serde_json",
+ "serdect",
 "subtle",
 "zeroize",
 ]
@@ -4037,6 +4040,8 @@ dependencies = [
 "bytes",
 "fallible-iterator",
 "postgres-protocol",
+ "serde",
+ "serde_json",
 ]

 [[package]]
@@ -5256,6 +5261,7 @@ dependencies = [
 "der 0.7.8",
 "generic-array",
 "pkcs8 0.10.2",
+ "serdect",
 "subtle",
 "zeroize",
 ]
@@ -5510,6 +5516,16 @@ dependencies = [
 "syn 2.0.52",
 ]

+[[package]]
+name = "serdect"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a84f14a19e9a014bb9f4512488d9829a68e04ecabffb0f9904cd1ace94598177"
+dependencies = [
+ "base16ct 0.2.0",
+ "serde",
+]
+
 [[package]]
 name = "sha1"
 version = "0.10.5"
@@ -7302,6 +7318,7 @@ dependencies = [
 "num-traits",
 "once_cell",
 "parquet",
+ "postgres-types",
 "prettyplease",
 "proc-macro2",
 "prost",
@@ -7326,6 +7343,7 @@ dependencies = [
 "time",
 "time-macros",
 "tokio",
+ "tokio-postgres",
 "tokio-stream",
 "tokio-util",
 "toml_edit",
--- a/4
+++ b/4
@@ -7,6 +7,8 @@ ARG IMAGE=build-tools
 ARG TAG=pinned
 ARG DEFAULT_PG_VERSION=17
 ARG STABLE_PG_VERSION=16
+ARG DEBIAN_VERSION=bullseye
+ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim

 # Build Postgres
 FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
@@ -57,7 +59,7 @@ RUN set -e \

 # Build final image
 #
-FROM debian:bullseye-slim
+FROM debian:${DEBIAN_FLAVOR}
 ARG DEFAULT_PG_VERSION
 WORKDIR /data

--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -1,12 +1,7 @@
-FROM debian:bullseye-slim
+ARG DEBIAN_VERSION=bullseye

-# Use ARG as a build-time environment variable here to allow.
-# It's not supposed to be set outside.
-# Alternatively it can be obtained using the following command
-# ```
-# . /etc/os-release && echo "${VERSION_CODENAME}"
-# ```
-ARG DEBIAN_VERSION_CODENAME=bullseye
+FROM debian:${DEBIAN_VERSION}-slim
+ARG DEBIAN_VERSION

 # Add nonroot user
 RUN useradd -ms /bin/bash nonroot -b /home
@@ -42,14 +37,14 @@ RUN set -e \
        libseccomp-dev \
        libsqlite3-dev \
        libssl-dev \
-        libstdc++-10-dev \
+        $([ "${DEBIAN_VERSION}" = "bullseye" ] && echo libstdc++-10-dev || echo libstdc++-11-dev) \
        libtool \
        libxml2-dev \
        libxmlsec1-dev \
        libxxhash-dev \
        lsof \
        make \
-        netcat \
+        netcat-openbsd \
        net-tools \
        openssh-client \
        parallel \
@@ -78,7 +73,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
 # LLVM
 ENV LLVM_VERSION=18
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
-    && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
+    && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
    && apt update \
    && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
    && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
@@ -86,7 +81,7 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \

 # Install docker
 RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
-    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \
+    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \
    && apt update \
    && apt install -y docker-ce docker-ce-cli \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
--- a/compute/Dockerfile.compute-node
+++ b/compute/Dockerfile.compute-node
@@ -3,7 +3,8 @@ ARG REPOSITORY=neondatabase
 ARG IMAGE=build-tools
 ARG TAG=pinned
 ARG BUILD_TAG
-ARG DEBIAN_FLAVOR=bullseye-slim
+ARG DEBIAN_VERSION=bullseye
+ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim

 #########################################################################################
 #
@@ -11,20 +12,23 @@ ARG DEBIAN_FLAVOR=bullseye-slim
 #
 #########################################################################################
 FROM debian:$DEBIAN_FLAVOR AS build-deps
-ARG DEBIAN_FLAVOR
+ARG DEBIAN_VERSION

-RUN case $DEBIAN_FLAVOR in \
+RUN case $DEBIAN_VERSION in \
      # Version-specific installs for Bullseye (PG14-PG16):
      # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18.
      # Install newer version (3.25) from backports.
-      bullseye*) \
+      bullseye) \
        echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \
        VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \
      ;; \
      # Version-specific installs for Bookworm (PG17):
-      bookworm*) \
+      bookworm) \
        VERSION_INSTALLS="cmake"; \
      ;; \
+      *) \
+        echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \
+      ;; \
    esac && \
    apt update &&  \
    apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \
@@ -109,13 +113,30 @@ RUN apt update && \
    libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
    protobuf-c-compiler xsltproc

+
+# Postgis 3.5.0 requires SFCGAL 1.4+
+#
+# It would be nice to update all versions together, but we must solve the SFCGAL dependency first.
 # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
-RUN case "${PG_VERSION}" in "v17") \
-    mkdir -p /sfcgal && \
-    echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \
+# and also we must check backward compatibility with older versions of PostGIS.
+#
+# Use new version only for v17
+RUN case "${PG_VERSION}" in \
+    "v17") \
+        export SFCGAL_VERSION=1.4.1 \
+        export SFCGAL_CHECKSUM=1800c8a26241588f11cddcf433049e9b9aea902e923414d2ecef33a3295626c3 \
+    ;; \
+    "v14" | "v15" | "v16") \
+        export SFCGAL_VERSION=1.3.10 \
+        export SFCGAL_CHECKSUM=4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 \
+    ;; \
+    *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+    ;; \
    esac && \
-    wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
-    echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
+    mkdir -p /sfcgal && \
+    wget https://gitlab.com/sfcgal/SFCGAL/-/archive/v${SFCGAL_VERSION}/SFCGAL-v${SFCGAL_VERSION}.tar.gz -O SFCGAL.tar.gz && \
+    echo "${SFCGAL_CHECKSUM} SFCGAL.tar.gz" | sha256sum --check && \
    mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
    cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
    DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -123,15 +144,27 @@ RUN case "${PG_VERSION}" in "v17") \

 ENV PATH="/usr/local/pgsql/bin:$PATH"

-RUN case "${PG_VERSION}" in "v17") \
-    echo "Postgis doensn't yet support PG17 (needs 3.4.3, if not higher)" && exit 0;; \
+# Postgis 3.5.0 supports v17
+RUN case "${PG_VERSION}" in \
+    "v17") \
+        export POSTGIS_VERSION=3.5.0 \
+        export POSTGIS_CHECKSUM=ca698a22cc2b2b3467ac4e063b43a28413f3004ddd505bdccdd74c56a647f510 \
+    ;; \
+    "v14" | "v15" | "v16") \
+        export POSTGIS_VERSION=3.3.3 \
+        export POSTGIS_CHECKSUM=74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 \
+    ;; \
+    *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+    ;; \
    esac && \
-    wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \
-    echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \
+    wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \
+    echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    cd extensions/postgis && \
    make clean && \
@@ -152,11 +185,27 @@ RUN case "${PG_VERSION}" in "v17") \
    cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
    cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
+# Uses versioned libraries, e.g. libpgrouting-3.4,
+# and may introduce function signature changes between releases,
+# e.g. release 3.5.0 has a new signature for the pgr_dijkstra function
+#
+# Use new version only for v17
+# last release v3.6.2 - Mar 30, 2024
+RUN case "${PG_VERSION}" in \
+    "v17") \
+        export PGROUTING_VERSION=3.6.2 \
+        export PGROUTING_CHECKSUM=f4a1ed79d6f714e52548eca3bb8e5593c6745f1bde92eb5fb858efd8984dffa2 \
+    ;; \
+    "v14" | "v15" | "v16") \
+        export PGROUTING_VERSION=3.4.2 \
+        export PGROUTING_CHECKSUM=cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e \
+    ;; \
+    *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+    ;; \
    esac && \
-    wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
-    echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
+    wget https://github.com/pgRouting/pgrouting/archive/v${PGROUTING_VERSION}.tar.gz -O pgrouting.tar.gz && \
+    echo "${PGROUTING_CHECKSUM} pgrouting.tar.gz" | sha256sum --check && \
    mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \
    mkdir build && cd build && \
    cmake -DCMAKE_BUILD_TYPE=Release .. && \
@@ -215,10 +264,9 @@ FROM build-deps AS h3-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-        mkdir -p /h3/usr/ && \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
+# not version-specific
+# last release v4.1.0 - Jan 18, 2023
+RUN mkdir -p /h3/usr/ && \
    wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
    echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \
    mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \
@@ -229,10 +277,9 @@ RUN case "${PG_VERSION}" in "v17") \
    cp -R /h3/usr / && \
    rm -rf build

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
+# not version-specific
+# last release v4.1.3 - Jul 26, 2023
+RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
    echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
    mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
    export PATH="/usr/local/pgsql/bin:$PATH" && \
@@ -251,11 +298,10 @@ FROM build-deps AS unit-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
-    echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \
+# not version-specific
+# last release 7.9 - Sep 15, 2024
+RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \
+    echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \
    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -302,12 +348,10 @@ FROM build-deps AS pgjwt-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
-    echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \
+# not version-specific
+# doesn't use releases, last commit f3d82fd - Mar 2, 2023 
+RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \
+    echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \
    mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
@@ -342,10 +386,9 @@ FROM build-deps AS pg-hashids-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
+# not version-specific
+# last release v1.2.1 - Jan 12, 2018
+RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
    echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
    mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -405,10 +448,9 @@ FROM build-deps AS ip4r-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
+# not version-specific
+# last release v2.4.2 - Jul 29, 2023
+RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
    echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
    mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -425,10 +467,9 @@ FROM build-deps AS prefix-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
+# not version-specific
+# last release v1.2.10 - Jul 5, 2023
+RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
    echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
    mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -445,10 +486,9 @@ FROM build-deps AS hll-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
+# not version-specific
+# last release v2.18 - Aug 29, 2023
+RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
    echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
    mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -659,11 +699,10 @@ FROM build-deps AS pg-roaringbitmap-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+# not version-specific
+# last release v0.5.4 - Jun 28, 2022
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions is not supported yet by pg_roaringbitmap. Quit" && exit 0;; \
-    esac && \
-    wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
+RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -680,12 +719,27 @@ FROM build-deps AS pg-semver-pg-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+# Release 0.40.0 breaks backward compatibility with previous versions
+# see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0
+# Use new version only for v17
+#
+# last release v0.40.0 - Jul 22, 2024
 ENV PATH="/usr/local/pgsql/bin/:$PATH"
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 is not supported yet by pg_semver. Quit" && exit 0;; \
+RUN case "${PG_VERSION}" in \
+    "v17") \
+        export SEMVER_VERSION=0.40.0 \
+        export SEMVER_CHECKSUM=3e50bcc29a0e2e481e7b6d2bc937cadc5f5869f55d983b5a1aafeb49f5425cfc \
+    ;; \
+    "v14" | "v15" | "v16") \
+        export SEMVER_VERSION=0.32.1 \
+        export SEMVER_CHECKSUM=fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 \
+    ;; \
+    *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+    ;; \
    esac && \
-    wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \
-    echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \
+    wget https://github.com/theory/pg-semver/archive/refs/tags/v${SEMVER_VERSION}.tar.gz -O pg_semver.tar.gz && \
+    echo "${SEMVER_CHECKSUM} pg_semver.tar.gz" | sha256sum --check && \
    mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -1041,7 +1095,6 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de
 #########################################################################################

 FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
-ARG DEBIAN_FLAVOR

 COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl

@@ -1052,7 +1105,6 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compu
 #########################################################################################

 FROM debian:$DEBIAN_FLAVOR AS pgbouncer
-ARG DEBIAN_FLAVOR
 RUN set -e \
    && apt-get update \
    && apt-get install --no-install-recommends -y \
@@ -1207,7 +1259,7 @@ ENV PGDATABASE=postgres
 #
 #########################################################################################
 FROM debian:$DEBIAN_FLAVOR
-ARG DEBIAN_FLAVOR
+ARG DEBIAN_VERSION
 # Add user postgres
 RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
    echo "postgres:test_console_pass" | chpasswd && \
@@ -1255,19 +1307,22 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca


 RUN apt update && \
-    case $DEBIAN_FLAVOR in \
+    case $DEBIAN_VERSION in \
      # Version-specific installs for Bullseye (PG14-PG16):
      # libicu67, locales for collations (including ICU and plpgsql_check)
      # libgdal28, libproj19 for PostGIS
-      bullseye*) \
+      bullseye) \
        VERSION_INSTALLS="libicu67 libgdal28 libproj19"; \
      ;; \
      # Version-specific installs for Bookworm (PG17):
      # libicu72, locales for collations (including ICU and plpgsql_check)
      # libgdal32, libproj25 for PostGIS
-      bookworm*) \
+      bookworm) \
        VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \
      ;; \
+      *) \
+        echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \
+      ;; \
    esac && \
    apt install --no-install-recommends -y \
        gdb \
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -0,0 +1,126 @@
+# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image.
+---
+commands:
+  - name: cgconfigparser
+    user: root
+    sysvInitAction: sysinit
+    shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
+  # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for
+  # running it as root.
+  - name: chmod-resize-swap
+    user: root
+    sysvInitAction: sysinit
+    shell: 'chmod 711 /neonvm/bin/resize-swap'
+  - name: chmod-set-disk-quota
+    user: root
+    sysvInitAction: sysinit
+    shell: 'chmod 711 /neonvm/bin/set-disk-quota'
+  - name: pgbouncer
+    user: postgres
+    sysvInitAction: respawn
+    shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
+  - name: local_proxy
+    user: postgres
+    sysvInitAction: respawn
+    shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
+  - name: postgres-exporter
+    user: nobody
+    sysvInitAction: respawn
+    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
+  - name: sql-exporter
+    user: nobody
+    sysvInitAction: respawn
+    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399'
+  - name: sql-exporter-autoscaling
+    user: nobody
+    sysvInitAction: respawn
+    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
+shutdownHook: |
+  su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
+files:
+  - filename: compute_ctl-sudoers
+    content: |
+      # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
+      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
+      # regardless of hostname (ALL)
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
+  - filename: cgconfig.conf
+    content: |
+      # Configuration for cgroups in VM compute nodes
+      group neon-postgres {
+          perm {
+              admin {
+                  uid = postgres;
+              }
+              task {
+                  gid = users;
+              }
+          }
+          memory {}
+      }
+build: |
+  # Build cgroup-tools
+  #
+  # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
+  # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
+  # requires cgroup v2, so we'll build cgroup-tools ourselves.
+  #
+  # At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2,
+  # and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset
+  # for debian version migration.
+  #
+  FROM debian:bookworm-slim as libcgroup-builder
+  ENV LIBCGROUP_VERSION=v2.0.3
+
+  RUN set -exu \
+      && apt update \
+      && apt install --no-install-recommends -y \
+          git \
+          ca-certificates \
+          automake \
+          cmake \
+          make \
+          gcc \
+          byacc \
+          flex \
+          libtool \
+          libpam0g-dev \
+      && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
+      && INSTALL_DIR="/libcgroup-install" \
+      && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
+      && cd libcgroup \
+      # extracted from bootstrap.sh, with modified flags:
+      && (test -d m4 || mkdir m4) \
+      && autoreconf -fi \
+      && rm -rf autom4te.cache \
+      && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
+      # actually build the thing...
+      && make install
+merge: |
+  # tweak nofile limits
+  RUN set -e \
+      && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \
+      && test ! -e /etc/security || ( \
+         echo '*    - nofile 1048576' >>/etc/security/limits.conf \
+      && echo 'root - nofile 1048576' >>/etc/security/limits.conf \
+         )
+
+  # Allow postgres user (compute_ctl) to run swap resizer.
+  # Need to install sudo in order to allow this.
+  #
+  # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe.
+  RUN set -e \
+      && apt update \
+      && apt install --no-install-recommends -y \
+             sudo \
+      && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+  COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers
+
+  COPY cgconfig.conf /etc/cgconfig.conf
+
+  RUN set -e \
+      && chmod 0644 /etc/cgconfig.conf
+
+  COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
+  COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
+  COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1484,6 +1484,28 @@ LIMIT 100",
            info!("Pageserver config changed");
        }
    }
+
+    // Gather info about installed extensions and log it as JSON; failures panic (expect), Err is never returned
+    pub fn get_installed_extensions(&self) -> Result<()> {
+        let connstr = self.connstr.clone();
+
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .expect("failed to create runtime");
+        let result = rt
+            .block_on(crate::installed_extensions::get_installed_extensions(
+                connstr,
+            ))
+            .expect("failed to get installed extensions");
+
+        info!(
+            "{}",
+            serde_json::to_string(&result).expect("failed to serialize extensions list")
+        );
+
+        Ok(())
+    }
 }

 pub fn forward_termination_signal() {
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -165,6 +165,32 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

+        // get the list of installed extensions
+        // currently only used in python tests
+        // TODO: call it from cplane
+        (&Method::GET, "/installed_extensions") => {
+            info!("serving /installed_extensions GET request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!(
+                    "invalid compute status for extensions request: {:?}",
+                    status
+                );
+                error!(msg);
+                return Response::new(Body::from(msg));
+            }
+
+            let connstr = compute.connstr.clone();
+            let res = crate::installed_extensions::get_installed_extensions(connstr).await;
+            match res {
+                Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
+                Err(e) => render_json_error(
+                    &format!("could not get list of installed extensions: {}", e),
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                ),
+            }
+        }
+
        // download extension files from remote extension storage on demand
        (&Method::POST, route) if route.starts_with("/extension_server/") => {
            info!("serving {:?} POST request", route);
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -53,6 +53,20 @@ paths:
              schema:
                $ref: "#/components/schemas/ComputeInsights"

+  /installed_extensions:
+    get:
+      tags:
+      - Info
+      summary: Get installed extensions.
+      description: ""
+      operationId: getInstalledExtensions
+      responses:
+        200:
+          description: List of installed extensions
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/InstalledExtensions"
  /info:
    get:
      tags:
@@ -395,6 +409,24 @@ components:
        - configuration
      example: running

+    InstalledExtensions:
+      type: object
+      properties:
+        extensions:
+          description: Contains list of installed extensions.
+          type: array
+          items:
+            type: object
+            properties:
+              extname:
+                type: string
+              versions:
+                type: array
+                items:
+                  type: string
+              n_databases:
+                type: integer
+
    #
    # Errors
    #
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -0,0 +1,80 @@
+use compute_api::responses::{InstalledExtension, InstalledExtensions};
+use std::collections::HashMap;
+use std::collections::HashSet;
+use url::Url;
+
+use anyhow::Result;
+use postgres::{Client, NoTls};
+use tokio::task;
+
+/// Returns the names of the databases to inspect for installed extensions.
+///
+/// get_existing_dbs() is intentionally not reused: a dedicated query keeps the
+/// database listing explicit. At most 500 databases are returned to avoid
+/// excessive load.
+fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
+    // `pg_database.datconnlimit = -2` means that the database is in the
+    // invalid state
+    let rows = client.query(
+        "SELECT datname FROM pg_catalog.pg_database
+                WHERE datallowconn
+                AND datconnlimit <> - 2
+                LIMIT 500",
+        &[],
+    )?;
+
+    // Pull the single `datname` column out of every returned row.
+    let mut names: Vec<String> = Vec::with_capacity(rows.len());
+    for row in &rows {
+        names.push(row.get("datname"));
+    }
+
+    Ok(names)
+}
+
+/// Connect to every database (see list_dbs above) and get the list of installed extensions.
+/// The same extension can be installed in multiple databases with different versions;
+/// every distinct version seen across all databases is recorded.
+pub async fn get_installed_extensions(mut connstr: Url) -> Result<InstalledExtensions> {
+    // `Client` calls below block the thread, so run them on the blocking pool.
+    task::spawn_blocking(move || {
+        let mut client = Client::connect(connstr.as_str(), NoTls)?;
+        let databases: Vec<String> = list_dbs(&mut client)?;
+
+        let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
+        for db in databases.iter() {
+            // Point the connection string at this database before connecting.
+            connstr.set_path(db);
+            let mut db_client = Client::connect(connstr.as_str(), NoTls)?;
+            let extensions: Vec<(String, String)> = db_client
+                .query(
+                    "SELECT extname, extversion FROM pg_catalog.pg_extension;",
+                    &[],
+                )?
+                .iter()
+                .map(|row| (row.get("extname"), row.get("extversion")))
+                .collect();
+
+            for (extname, v) in extensions.iter() {
+                let version = v.to_string();
+                extensions_map
+                    .entry(extname.to_string())
+                    .and_modify(|e| {
+                        e.versions.insert(version.clone());
+                        // count the number of databases where the extension is installed
+                        e.n_databases += 1;
+                    })
+                    .or_insert(InstalledExtension {
+                        extname: extname.to_string(),
+                        versions: HashSet::from([version.clone()]),
+                        n_databases: 1,
+                    });
+            }
+        }
+
+        Ok(InstalledExtensions {
+            extensions: extensions_map.into_values().collect(),
+        })
+    })
+    .await?
+}
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -15,6 +15,7 @@ pub mod catalog;
 pub mod compute;
 pub mod disk_quota;
 pub mod extension_server;
+pub mod installed_extensions;
 pub mod local_proxy;
 pub mod lsn_lease;
 mod migration;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -97,7 +97,21 @@ impl ComputeControlPlane {
        for endpoint_dir in std::fs::read_dir(env.endpoints_path())
            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
-            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?;
+            let ep_res = Endpoint::from_dir_entry(endpoint_dir?, &env);
+            let ep = match ep_res {
+                Ok(ep) => ep,
+                Err(e) => match e.downcast::<std::io::Error>() {
+                    Ok(e) => {
+                        // A parallel task could delete an endpoint while we have just scanned the directory
+                        if e.kind() == std::io::ErrorKind::NotFound {
+                            continue;
+                        } else {
+                            Err(e)?
+                        }
+                    }
+                    Err(e) => Err(e)?,
+                },
+            };
            endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
        }

--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -1,5 +1,6 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.

+use std::collections::HashSet;
 use std::fmt::Display;

 use chrono::{DateTime, Utc};
@@ -155,3 +156,15 @@ pub enum ControlPlaneComputeStatus {
    // should be able to start with provided spec.
    Attached,
 }
+
+#[derive(Clone, Debug, Default, Serialize)]
+pub struct InstalledExtension {
+    pub extname: String,
+    pub versions: HashSet<String>,
+    pub n_databases: u32, // Number of databases using this extension
+}
+
+#[derive(Clone, Debug, Default, Serialize)]
+pub struct InstalledExtensions {
+    pub extensions: Vec<InstalledExtension>,
+}
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -104,8 +104,7 @@ pub struct ConfigToml {
    pub image_compression: ImageCompressionAlgorithm,
    pub ephemeral_bytes_per_memory_kb: usize,
    pub l0_flush: Option<crate::models::L0FlushConfig>,
-    pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
-    pub io_buffer_alignment: usize,
+    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -388,10 +387,7 @@ impl Default for ConfigToml {
            image_compression: (DEFAULT_IMAGE_COMPRESSION),
            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: None,
-            virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
-
-            io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
-
+            virtual_file_io_mode: None,
            tenant_config: TenantConfigToml::default(),
        }
    }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -972,8 +972,6 @@ pub struct TopTenantShardsResponse {
 }

 pub mod virtual_file {
-    use std::path::PathBuf;
-
    #[derive(
        Copy,
        Clone,
@@ -994,50 +992,45 @@ pub mod virtual_file {
    }

    /// Direct IO modes for a pageserver.
-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-    pub enum DirectIoMode {
-        /// Direct IO disabled (uses usual buffered IO).
-        #[default]
-        Disabled,
-        /// Direct IO disabled (performs checks and perf simulations).
-        Evaluate {
-            /// Alignment check level
-            alignment_check: DirectIoAlignmentCheckLevel,
-            /// Latency padded for performance simulation.
-            latency_padding: DirectIoLatencyPadding,
-        },
-        /// Direct IO enabled.
-        Enabled {
-            /// Actions to perform on alignment error.
-            on_alignment_error: DirectIoOnAlignmentErrorAction,
-        },
+    #[derive(
+        Copy,
+        Clone,
+        PartialEq,
+        Eq,
+        Hash,
+        strum_macros::EnumString,
+        strum_macros::Display,
+        serde_with::DeserializeFromStr,
+        serde_with::SerializeDisplay,
+        Debug,
+    )]
+    #[strum(serialize_all = "kebab-case")]
+    #[repr(u8)]
+    pub enum IoMode {
+        /// Uses buffered IO.
+        Buffered,
+        /// Uses direct IO, error out if the operation fails.
+        #[cfg(target_os = "linux")]
+        Direct,
    }

-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(rename_all = "kebab-case")]
-    pub enum DirectIoAlignmentCheckLevel {
-        #[default]
-        Error,
-        Log,
-        None,
+    impl IoMode {
+        pub const fn preferred() -> Self {
+            Self::Buffered
+        }
    }

-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(rename_all = "kebab-case")]
-    pub enum DirectIoOnAlignmentErrorAction {
-        Error,
-        #[default]
-        FallbackToBuffered,
-    }
+    impl TryFrom<u8> for IoMode {
+        type Error = u8;

-    #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)]
-    #[serde(tag = "type", rename_all = "kebab-case")]
-    pub enum DirectIoLatencyPadding {
-        /// Pad virtual file operations with IO to a fake file.
-        FakeFileRW { path: PathBuf },
-        #[default]
-        None,
+        fn try_from(value: u8) -> Result<Self, Self::Error> {
+            Ok(match value {
+                v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
+                #[cfg(target_os = "linux")]
+                v if v == (IoMode::Direct as u8) => IoMode::Direct,
+                x => return Err(x),
+            })
+        }
    }
 }

--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -496,26 +496,12 @@ impl RemoteStorage for AzureBlobStorage {
            builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string()))
        }

-        self.download_for_builder(builder, cancel).await
-    }
-
-    async fn download_byte_range(
-        &self,
-        from: &RemotePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-        cancel: &CancellationToken,
-    ) -> Result<Download, DownloadError> {
-        let blob_client = self.client.blob_client(self.relative_path_to_name(from));
-
-        let mut builder = blob_client.get();
-
-        let range: Range = if let Some(end_exclusive) = end_exclusive {
-            (start_inclusive..end_exclusive).into()
-        } else {
-            (start_inclusive..).into()
-        };
-        builder = builder.range(range);
+        if let Some((start, end)) = opts.byte_range() {
+            builder = builder.range(match end {
+                Some(end) => Range::Range(start..end),
+                None => Range::RangeFrom(start..),
+            });
+        }

        self.download_for_builder(builder, cancel).await
    }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -19,7 +19,8 @@ mod simulate_failures;
 mod support;

 use std::{
-    collections::HashMap, fmt::Debug, num::NonZeroU32, pin::Pin, sync::Arc, time::SystemTime,
+    collections::HashMap, fmt::Debug, num::NonZeroU32, ops::Bound, pin::Pin, sync::Arc,
+    time::SystemTime,
 };

 use anyhow::Context;
@@ -162,11 +163,60 @@ pub struct Listing {
 }

 /// Options for downloads. The default value is a plain GET.
-#[derive(Default)]
 pub struct DownloadOpts {
    /// If given, returns [`DownloadError::Unmodified`] if the object still has
    /// the same ETag (using If-None-Match).
    pub etag: Option<Etag>,
+    /// The start of the byte range to download, or unbounded.
+    pub byte_start: Bound<u64>,
+    /// The end of the byte range to download, or unbounded. Must be after the
+    /// start bound.
+    pub byte_end: Bound<u64>,
+}
+
+impl Default for DownloadOpts {
+    fn default() -> Self {
+        Self {
+            etag: Default::default(),
+            byte_start: Bound::Unbounded,
+            byte_end: Bound::Unbounded,
+        }
+    }
+}
+
+impl DownloadOpts {
+    /// Returns the byte range with inclusive start and exclusive end, or None
+    /// if unbounded.
+    pub fn byte_range(&self) -> Option<(u64, Option<u64>)> {
+        if self.byte_start == Bound::Unbounded && self.byte_end == Bound::Unbounded {
+            return None;
+        }
+        let start = match self.byte_start {
+            Bound::Excluded(i) => i + 1,
+            Bound::Included(i) => i,
+            Bound::Unbounded => 0,
+        };
+        let end = match self.byte_end {
+            Bound::Excluded(i) => Some(i),
+            Bound::Included(i) => Some(i + 1),
+            Bound::Unbounded => None,
+        };
+        if let Some(end) = end {
+            assert!(start < end, "range end {end} at or before start {start}");
+        }
+        Some((start, end))
+    }
+
+    /// Returns the byte range as an RFC 2616 Range header value with inclusive
+    /// bounds, or None if unbounded.
+    pub fn byte_range_header(&self) -> Option<String> {
+        self.byte_range()
+            .map(|(start, end)| (start, end.map(|end| end - 1))) // make end inclusive
+            .map(|(start, end)| match end {
+                Some(end) => format!("bytes={start}-{end}"),
+                None => format!("bytes={start}-"),
+            })
+    }
 }

 /// Storage (potentially remote) API to manage its state.
@@ -257,21 +307,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError>;

-    /// Streams a given byte range of the remote storage entry contents.
-    ///
-    /// The returned download stream will obey initial timeout and cancellation signal by erroring
-    /// on whichever happens first. Only one of the reasons will fail the stream, which is usually
-    /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
-    ///
-    /// Returns the metadata, if any was stored with the file previously.
-    async fn download_byte_range(
-        &self,
-        from: &RemotePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-        cancel: &CancellationToken,
-    ) -> Result<Download, DownloadError>;
-
    /// Delete a single path from remote storage.
    ///
    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
@@ -425,33 +460,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

-    pub async fn download_byte_range(
-        &self,
-        from: &RemotePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-        cancel: &CancellationToken,
-    ) -> Result<Download, DownloadError> {
-        match self {
-            Self::LocalFs(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
-                    .await
-            }
-            Self::AwsS3(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
-                    .await
-            }
-            Self::AzureBlob(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
-                    .await
-            }
-            Self::Unreliable(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
-                    .await
-            }
-        }
-    }
-
    /// See [`RemoteStorage::delete`]
    pub async fn delete(
        &self,
@@ -573,20 +581,6 @@ impl GenericRemoteStorage {
            })
    }

-    /// Downloads the storage object into the `to_path` provided.
-    /// `byte_range` could be specified to dowload only a part of the file, if needed.
-    pub async fn download_storage_object(
-        &self,
-        byte_range: Option<(u64, Option<u64>)>,
-        from: &RemotePath,
-        cancel: &CancellationToken,
-    ) -> Result<Download, DownloadError> {
-        match byte_range {
-            Some((start, end)) => self.download_byte_range(from, start, end, cancel).await,
-            None => self.download(from, &DownloadOpts::default(), cancel).await,
-        }
-    }
-
    /// The name of the bucket/container/etc.
    pub fn bucket_name(&self) -> Option<&str> {
        match self {
@@ -660,6 +654,76 @@ impl ConcurrencyLimiter {
 mod tests {
    use super::*;

+    /// DownloadOpts::byte_range() should generate (inclusive, exclusive) ranges
+    /// with optional end bound, or None when unbounded.
+    #[test]
+    fn download_opts_byte_range() {
+        // Consider using test_case or a similar table-driven test framework.
+        let cases = [
+            // (byte_start, byte_end, expected)
+            (Bound::Unbounded, Bound::Unbounded, None),
+            (Bound::Unbounded, Bound::Included(7), Some((0, Some(8)))),
+            (Bound::Unbounded, Bound::Excluded(7), Some((0, Some(7)))),
+            (Bound::Included(3), Bound::Unbounded, Some((3, None))),
+            (Bound::Included(3), Bound::Included(7), Some((3, Some(8)))),
+            (Bound::Included(3), Bound::Excluded(7), Some((3, Some(7)))),
+            (Bound::Excluded(3), Bound::Unbounded, Some((4, None))),
+            (Bound::Excluded(3), Bound::Included(7), Some((4, Some(8)))),
+            (Bound::Excluded(3), Bound::Excluded(7), Some((4, Some(7)))),
+            // 1-sized ranges are fine, 0 aren't and will panic (separate test).
+            (Bound::Included(3), Bound::Included(3), Some((3, Some(4)))),
+            (Bound::Included(3), Bound::Excluded(4), Some((3, Some(4)))),
+        ];
+
+        for (byte_start, byte_end, expect) in cases {
+            let opts = DownloadOpts {
+                byte_start,
+                byte_end,
+                ..Default::default()
+            };
+            let result = opts.byte_range();
+            assert_eq!(
+                result, expect,
+                "byte_start={byte_start:?} byte_end={byte_end:?}"
+            );
+
+            // Check generated HTTP header, which uses an inclusive range.
+            let expect_header = expect.map(|(start, end)| match end {
+                Some(end) => format!("bytes={start}-{}", end - 1), // inclusive end
+                None => format!("bytes={start}-"),
+            });
+            assert_eq!(
+                opts.byte_range_header(),
+                expect_header,
+                "byte_start={byte_start:?} byte_end={byte_end:?}"
+            );
+        }
+    }
+
+    /// DownloadOpts::byte_range() zero-sized byte range should panic.
+    #[test]
+    #[should_panic]
+    fn download_opts_byte_range_zero() {
+        DownloadOpts {
+            byte_start: Bound::Included(3),
+            byte_end: Bound::Excluded(3),
+            ..Default::default()
+        }
+        .byte_range();
+    }
+
+    /// DownloadOpts::byte_range() negative byte range should panic.
+    #[test]
+    #[should_panic]
+    fn download_opts_byte_range_negative() {
+        DownloadOpts {
+            byte_start: Bound::Included(3),
+            byte_end: Bound::Included(2),
+            ..Default::default()
+        }
+        .byte_range();
+    }
+
    #[test]
    fn test_object_name() {
        let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap();
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -506,54 +506,7 @@ impl RemoteStorage for LocalFs {
            return Err(DownloadError::Unmodified);
        }

-        let source = ReaderStream::new(
-            fs::OpenOptions::new()
-                .read(true)
-                .open(&target_path)
-                .await
-                .with_context(|| {
-                    format!("Failed to open source file {target_path:?} to use in the download")
-                })
-                .map_err(DownloadError::Other)?,
-        );
-
-        let metadata = self
-            .read_storage_metadata(&target_path)
-            .await
-            .map_err(DownloadError::Other)?;
-
-        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
-        let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
-
-        Ok(Download {
-            metadata,
-            last_modified: file_metadata
-                .modified()
-                .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
-            etag,
-            download_stream: Box::pin(source),
-        })
-    }
-
-    async fn download_byte_range(
-        &self,
-        from: &RemotePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-        cancel: &CancellationToken,
-    ) -> Result<Download, DownloadError> {
-        if let Some(end_exclusive) = end_exclusive {
-            if end_exclusive <= start_inclusive {
-                return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) is not less than end_exclusive ({end_exclusive:?})")));
-            };
-            if start_inclusive == end_exclusive.saturating_sub(1) {
-                return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
-            }
-        }
-
-        let target_path = from.with_base(&self.storage_root);
-        let file_metadata = file_metadata(&target_path).await?;
-        let mut source = tokio::fs::OpenOptions::new()
+        let mut file = fs::OpenOptions::new()
            .read(true)
            .open(&target_path)
            .await
@@ -562,31 +515,29 @@ impl RemoteStorage for LocalFs {
            })
            .map_err(DownloadError::Other)?;

-        let len = source
-            .metadata()
-            .await
-            .context("query file length")
-            .map_err(DownloadError::Other)?
-            .len();
+        let mut take = file_metadata.len();
+        if let Some((start, end)) = opts.byte_range() {
+            if start > 0 {
+                file.seek(io::SeekFrom::Start(start))
+                    .await
+                    .context("Failed to seek to the range start in a local storage file")
+                    .map_err(DownloadError::Other)?;
+            }
+            if let Some(end) = end {
+                take = end - start;
+            }
+        }

-        source
-            .seek(io::SeekFrom::Start(start_inclusive))
-            .await
-            .context("Failed to seek to the range start in a local storage file")
-            .map_err(DownloadError::Other)?;
+        let source = ReaderStream::new(file.take(take));

        let metadata = self
            .read_storage_metadata(&target_path)
            .await
            .map_err(DownloadError::Other)?;

-        let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
-        let source = ReaderStream::new(source);
-
        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
        let source = crate::support::DownloadStream::new(cancel_or_timeout, source);

-        let etag = mock_etag(&file_metadata);
        Ok(Download {
            metadata,
            last_modified: file_metadata
@@ -688,7 +639,7 @@ mod fs_tests {
    use super::*;

    use camino_tempfile::tempdir;
-    use std::{collections::HashMap, io::Write};
+    use std::{collections::HashMap, io::Write, ops::Bound};

    async fn read_and_check_metadata(
        storage: &LocalFs,
@@ -804,10 +755,12 @@ mod fs_tests {
        let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);

        let first_part_download = storage
-            .download_byte_range(
+            .download(
                &upload_target,
-                0,
-                Some(first_part_local.len() as u64),
+                &DownloadOpts {
+                    byte_end: Bound::Excluded(first_part_local.len() as u64),
+                    ..Default::default()
+                },
                &cancel,
            )
            .await?;
@@ -823,10 +776,15 @@ mod fs_tests {
        );

        let second_part_download = storage
-            .download_byte_range(
+            .download(
                &upload_target,
-                first_part_local.len() as u64,
-                Some((first_part_local.len() + second_part_local.len()) as u64),
+                &DownloadOpts {
+                    byte_start: Bound::Included(first_part_local.len() as u64),
+                    byte_end: Bound::Excluded(
+                        (first_part_local.len() + second_part_local.len()) as u64,
+                    ),
+                    ..Default::default()
+                },
                &cancel,
            )
            .await?;
@@ -842,7 +800,14 @@ mod fs_tests {
        );

        let suffix_bytes = storage
-            .download_byte_range(&upload_target, 13, None, &cancel)
+            .download(
+                &upload_target,
+                &DownloadOpts {
+                    byte_start: Bound::Included(13),
+                    ..Default::default()
+                },
+                &cancel,
+            )
            .await?
            .download_stream;
        let suffix_bytes = aggregate(suffix_bytes).await?;
@@ -850,7 +815,7 @@ mod fs_tests {
        assert_eq!(upload_name, suffix);

        let all_bytes = storage
-            .download_byte_range(&upload_target, 0, None, &cancel)
+            .download(&upload_target, &DownloadOpts::default(), &cancel)
            .await?
            .download_stream;
        let all_bytes = aggregate(all_bytes).await?;
@@ -861,48 +826,26 @@ mod fs_tests {
    }

    #[tokio::test]
-    async fn download_file_range_negative() -> anyhow::Result<()> {
-        let (storage, cancel) = create_storage()?;
+    #[should_panic(expected = "at or before start")]
+    async fn download_file_range_negative() {
+        let (storage, cancel) = create_storage().unwrap();
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;
+        let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel)
+            .await
+            .unwrap();

-        let start = 1_000_000_000;
-        let end = start + 1;
-        match storage
-            .download_byte_range(
+        storage
+            .download(
                &upload_target,
-                start,
-                Some(end), // exclusive end
+                &DownloadOpts {
+                    byte_start: Bound::Included(10),
+                    byte_end: Bound::Excluded(10),
+                    ..Default::default()
+                },
                &cancel,
            )
            .await
-        {
-            Ok(_) => panic!("Should not allow downloading wrong ranges"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("zero bytes"));
-                assert!(error_string.contains(&start.to_string()));
-                assert!(error_string.contains(&end.to_string()));
-            }
-        }
-
-        let start = 10000;
-        let end = 234;
-        assert!(start > end, "Should test an incorrect range");
-        match storage
-            .download_byte_range(&upload_target, start, Some(end), &cancel)
-            .await
-        {
-            Ok(_) => panic!("Should not allow downloading wrong ranges"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("Invalid range"));
-                assert!(error_string.contains(&start.to_string()));
-                assert!(error_string.contains(&end.to_string()));
-            }
-        }
-
-        Ok(())
+            .unwrap();
    }

    #[tokio::test]
@@ -945,10 +888,12 @@ mod fs_tests {
        let (first_part_local, _) = uploaded_bytes.split_at(3);

        let partial_download_with_metadata = storage
-            .download_byte_range(
+            .download(
                &upload_target,
-                0,
-                Some(first_part_local.len() as u64),
+                &DownloadOpts {
+                    byte_end: Bound::Excluded(first_part_local.len() as u64),
+                    ..Default::default()
+                },
                &cancel,
            )
            .await?;
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -804,34 +804,7 @@ impl RemoteStorage for S3Bucket {
                bucket: self.bucket_name.clone(),
                key: self.relative_path_to_s3_object(from),
                etag: opts.etag.as_ref().map(|e| e.to_string()),
-                range: None,
-            },
-            cancel,
-        )
-        .await
-    }
-
-    async fn download_byte_range(
-        &self,
-        from: &RemotePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-        cancel: &CancellationToken,
-    ) -> Result<Download, DownloadError> {
-        // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
-        // and needs both ends to be exclusive
-        let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
-        let range = Some(match end_inclusive {
-            Some(end_inclusive) => format!("bytes={start_inclusive}-{end_inclusive}"),
-            None => format!("bytes={start_inclusive}-"),
-        });
-
-        self.download_object(
-            GetObjectRequest {
-                bucket: self.bucket_name.clone(),
-                key: self.relative_path_to_s3_object(from),
-                etag: None,
-                range,
+                range: opts.byte_range_header(),
            },
            cancel,
        )
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -170,28 +170,13 @@ impl RemoteStorage for UnreliableWrapper {
        opts: &DownloadOpts,
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
+        // Note: We treat any byte range as an "attempt" of the same operation.
+        // We don't pay attention to the ranges. That's good enough for now.
        self.attempt(RemoteOp::Download(from.clone()))
            .map_err(DownloadError::Other)?;
        self.inner.download(from, opts, cancel).await
    }

-    async fn download_byte_range(
-        &self,
-        from: &RemotePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-        cancel: &CancellationToken,
-    ) -> Result<Download, DownloadError> {
-        // Note: We treat any download_byte_range as an "attempt" of the same
-        // operation. We don't pay attention to the ranges. That's good enough
-        // for now.
-        self.attempt(RemoteOp::Download(from.clone()))
-            .map_err(DownloadError::Other)?;
-        self.inner
-            .download_byte_range(from, start_inclusive, end_exclusive, cancel)
-            .await
-    }
-
    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
        self.delete_inner(path, true, cancel).await
    }
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -2,6 +2,7 @@ use anyhow::Context;
 use camino::Utf8Path;
 use futures::StreamExt;
 use remote_storage::{DownloadError, DownloadOpts, ListingMode, ListingObject, RemotePath};
+use std::ops::Bound;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
 use test_context::test_context;
@@ -293,7 +294,15 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
    // Full range (end specified)
    let dl = ctx
        .client
-        .download_byte_range(&path, 0, Some(len as u64), &cancel)
+        .download(
+            &path,
+            &DownloadOpts {
+                byte_start: Bound::Included(0),
+                byte_end: Bound::Excluded(len as u64),
+                ..Default::default()
+            },
+            &cancel,
+        )
        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig);
@@ -301,7 +310,15 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
    // partial range (end specified)
    let dl = ctx
        .client
-        .download_byte_range(&path, 4, Some(10), &cancel)
+        .download(
+            &path,
+            &DownloadOpts {
+                byte_start: Bound::Included(4),
+                byte_end: Bound::Excluded(10),
+                ..Default::default()
+            },
+            &cancel,
+        )
        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig[4..10]);
@@ -309,7 +326,15 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
    // partial range (end beyond real end)
    let dl = ctx
        .client
-        .download_byte_range(&path, 8, Some(len as u64 * 100), &cancel)
+        .download(
+            &path,
+            &DownloadOpts {
+                byte_start: Bound::Included(8),
+                byte_end: Bound::Excluded(len as u64 * 100),
+                ..Default::default()
+            },
+            &cancel,
+        )
        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig[8..]);
@@ -317,7 +342,14 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
    // Partial range (end unspecified)
    let dl = ctx
        .client
-        .download_byte_range(&path, 4, None, &cancel)
+        .download(
+            &path,
+            &DownloadOpts {
+                byte_start: Bound::Included(4),
+                ..Default::default()
+            },
+            &cancel,
+        )
        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig[4..]);
@@ -325,7 +357,14 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
    // Full range (end unspecified)
    let dl = ctx
        .client
-        .download_byte_range(&path, 0, None, &cancel)
+        .download(
+            &path,
+            &DownloadOpts {
+                byte_start: Bound::Included(0),
+                ..Default::default()
+            },
+            &cancel,
+        )
        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig);
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -31,9 +31,12 @@ pub enum Scope {
    /// The scope used by pageservers in upcalls to storage controller and cloud control plane
    #[serde(rename = "generations_api")]
    GenerationsApi,
-    /// Allows access to control plane managment API and some storage controller endpoints.
+    /// Allows access to control plane management API and all storage controller endpoints.
    Admin,

+    /// Allows access to control plane & storage controller endpoints used in infrastructure automation (e.g. node registration)
+    Infra,
+
    /// Allows access to storage controller APIs used by the scrubber, to interrogate the state
    /// of a tenant & post scrub results.
    Scrubber,
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -28,6 +28,9 @@ pub enum ApiError {
    #[error("Resource temporarily unavailable: {0}")]
    ResourceUnavailable(Cow<'static, str>),

+    #[error("Too many requests: {0}")]
+    TooManyRequests(Cow<'static, str>),
+
    #[error("Shutting down")]
    ShuttingDown,

@@ -73,6 +76,10 @@ impl ApiError {
                err.to_string(),
                StatusCode::SERVICE_UNAVAILABLE,
            ),
+            ApiError::TooManyRequests(err) => HttpErrorBody::response_from_msg_and_status(
+                err.to_string(),
+                StatusCode::TOO_MANY_REQUESTS,
+            ),
            ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::REQUEST_TIMEOUT,
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -164,11 +164,7 @@ fn criterion_benchmark(c: &mut Criterion) {
    let conf: &'static PageServerConf = Box::leak(Box::new(
        pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
    ));
-    virtual_file::init(
-        16384,
-        virtual_file::io_engine_for_bench(),
-        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
+    virtual_file::init(16384, virtual_file::io_engine_for_bench());
    page_cache::init(conf.page_cache_size);

    {
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -540,10 +540,13 @@ impl Client {
            .map_err(Error::ReceiveBody)
    }

-    /// Configs io buffer alignment at runtime.
-    pub async fn put_io_alignment(&self, align: usize) -> Result<()> {
-        let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint);
-        self.request(Method::PUT, uri, align)
+    /// Configures the IO mode at runtime.
+    pub async fn put_io_mode(
+        &self,
+        mode: &pageserver_api::models::virtual_file::IoMode,
+    ) -> Result<()> {
+        let uri = format!("{}/v1/io_mode", self.mgmt_api_endpoint);
+        self.request(Method::PUT, uri, mode)
            .await?
            .json()
            .await
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -152,11 +152,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
-    pageserver::virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
+    pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    pageserver::page_cache::init(100);

    let mut total_delta_layers = 0usize;
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,7 +59,7 @@ pub(crate) enum LayerCmd {

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
-    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
    let file = VirtualFile::open(path, ctx).await?;
    let file_id = page_cache::next_file_id();
@@ -190,11 +190,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            new_tenant_id,
            new_timeline_id,
        } => {
-            pageserver::virtual_file::init(
-                10,
-                virtual_file::api::IoEngineKind::StdFs,
-                pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
-            );
+            pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
            pageserver::page_cache::init(100);

            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -26,7 +26,7 @@ use pageserver::{
    tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
    virtual_file,
 };
-use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId};
+use pageserver_api::shard::TenantShardId;
 use postgres_ffi::ControlFileData;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use tokio_util::sync::CancellationToken;
@@ -205,11 +205,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {

 async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
-    virtual_file::init(
-        10,
-        virtual_file::api::IoEngineKind::StdFs,
-        DEFAULT_IO_BUFFER_ALIGNMENT,
-    );
+    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
    dump_layerfile_from_path(path, true, &ctx).await
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -59,9 +59,9 @@ pub(crate) struct Args {
    #[clap(long)]
    set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,

-    /// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers.
+    /// Before starting the benchmark, live-reconfigure the pageserver to use specified io mode (buffered vs. direct).
    #[clap(long)]
-    set_io_alignment: Option<usize>,
+    set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,

    targets: Option<Vec<TenantTimelineId>>,
 }
@@ -129,8 +129,8 @@ async fn main_impl(
        mgmt_api_client.put_io_engine(engine_str).await?;
    }

-    if let Some(align) = args.set_io_alignment {
-        mgmt_api_client.put_io_alignment(align).await?;
+    if let Some(mode) = &args.set_io_mode {
+        mgmt_api_client.put_io_mode(mode).await?;
    }

    // discover targets
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -14,14 +14,19 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => {
-            Err(AuthError(
-                format!(
-                    "JWT scope '{:?}' is ineligible for Pageserver auth",
-                    claims.scope
-                )
-                .into(),
-            ))
-        }
+        (
+            Scope::Admin
+            | Scope::SafekeeperData
+            | Scope::GenerationsApi
+            | Scope::Infra
+            | Scope::Scrubber,
+            _,
+        ) => Err(AuthError(
+            format!(
+                "JWT scope '{:?}' is ineligible for Pageserver auth",
+                claims.scope
+            )
+            .into(),
+        )),
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -125,8 +125,7 @@ fn main() -> anyhow::Result<()> {

    // after setting up logging, log the effective IO engine choice and read path implementations
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
-    info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
-    info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
+    info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");

    // The tenants directory contains all the pageserver local disk state.
    // Create if not exists and make sure all the contents are durable before proceeding.
@@ -168,11 +167,7 @@ fn main() -> anyhow::Result<()> {
    let scenario = failpoint_support::init();

    // Basic initialization of things that don't change after startup
-    virtual_file::init(
-        conf.max_file_descriptors,
-        conf.virtual_file_io_engine,
-        conf.io_buffer_alignment,
-    );
+    virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
    page_cache::init(conf.page_cache_size);

    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -174,9 +174,7 @@ pub struct PageServerConf {
    pub l0_flush: crate::l0_flush::L0FlushConfig,

    /// Direct IO settings
-    pub virtual_file_direct_io: virtual_file::DirectIoMode,
-
-    pub io_buffer_alignment: usize,
+    pub virtual_file_io_mode: virtual_file::IoMode,
 }

 /// Token for authentication to safekeepers
@@ -325,11 +323,10 @@ impl PageServerConf {
            image_compression,
            ephemeral_bytes_per_memory_kb,
            l0_flush,
-            virtual_file_direct_io,
+            virtual_file_io_mode,
            concurrent_tenant_warmup,
            concurrent_tenant_size_logical_size_queries,
            virtual_file_io_engine,
-            io_buffer_alignment,
            tenant_config,
        } = config_toml;

@@ -368,8 +365,6 @@ impl PageServerConf {
            max_vectored_read_bytes,
            image_compression,
            ephemeral_bytes_per_memory_kb,
-            virtual_file_direct_io,
-            io_buffer_alignment,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
@@ -408,6 +403,7 @@ impl PageServerConf {
            l0_flush: l0_flush
                .map(crate::l0_flush::L0FlushConfig::from)
                .unwrap_or_default(),
+            virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
        };

        // ------------------------------------------------------------
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -17,6 +17,7 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
+use pageserver_api::models::virtual_file::IoMode;
 use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use pageserver_api::models::IngestAuxFilesRequest;
@@ -714,6 +715,8 @@ async fn timeline_archival_config_handler(
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;

+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
        tenant
            .apply_timeline_archival_config(timeline_id, request_data.state, ctx)
            .await?;
@@ -2381,17 +2384,13 @@ async fn put_io_engine_handler(
    json_response(StatusCode::OK, ())
 }

-async fn put_io_alignment_handler(
+async fn put_io_mode_handler(
    mut r: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    check_permission(&r, None)?;
-    let align: usize = json_request(&mut r).await?;
-    crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| {
-        ApiError::PreconditionFailed(
-            format!("Requested io alignment ({align}) is not a power of two").into(),
-        )
-    })?;
+    let mode: IoMode = json_request(&mut r).await?;
+    crate::virtual_file::set_io_mode(mode);
    json_response(StatusCode::OK, ())
 }

@@ -3082,9 +3081,7 @@ pub fn make_router(
            |r| api_handler(r, timeline_collect_keyspace),
        )
        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
-        .put("/v1/io_alignment", |r| {
-            api_handler(r, put_io_alignment_handler)
-        })
+        .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
            |r| api_handler(r, force_aux_policy_switch_handler),
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -493,6 +493,8 @@ pub struct OffloadedTimeline {
    pub tenant_shard_id: TenantShardId,
    pub timeline_id: TimelineId,
    pub ancestor_timeline_id: Option<TimelineId>,
+    /// Branch-point LSN to retain on the ancestor for this timeline; `None` if it has no ancestor
+    pub ancestor_retain_lsn: Option<Lsn>,

    // TODO: once we persist offloaded state, make this lazily constructed
    pub remote_client: Arc<RemoteTimelineClient>,
@@ -504,10 +506,14 @@ pub struct OffloadedTimeline {

 impl OffloadedTimeline {
    fn from_timeline(timeline: &Timeline) -> Self {
+        let ancestor_retain_lsn = timeline
+            .get_ancestor_timeline_id()
+            .map(|_timeline_id| timeline.get_ancestor_lsn());
        Self {
            tenant_shard_id: timeline.tenant_shard_id,
            timeline_id: timeline.timeline_id,
            ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
+            ancestor_retain_lsn,

            remote_client: timeline.remote_client.clone(),
            delete_progress: timeline.delete_progress.clone(),
@@ -515,6 +521,12 @@ impl OffloadedTimeline {
    }
 }

+#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
+pub enum MaybeOffloaded {
+    Yes,
+    No,
+}
+
 #[derive(Clone)]
 pub enum TimelineOrOffloaded {
    Timeline(Arc<Timeline>),
@@ -2253,12 +2265,13 @@ impl Tenant {

        if activating {
            let timelines_accessor = self.timelines.lock().unwrap();
+            let timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap();
            let timelines_to_activate = timelines_accessor
                .values()
                .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));

            // Before activation, populate each Timeline's GcInfo with information about its children
-            self.initialize_gc_info(&timelines_accessor);
+            self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor);

            // Spawn gc and compaction loops. The loops will shut themselves
            // down when they notice that the tenant is inactive.
@@ -3298,6 +3311,7 @@ impl Tenant {
    fn initialize_gc_info(
        &self,
        timelines: &std::sync::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
+        timelines_offloaded: &std::sync::MutexGuard<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
    ) {
        // This function must be called before activation: after activation timeline create/delete operations
        // might happen, and this function is not safe to run concurrently with those.
@@ -3305,20 +3319,37 @@ impl Tenant {

        // Scan all timelines. For each timeline, remember the timeline ID and
        // the branch point where it was created.
-        let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId)>> = BTreeMap::new();
+        let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId, MaybeOffloaded)>> =
+            BTreeMap::new();
        timelines.iter().for_each(|(timeline_id, timeline_entry)| {
            if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
                let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default();
-                ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id));
+                ancestor_children.push((
+                    timeline_entry.get_ancestor_lsn(),
+                    *timeline_id,
+                    MaybeOffloaded::No,
+                ));
            }
        });
+        timelines_offloaded
+            .iter()
+            .for_each(|(timeline_id, timeline_entry)| {
+                let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id else {
+                    return;
+                };
+                let Some(retain_lsn) = timeline_entry.ancestor_retain_lsn else {
+                    return;
+                };
+                let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default();
+                ancestor_children.push((retain_lsn, *timeline_id, MaybeOffloaded::Yes));
+            });

        // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines
        let horizon = self.get_gc_horizon();

        // Populate each timeline's GcInfo with information about its child branches
        for timeline in timelines.values() {
-            let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints
+            let mut branchpoints: Vec<(Lsn, TimelineId, MaybeOffloaded)> = all_branchpoints
                .remove(&timeline.timeline_id)
                .unwrap_or_default();

@@ -4878,7 +4909,10 @@ mod tests {
        {
            let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
            assert_eq!(branchpoints.len(), 1);
-            assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID));
+            assert_eq!(
+                branchpoints[0],
+                (Lsn(0x40), NEW_TIMELINE_ID, MaybeOffloaded::No)
+            );
        }

        // You can read the key from the child branch even though the parent is
@@ -8261,8 +8295,8 @@ mod tests {
            let mut guard = tline.gc_info.write().unwrap();
            *guard = GcInfo {
                retain_lsns: vec![
-                    (Lsn(0x10), tline.timeline_id),
-                    (Lsn(0x20), tline.timeline_id),
+                    (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No),
+                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
                    time: Lsn(0x30),
@@ -8489,8 +8523,8 @@ mod tests {
            let mut guard = tline.gc_info.write().unwrap();
            *guard = GcInfo {
                retain_lsns: vec![
-                    (Lsn(0x10), tline.timeline_id),
-                    (Lsn(0x20), tline.timeline_id),
+                    (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No),
+                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
                    time: Lsn(0x30),
@@ -8723,7 +8757,7 @@ mod tests {
            // Update GC info
            let mut guard = parent_tline.gc_info.write().unwrap();
            *guard = GcInfo {
-                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
+                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)],
                cutoffs: GcCutoffs {
                    time: Lsn(0x10),
                    space: Lsn(0x10),
@@ -8737,7 +8771,7 @@ mod tests {
            // Update GC info
            let mut guard = branch_tline.gc_info.write().unwrap();
            *guard = GcInfo {
-                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
+                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)],
                cutoffs: GcCutoffs {
                    time: Lsn(0x50),
                    space: Lsn(0x50),
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -84,7 +84,7 @@ impl Drop for EphemeralFile {
    fn drop(&mut self) {
        // unlink the file
        // we are clear to do this, because we have entered a gate
-        let path = &self.buffered_writer.as_inner().as_inner().path;
+        let path = self.buffered_writer.as_inner().as_inner().path();
        let res = std::fs::remove_file(path);
        if let Err(e) = res {
            if e.kind() != std::io::ErrorKind::NotFound {
@@ -356,7 +356,7 @@ mod tests {
        }

        let file_contents =
-            std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
+            std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap();
        assert_eq!(file_contents, &content[0..cap]);

        let buffer_contents = file.buffered_writer.inspect_buffer();
@@ -392,7 +392,7 @@ mod tests {
            .buffered_writer
            .as_inner()
            .as_inner()
-            .path
+            .path()
            .metadata()
            .unwrap();
        assert_eq!(
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -950,6 +950,7 @@ impl<'a> TenantDownloader<'a> {
        let cancel = &self.secondary_state.cancel;
        let opts = DownloadOpts {
            etag: prev_etag.cloned(),
+            ..Default::default()
        };

        backoff::retry(
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -12,7 +12,7 @@ use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;

 use super::{GcError, LogicalSizeCalculationCause, Tenant};
-use crate::tenant::Timeline;
+use crate::tenant::{MaybeOffloaded, Timeline};
 use utils::id::TimelineId;
 use utils::lsn::Lsn;

@@ -264,10 +264,12 @@ pub(super) async fn gather_inputs(
        let mut lsns: Vec<(Lsn, LsnKind)> = gc_info
            .retain_lsns
            .iter()
-            .filter(|(lsn, _child_id)| lsn > &ancestor_lsn)
+            .filter(|(lsn, _child_id, is_offloaded)| {
+                lsn > &ancestor_lsn && *is_offloaded == MaybeOffloaded::No
+            })
            .copied()
            // this assumes there are no other retain_lsns than the branchpoints
-            .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint))
+            .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint))
            .collect::<Vec<_>>();

        lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -573,7 +573,7 @@ impl DeltaLayerWriterInner {
        ensure!(
            metadata.len() <= S3_UPLOAD_LIMIT,
            "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
-            file.path,
+            file.path(),
            metadata.len()
        );

@@ -791,7 +791,7 @@ impl DeltaLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
+        let file = VirtualFile::open_v2(path, ctx)
            .await
            .context("open layer file")?;

@@ -1022,7 +1022,7 @@ impl DeltaLayerInner {
                            blob_meta.key,
                            PageReconstructError::Other(anyhow!(
                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path,
+                                self.file.path(),
                                kind
                            )),
                        );
@@ -1048,7 +1048,7 @@ impl DeltaLayerInner {
                            meta.meta.key,
                            PageReconstructError::Other(anyhow!(e).context(format!(
                                "Failed to decompress blob from virtual file {}",
-                                self.file.path,
+                                self.file.path(),
                            ))),
                        );

@@ -1066,7 +1066,7 @@ impl DeltaLayerInner {
                            meta.meta.key,
                            PageReconstructError::Other(anyhow!(e).context(format!(
                                "Failed to deserialize blob from virtual file {}",
-                                self.file.path,
+                                self.file.path(),
                            ))),
                        );

@@ -1198,7 +1198,6 @@ impl DeltaLayerInner {
        let mut prev: Option<(Key, Lsn, BlobRef)> = None;

        let mut read_builder: Option<ChunkedVectoredReadBuilder> = None;
-        let align = virtual_file::get_io_buffer_alignment();

        let max_read_size = self
            .max_vectored_read_bytes
@@ -1247,7 +1246,6 @@ impl DeltaLayerInner {
                        offsets.end.pos(),
                        meta,
                        max_read_size,
-                        align,
                    ))
                }
            } else {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -389,7 +389,7 @@ impl ImageLayerInner {
        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path, ctx)
+        let file = VirtualFile::open_v2(path, ctx)
            .await
            .context("open layer file")?;
        let file_id = page_cache::next_file_id();
@@ -626,7 +626,7 @@ impl ImageLayerInner {
                                    meta.meta.key,
                                    PageReconstructError::Other(anyhow!(e).context(format!(
                                        "Failed to decompress blob from virtual file {}",
-                                        self.file.path,
+                                        self.file.path(),
                                    ))),
                                );

@@ -647,7 +647,7 @@ impl ImageLayerInner {
                            blob_meta.key,
                            PageReconstructError::from(anyhow!(
                                "Failed to read blobs from virtual file {}: {}",
-                                self.file.path,
+                                self.file.path(),
                                kind
                            )),
                        );
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -392,6 +392,10 @@ impl InMemoryLayer {
        self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
    }

+    pub(crate) fn start_lsn(&self) -> Lsn {
+        self.start_lsn
+    }
+
    pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
        self.start_lsn..self.end_lsn_or_max()
    }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -139,8 +139,10 @@ use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

 use super::{
-    config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint,
+    config::TenantConf,
+    storage_layer::{inmemory_layer, LayerVisibilityHint},
    upload_queue::NotInitialized,
+    MaybeOffloaded,
 };
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
@@ -450,7 +452,7 @@ pub(crate) struct GcInfo {
    /// Currently, this includes all points where child branches have
    /// been forked off from. In the future, could also include
    /// explicit user-defined snapshot points.
-    pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>,
+    pub(crate) retain_lsns: Vec<(Lsn, TimelineId, MaybeOffloaded)>,

    /// The cutoff coordinates, which are combined by selecting the minimum.
    pub(crate) cutoffs: GcCutoffs,
@@ -467,8 +469,13 @@ impl GcInfo {
        self.cutoffs.select_min()
    }

-    pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) {
-        self.retain_lsns.push((child_lsn, child_id));
+    pub(super) fn insert_child(
+        &mut self,
+        child_id: TimelineId,
+        child_lsn: Lsn,
+        is_offloaded: MaybeOffloaded,
+    ) {
+        self.retain_lsns.push((child_lsn, child_id, is_offloaded));
        self.retain_lsns.sort_by_key(|i| i.0);
    }

@@ -2164,7 +2171,9 @@ impl Timeline {

        if let Some(ancestor) = &ancestor {
            let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
-            ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn());
+            // If we construct an explicit timeline object, it's obviously not offloaded
+            let is_offloaded = MaybeOffloaded::No;
+            ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded);
        }

        Arc::new_cyclic(|myself| {
@@ -4875,7 +4884,7 @@ impl Timeline {
            let retain_lsns = gc_info
                .retain_lsns
                .iter()
-                .map(|(lsn, _child_id)| *lsn)
+                .map(|(lsn, _child_id, _is_offloaded)| *lsn)
                .collect();

            // Gets the maximum LSN that holds the valid lease.
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -42,7 +42,7 @@ use crate::tenant::storage_layer::{
 use crate::tenant::timeline::ImageLayerCreationOutcome;
 use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
-use crate::tenant::DeltaLayer;
+use crate::tenant::{DeltaLayer, MaybeOffloaded};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 use pageserver_api::config::tenant_conf_defaults::{
    DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
@@ -639,10 +639,28 @@ impl Timeline {
            let children = self.gc_info.read().unwrap().retain_lsns.clone();

            let mut readable_points = Vec::with_capacity(children.len() + 1);
-            for (child_lsn, _child_timeline_id) in &children {
+            for (child_lsn, _child_timeline_id, is_offloaded) in &children {
+                if *is_offloaded == MaybeOffloaded::Yes {
+                    continue;
+                }
                readable_points.push(*child_lsn);
            }
            readable_points.push(head_lsn);
+
+            // The Timeline get page process will walk all InMemoryLayers before it starts walking historic
+            // layers.  That means it might fail to see image layers that overlap with the LSN range of
+            // InMemoryLayers, so there is a de-facto read point at the start_lsn of the oldest InMemoryLayer.
+            //
+            // This behavior in the getpage path is considered a bug, and including InMemoryLayer's start_lsn here
+            // is a workaround.  See https://github.com/neondatabase/neon/issues/9185
+            if let Some(oldest_inmemory_layer) = layer_map.frozen_layers.front() {
+                readable_points.push(oldest_inmemory_layer.start_lsn())
+            } else if let Some(open_layer) = layer_map.open_layer.as_ref() {
+                readable_points.push(open_layer.start_lsn());
+            }
+
+            readable_points.sort();
+
            readable_points
        };

@@ -1741,7 +1759,7 @@ impl Timeline {
            let gc_info = self.gc_info.read().unwrap();
            let mut retain_lsns_below_horizon = Vec::new();
            let gc_cutoff = gc_info.cutoffs.select_min();
-            for (lsn, _timeline_id) in &gc_info.retain_lsns {
+            for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns {
                if lsn < &gc_cutoff {
                    retain_lsns_below_horizon.push(*lsn);
                }
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -194,8 +194,6 @@ pub(crate) struct ChunkedVectoredReadBuilder {
    /// Start offset and metadata for each blob in this read
    blobs_at: VecMap<u64, BlobMeta>,
    max_read_size: Option<usize>,
-    /// Chunk size reads are coalesced into.
-    chunk_size: usize,
 }

 /// Computes x / d rounded up.
@@ -204,6 +202,7 @@ fn div_round_up(x: usize, d: usize) -> usize {
 }

 impl ChunkedVectoredReadBuilder {
+    const CHUNK_SIZE: usize = virtual_file::get_io_buffer_alignment();
    /// Start building a new vectored read.
    ///
    /// Note that by design, this does not check against reading more than `max_read_size` to
@@ -214,21 +213,19 @@ impl ChunkedVectoredReadBuilder {
        end_offset: u64,
        meta: BlobMeta,
        max_read_size: Option<usize>,
-        chunk_size: usize,
    ) -> Self {
        let mut blobs_at = VecMap::default();
        blobs_at
            .append(start_offset, meta)
            .expect("First insertion always succeeds");

-        let start_blk_no = start_offset as usize / chunk_size;
-        let end_blk_no = div_round_up(end_offset as usize, chunk_size);
+        let start_blk_no = start_offset as usize / Self::CHUNK_SIZE;
+        let end_blk_no = div_round_up(end_offset as usize, Self::CHUNK_SIZE);
        Self {
            start_blk_no,
            end_blk_no,
            blobs_at,
            max_read_size,
-            chunk_size,
        }
    }

@@ -237,18 +234,12 @@ impl ChunkedVectoredReadBuilder {
        end_offset: u64,
        meta: BlobMeta,
        max_read_size: usize,
-        align: usize,
    ) -> Self {
-        Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), align)
+        Self::new_impl(start_offset, end_offset, meta, Some(max_read_size))
    }

-    pub(crate) fn new_streaming(
-        start_offset: u64,
-        end_offset: u64,
-        meta: BlobMeta,
-        align: usize,
-    ) -> Self {
-        Self::new_impl(start_offset, end_offset, meta, None, align)
+    pub(crate) fn new_streaming(start_offset: u64, end_offset: u64, meta: BlobMeta) -> Self {
+        Self::new_impl(start_offset, end_offset, meta, None)
    }

    /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk.
@@ -256,12 +247,12 @@ impl ChunkedVectoredReadBuilder {
    /// The resulting size also must be below the max read size.
    pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
        tracing::trace!(start, end, "trying to extend");
-        let start_blk_no = start as usize / self.chunk_size;
-        let end_blk_no = div_round_up(end as usize, self.chunk_size);
+        let start_blk_no = start as usize / Self::CHUNK_SIZE;
+        let end_blk_no = div_round_up(end as usize, Self::CHUNK_SIZE);

        let not_limited_by_max_read_size = {
            if let Some(max_read_size) = self.max_read_size {
-                let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size;
+                let coalesced_size = (end_blk_no - self.start_blk_no) * Self::CHUNK_SIZE;
                coalesced_size <= max_read_size
            } else {
                true
@@ -292,12 +283,12 @@ impl ChunkedVectoredReadBuilder {
    }

    pub(crate) fn size(&self) -> usize {
-        (self.end_blk_no - self.start_blk_no) * self.chunk_size
+        (self.end_blk_no - self.start_blk_no) * Self::CHUNK_SIZE
    }

    pub(crate) fn build(self) -> VectoredRead {
-        let start = (self.start_blk_no * self.chunk_size) as u64;
-        let end = (self.end_blk_no * self.chunk_size) as u64;
+        let start = (self.start_blk_no * Self::CHUNK_SIZE) as u64;
+        let end = (self.end_blk_no * Self::CHUNK_SIZE) as u64;
        VectoredRead {
            start,
            end,
@@ -328,18 +319,14 @@ pub struct VectoredReadPlanner {
    prev: Option<(Key, Lsn, u64, BlobFlag)>,

    max_read_size: usize,
-
-    align: usize,
 }

 impl VectoredReadPlanner {
    pub fn new(max_read_size: usize) -> Self {
-        let align = virtual_file::get_io_buffer_alignment();
        Self {
            blobs: BTreeMap::new(),
            prev: None,
            max_read_size,
-            align,
        }
    }

@@ -418,7 +405,6 @@ impl VectoredReadPlanner {
                        end_offset,
                        BlobMeta { key, lsn },
                        self.max_read_size,
-                        self.align,
                    );

                    let prev_read_builder = current_read_builder.replace(next_read_builder);
@@ -472,13 +458,13 @@ impl<'a> VectoredBlobReader<'a> {
        );

        if cfg!(debug_assertions) {
-            let align = virtual_file::get_io_buffer_alignment() as u64;
+            const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64;
            debug_assert_eq!(
-                read.start % align,
+                read.start % ALIGN,
                0,
                "Read start at {} does not satisfy the required io buffer alignment ({} bytes)",
                read.start,
-                align
+                ALIGN
            );
        }

@@ -553,22 +539,18 @@ pub struct StreamingVectoredReadPlanner {
    max_cnt: usize,
    /// Size of the current batch
    cnt: usize,
-
-    align: usize,
 }

 impl StreamingVectoredReadPlanner {
    pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
        assert!(max_cnt > 0);
        assert!(max_read_size > 0);
-        let align = virtual_file::get_io_buffer_alignment();
        Self {
            read_builder: None,
            prev: None,
            max_cnt,
            max_read_size,
            cnt: 0,
-            align,
        }
    }

@@ -621,7 +603,6 @@ impl StreamingVectoredReadPlanner {
                        start_offset,
                        end_offset,
                        BlobMeta { key, lsn },
-                        self.align,
                    ))
                };
            }
@@ -656,9 +637,9 @@ mod tests {
    use super::*;

    fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
-        let align = virtual_file::get_io_buffer_alignment() as u64;
-        assert_eq!(read.start % align, 0);
-        assert_eq!(read.start / align, offset_range.first().unwrap().2 / align);
+        const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64;
+        assert_eq!(read.start % ALIGN, 0);
+        assert_eq!(read.start / ALIGN, offset_range.first().unwrap().2 / ALIGN);

        let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();

@@ -676,32 +657,27 @@ mod tests {
    fn planner_chunked_coalesce_all_test() {
        use crate::virtual_file;

-        let chunk_size = virtual_file::get_io_buffer_alignment() as u64;
+        const CHUNK_SIZE: u64 = virtual_file::get_io_buffer_alignment() as u64;

-        // The test explicitly does not check chunk size < 512
-        if chunk_size < 512 {
-            return;
-        }
-
-        let max_read_size = chunk_size as usize * 8;
+        let max_read_size = CHUNK_SIZE as usize * 8;
        let key = Key::MIN;
        let lsn = Lsn(0);

        let blob_descriptions = [
-            (key, lsn, chunk_size / 8, BlobFlag::None), // Read 1 BEGIN
-            (key, lsn, chunk_size / 4, BlobFlag::Ignore), // Gap
-            (key, lsn, chunk_size / 2, BlobFlag::None),
-            (key, lsn, chunk_size - 2, BlobFlag::Ignore), // Gap
-            (key, lsn, chunk_size, BlobFlag::None),
-            (key, lsn, chunk_size * 2 - 1, BlobFlag::None),
-            (key, lsn, chunk_size * 2 + 1, BlobFlag::Ignore), // Gap
-            (key, lsn, chunk_size * 3 + 1, BlobFlag::None),
-            (key, lsn, chunk_size * 5 + 1, BlobFlag::None),
-            (key, lsn, chunk_size * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce.
-            (key, lsn, chunk_size * 7 + 1, BlobFlag::None),
-            (key, lsn, chunk_size * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size)
-            (key, lsn, chunk_size * 9, BlobFlag::Ignore), // ==== skipped a chunk
-            (key, lsn, chunk_size * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce)
+            (key, lsn, CHUNK_SIZE / 8, BlobFlag::None), // Read 1 BEGIN
+            (key, lsn, CHUNK_SIZE / 4, BlobFlag::Ignore), // Gap
+            (key, lsn, CHUNK_SIZE / 2, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE - 2, BlobFlag::Ignore), // Gap
+            (key, lsn, CHUNK_SIZE, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE * 2 - 1, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE * 2 + 1, BlobFlag::Ignore), // Gap
+            (key, lsn, CHUNK_SIZE * 3 + 1, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE * 5 + 1, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce.
+            (key, lsn, CHUNK_SIZE * 7 + 1, BlobFlag::None),
+            (key, lsn, CHUNK_SIZE * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size)
+            (key, lsn, CHUNK_SIZE * 9, BlobFlag::Ignore), // ==== skipped a chunk
+            (key, lsn, CHUNK_SIZE * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce)
        ];

        let ranges = [
@@ -780,19 +756,19 @@ mod tests {

    #[test]
    fn planner_replacement_test() {
-        let chunk_size = virtual_file::get_io_buffer_alignment() as u64;
-        let max_read_size = 128 * chunk_size as usize;
+        const CHUNK_SIZE: u64 = virtual_file::get_io_buffer_alignment() as u64;
+        let max_read_size = 128 * CHUNK_SIZE as usize;
        let first_key = Key::MIN;
        let second_key = first_key.next();
        let lsn = Lsn(0);

        let blob_descriptions = vec![
            (first_key, lsn, 0, BlobFlag::None),          // First in read 1
-            (first_key, lsn, chunk_size, BlobFlag::None), // Last in read 1
-            (second_key, lsn, 2 * chunk_size, BlobFlag::ReplaceAll),
-            (second_key, lsn, 3 * chunk_size, BlobFlag::None),
-            (second_key, lsn, 4 * chunk_size, BlobFlag::ReplaceAll), // First in read 2
-            (second_key, lsn, 5 * chunk_size, BlobFlag::None),       // Last in read 2
+            (first_key, lsn, CHUNK_SIZE, BlobFlag::None), // Last in read 1
+            (second_key, lsn, 2 * CHUNK_SIZE, BlobFlag::ReplaceAll),
+            (second_key, lsn, 3 * CHUNK_SIZE, BlobFlag::None),
+            (second_key, lsn, 4 * CHUNK_SIZE, BlobFlag::ReplaceAll), // First in read 2
+            (second_key, lsn, 5 * CHUNK_SIZE, BlobFlag::None),       // Last in read 2
        ];

        let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];
@@ -802,7 +778,7 @@ mod tests {
            planner.handle(key, lsn, offset, flag);
        }

-        planner.handle_range_end(6 * chunk_size);
+        planner.handle_range_end(6 * CHUNK_SIZE);

        let reads = planner.finish();
        assert_eq!(reads.len(), 2);
@@ -947,7 +923,6 @@ mod tests {
        let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
        let mut buf = BytesMut::with_capacity(reserved_bytes);

-        let align = virtual_file::get_io_buffer_alignment();
        let vectored_blob_reader = VectoredBlobReader::new(&file);
        let meta = BlobMeta {
            key: Key::MIN,
@@ -959,8 +934,7 @@ mod tests {
            if idx + 1 == offsets.len() {
                continue;
            }
-            let read_builder =
-                ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, align);
+            let read_builder = ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096);
            let read = read_builder.build();
            let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
            assert_eq!(result.blobs.len(), 1);
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -23,10 +23,12 @@ use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver_api::shard::TenantShardId;
 use std::fs::File;
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
+#[cfg(target_os = "linux")]
+use std::os::unix::fs::OpenOptionsExt;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};

 use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
-use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering};
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;

@@ -38,7 +40,7 @@ pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
 mod metadata;
 mod open_options;
 use self::owned_buffers_io::write::OwnedAsyncWriter;
-pub(crate) use api::DirectIoMode;
+pub(crate) use api::IoMode;
 pub(crate) use io_engine::IoEngineKind;
 pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;
@@ -61,6 +63,171 @@ pub(crate) mod owned_buffers_io {
    }
 }

+#[derive(Debug)]
+pub struct VirtualFile {
+    inner: VirtualFileInner,
+    _mode: IoMode,
+}
+
+impl VirtualFile {
+    /// Open a file in read-only mode. Like File::open.
+    pub async fn open<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        let inner = VirtualFileInner::open(path, ctx).await?;
+        Ok(VirtualFile {
+            inner,
+            _mode: IoMode::Buffered,
+        })
+    }
+
+    /// Open a file in read-only mode. Like File::open.
+    ///
+    /// `O_DIRECT` will be enabled based on `virtual_file_io_mode`.
+    pub async fn open_v2<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await
+    }
+
+    pub async fn create<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        let inner = VirtualFileInner::create(path, ctx).await?;
+        Ok(VirtualFile {
+            inner,
+            _mode: IoMode::Buffered,
+        })
+    }
+
+    pub async fn create_v2<P: AsRef<Utf8Path>>(
+        path: P,
+        ctx: &RequestContext,
+    ) -> Result<Self, std::io::Error> {
+        VirtualFile::open_with_options_v2(
+            path.as_ref(),
+            OpenOptions::new().write(true).create(true).truncate(true),
+            ctx,
+        )
+        .await
+    }
+
+    pub async fn open_with_options<P: AsRef<Utf8Path>>(
+        path: P,
+        open_options: &OpenOptions,
+        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+    ) -> Result<Self, std::io::Error> {
+        let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
+        Ok(VirtualFile {
+            inner,
+            _mode: IoMode::Buffered,
+        })
+    }
+
+    pub async fn open_with_options_v2<P: AsRef<Utf8Path>>(
+        path: P,
+        open_options: &OpenOptions,
+        ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
+    ) -> Result<Self, std::io::Error> {
+        let file = match get_io_mode() {
+            IoMode::Buffered => {
+                let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?;
+                VirtualFile {
+                    inner,
+                    _mode: IoMode::Buffered,
+                }
+            }
+            #[cfg(target_os = "linux")]
+            IoMode::Direct => {
+                let inner = VirtualFileInner::open_with_options(
+                    path,
+                    open_options.clone().custom_flags(nix::libc::O_DIRECT),
+                    ctx,
+                )
+                .await?;
+                VirtualFile {
+                    inner,
+                    _mode: IoMode::Direct,
+                }
+            }
+        };
+        Ok(file)
+    }
+
+    pub fn path(&self) -> &Utf8Path {
+        self.inner.path.as_path()
+    }
+
+    pub async fn crashsafe_overwrite<B: BoundedBuf<Buf = Buf> + Send, Buf: IoBuf + Send>(
+        final_path: Utf8PathBuf,
+        tmp_path: Utf8PathBuf,
+        content: B,
+    ) -> std::io::Result<()> {
+        VirtualFileInner::crashsafe_overwrite(final_path, tmp_path, content).await
+    }
+
+    pub async fn sync_all(&self) -> Result<(), Error> {
+        self.inner.sync_all().await
+    }
+
+    pub async fn sync_data(&self) -> Result<(), Error> {
+        self.inner.sync_data().await
+    }
+
+    pub async fn metadata(&self) -> Result<Metadata, Error> {
+        self.inner.metadata().await
+    }
+
+    pub fn remove(self) {
+        self.inner.remove();
+    }
+
+    pub async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
+        self.inner.seek(pos).await
+    }
+
+    pub async fn read_exact_at<Buf>(
+        &self,
+        slice: Slice<Buf>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> Result<Slice<Buf>, Error>
+    where
+        Buf: IoBufMut + Send,
+    {
+        self.inner.read_exact_at(slice, offset, ctx).await
+    }
+
+    pub async fn read_exact_at_page(
+        &self,
+        page: PageWriteGuard<'static>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> Result<PageWriteGuard<'static>, Error> {
+        self.inner.read_exact_at_page(page, offset, ctx).await
+    }
+
+    pub async fn write_all_at<Buf: IoBuf + Send>(
+        &self,
+        buf: FullSlice<Buf>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> (FullSlice<Buf>, Result<(), Error>) {
+        self.inner.write_all_at(buf, offset, ctx).await
+    }
+
+    pub async fn write_all<Buf: IoBuf + Send>(
+        &mut self,
+        buf: FullSlice<Buf>,
+        ctx: &RequestContext,
+    ) -> (FullSlice<Buf>, Result<usize, Error>) {
+        self.inner.write_all(buf, ctx).await
+    }
+}
+
 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
 /// the underlying file is closed if the system is low on file descriptors,
@@ -77,7 +244,7 @@ pub(crate) mod owned_buffers_io {
 /// 'tag' field is used to detect whether the handle still is valid or not.
 ///
 #[derive(Debug)]
-pub struct VirtualFile {
+pub struct VirtualFileInner {
    /// Lazy handle to the global file descriptor cache. The slot that this points to
    /// might contain our File, or it may be empty, or it may contain a File that
    /// belongs to a different VirtualFile.
@@ -350,12 +517,12 @@ macro_rules! with_file {
    }};
 }

-impl VirtualFile {
+impl VirtualFileInner {
    /// Open a file in read-only mode. Like File::open.
    pub async fn open<P: AsRef<Utf8Path>>(
        path: P,
        ctx: &RequestContext,
-    ) -> Result<VirtualFile, std::io::Error> {
+    ) -> Result<VirtualFileInner, std::io::Error> {
        Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await
    }

@@ -364,7 +531,7 @@ impl VirtualFile {
    pub async fn create<P: AsRef<Utf8Path>>(
        path: P,
        ctx: &RequestContext,
-    ) -> Result<VirtualFile, std::io::Error> {
+    ) -> Result<VirtualFileInner, std::io::Error> {
        Self::open_with_options(
            path.as_ref(),
            OpenOptions::new().write(true).create(true).truncate(true),
@@ -382,7 +549,7 @@ impl VirtualFile {
        path: P,
        open_options: &OpenOptions,
        _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */
-    ) -> Result<VirtualFile, std::io::Error> {
+    ) -> Result<VirtualFileInner, std::io::Error> {
        let path_ref = path.as_ref();
        let path_str = path_ref.to_string();
        let parts = path_str.split('/').collect::<Vec<&str>>();
@@ -423,7 +590,7 @@ impl VirtualFile {
        reopen_options.create_new(false);
        reopen_options.truncate(false);

-        let vfile = VirtualFile {
+        let vfile = VirtualFileInner {
            handle: RwLock::new(handle),
            pos: 0,
            path: path_ref.to_path_buf(),
@@ -1034,6 +1201,21 @@ impl tokio_epoll_uring::IoFd for FileGuard {

 #[cfg(test)]
 impl VirtualFile {
+    pub(crate) async fn read_blk(
+        &self,
+        blknum: u32,
+        ctx: &RequestContext,
+    ) -> Result<crate::tenant::block_io::BlockLease<'_>, std::io::Error> {
+        self.inner.read_blk(blknum, ctx).await
+    }
+
+    async fn read_to_end(&mut self, buf: &mut Vec<u8>, ctx: &RequestContext) -> Result<(), Error> {
+        self.inner.read_to_end(buf, ctx).await
+    }
+}
+
+#[cfg(test)]
+impl VirtualFileInner {
    pub(crate) async fn read_blk(
        &self,
        blknum: u32,
@@ -1067,7 +1249,7 @@ impl VirtualFile {
    }
 }

-impl Drop for VirtualFile {
+impl Drop for VirtualFileInner {
    /// If a VirtualFile is dropped, close the underlying file if it was open.
    fn drop(&mut self) {
        let handle = self.handle.get_mut();
@@ -1143,15 +1325,10 @@ impl OpenFiles {
 /// server startup.
 ///
 #[cfg(not(test))]
-pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) {
+pub fn init(num_slots: usize, engine: IoEngineKind) {
    if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
        panic!("virtual_file::init called twice");
    }
-    if set_io_buffer_alignment(io_buffer_alignment).is_err() {
-        panic!(
-            "IO buffer alignment needs to be a power of two and greater than 512, got {io_buffer_alignment}"
-        );
-    }
    io_engine::init(engine);
    crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
 }
@@ -1175,47 +1352,20 @@ fn get_open_files() -> &'static OpenFiles {
    }
 }

-static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT);
-
-/// Returns true if the alignment is a power of two and is greater or equal to 512.
-fn is_valid_io_buffer_alignment(align: usize) -> bool {
-    align.is_power_of_two() && align >= 512
-}
-
-/// Sets IO buffer alignment requirement. Returns error if the alignment requirement is
-/// not a power of two or less than 512 bytes.
-#[allow(unused)]
-pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
-    if is_valid_io_buffer_alignment(align) {
-        IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed);
-        Ok(())
-    } else {
-        Err(align)
-    }
-}
-
 /// Gets the io buffer alignment.
-///
-/// This function should be used for getting the actual alignment value to use.
-pub(crate) fn get_io_buffer_alignment() -> usize {
-    let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed);
-
-    if cfg!(test) {
-        let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
-        if let Some(test_align) = utils::env::var(env_var_name) {
-            if is_valid_io_buffer_alignment(test_align) {
-                test_align
-            } else {
-                panic!("IO buffer alignment needs to be a power of two and greater than 512, got {test_align}");
-            }
-        } else {
-            align
-        }
-    } else {
-        align
-    }
+pub(crate) const fn get_io_buffer_alignment() -> usize {
+    DEFAULT_IO_BUFFER_ALIGNMENT
 }

+static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8);
+
+pub(crate) fn set_io_mode(mode: IoMode) {
+    IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed);
+}
+
+pub(crate) fn get_io_mode() -> IoMode {
+    IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap()
+}
 #[cfg(test)]
 mod tests {
    use crate::context::DownloadBehavior;
@@ -1524,7 +1674,7 @@ mod tests {
        // Open the file many times.
        let mut files = Vec::new();
        for _ in 0..VIRTUAL_FILES {
-            let f = VirtualFile::open_with_options(
+            let f = VirtualFileInner::open_with_options(
                &test_file_path,
                OpenOptions::new().read(true),
                &ctx,
@@ -1576,7 +1726,7 @@ mod tests {
        let path = testdir.join("myfile");
        let tmp_path = testdir.join("myfile.tmp");

-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
+        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
@@ -1585,7 +1735,7 @@ mod tests {
        assert!(!tmp_path.exists());
        drop(file);

-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
+        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec())
            .await
            .unwrap();
        let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap());
@@ -1608,7 +1758,7 @@ mod tests {
        std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
        assert!(tmp_path.exists());

-        VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
+        VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec())
            .await
            .unwrap();

--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -43,6 +43,7 @@
 #include "hll.h"
 #include "bitmap.h"
 #include "neon.h"
+#include "neon_perf_counters.h"

 #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)

@@ -114,7 +115,9 @@ typedef struct FileCacheControl
 	uint32		limit;			/* shared copy of lfc_size_limit */
 	uint64		hits;
 	uint64		misses;
-	uint64		writes;
+	uint64		writes;			/* number of writes issued */
+	uint64		time_read;		/* time spent reading (us) */
+	uint64		time_write;		/* time spent writing (us) */
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
 	dlist_head  holes;          /* double linked list of punched holes */
@@ -270,6 +273,8 @@ lfc_shmem_startup(void)
 		lfc_ctl->hits = 0;
 		lfc_ctl->misses = 0;
 		lfc_ctl->writes = 0;
+		lfc_ctl->time_read = 0;
+		lfc_ctl->time_write = 0;
 		dlist_init(&lfc_ctl->lru);
 		dlist_init(&lfc_ctl->holes);

@@ -701,6 +706,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
 		int		iteration_hits = 0;
 		int		iteration_misses = 0;
+		uint64	io_time_us = 0;
 		Assert(blocks_in_chunk > 0);

 		for (int i = 0; i < blocks_in_chunk; i++)
@@ -795,6 +801,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			lfc_ctl->misses += iteration_misses;
 			pgBufferUsage.file_cache.hits += iteration_hits;
 			pgBufferUsage.file_cache.misses += iteration_misses;
+
+			if (iteration_hits)
+			{
+				lfc_ctl->time_read += io_time_us;
+				inc_page_cache_read_wait(io_time_us);
+			}
+
 			CriticalAssert(entry->access_count > 0);
 			if (--entry->access_count == 0)
 				dlist_push_tail(&lfc_ctl->lru, &entry->list_node);
@@ -859,6 +872,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		struct iovec iov[PG_IOV_MAX];
 		int		chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
 		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
+		instr_time io_start, io_end;
 		Assert(blocks_in_chunk > 0);

 		for (int i = 0; i < blocks_in_chunk; i++)
@@ -947,12 +961,13 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 		generation = lfc_ctl->generation;
 		entry_offset = entry->offset;
-		lfc_ctl->writes += blocks_in_chunk;
 		LWLockRelease(lfc_lock);

 		pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
+		INSTR_TIME_SET_CURRENT(io_start);
 		rc = pwritev(lfc_desc, iov, blocks_in_chunk,
 					 ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+		INSTR_TIME_SET_CURRENT(io_end);
 		pgstat_report_wait_end();

 		if (rc != BLCKSZ * blocks_in_chunk)
@@ -965,9 +980,17 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 			if (lfc_ctl->generation == generation)
 			{
+				uint64	time_spent_us;
 				CriticalAssert(LFC_ENABLED());
 				/* Place entry to the head of LRU list */
 				CriticalAssert(entry->access_count > 0);
+
+				lfc_ctl->writes += blocks_in_chunk;
+				INSTR_TIME_SUBTRACT(io_end, io_start);	/* io_end -= io_start: elapsed pwritev() time */
+				time_spent_us = INSTR_TIME_GET_MICROSEC(io_end);
+				lfc_ctl->time_write += time_spent_us;
+				inc_page_cache_write_wait(time_spent_us);
+
 				if (--entry->access_count == 0)
 					dlist_push_tail(&lfc_ctl->lru, &entry->list_node);

--- a/pgxn/neon/neon_perf_counters.c
+++ b/pgxn/neon/neon_perf_counters.c
@@ -50,28 +50,52 @@ NeonPerfCountersShmemInit(void)
 	}
 }

-/*
- * Count a GetPage wait operation.
- */
-void
-inc_getpage_wait(uint64 latency_us)
+static inline void
+inc_iohist(IOHistogram hist, uint64 latency_us)
 {
 	int			lo = 0;
-	int			hi = NUM_GETPAGE_WAIT_BUCKETS - 1;
+	int			hi = NUM_IO_WAIT_BUCKETS - 1;

 	/* Find the right bucket with binary search */
 	while (lo < hi)
 	{
 		int			mid = (lo + hi) / 2;

-		if (latency_us < getpage_wait_bucket_thresholds[mid])
+		if (latency_us < io_wait_bucket_thresholds[mid])
 			hi = mid;
 		else
 			lo = mid + 1;
 	}
-	MyNeonCounters->getpage_wait_us_bucket[lo]++;
-	MyNeonCounters->getpage_wait_us_sum += latency_us;
-	MyNeonCounters->getpage_wait_us_count++;
+	hist->wait_us_bucket[lo]++;
+	hist->wait_us_sum += latency_us;
+	hist->wait_us_count++;
+}
+
+/*
+ * Count a GetPage wait operation.
+ */
+void
+inc_getpage_wait(uint64 latency)
+{
+	inc_iohist(&MyNeonCounters->getpage_hist, latency);
+}
+
+/*
+ * Count an LFC read wait operation.
+ */
+void
+inc_page_cache_read_wait(uint64 latency)
+{
+	inc_iohist(&MyNeonCounters->file_cache_read_hist, latency);
+}
+
+/*
+ * Count an LFC write wait operation.
+ */
+void
+inc_page_cache_write_wait(uint64 latency)
+{
+	inc_iohist(&MyNeonCounters->file_cache_write_hist, latency);
 }

 /*
@@ -81,77 +105,91 @@ inc_getpage_wait(uint64 latency_us)

 typedef struct
 {
-	char	   *name;
+	const char *name;
 	bool		is_bucket;
 	double		bucket_le;
 	double		value;
 } metric_t;

-static metric_t *
-neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
+static int
+histogram_to_metrics(IOHistogram histogram,
+					 metric_t *metrics,
+					 const char *count,
+					 const char *sum,
+					 const char *bucket)
 {
-#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8)
-	metric_t   *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
-	uint64		bucket_accum;
-	int			i = 0;
+	int		i = 0;
+	uint64	bucket_accum = 0;

-	metrics[i].name = "getpage_wait_seconds_count";
+	metrics[i].name = count;
 	metrics[i].is_bucket = false;
-	metrics[i].value = (double) counters->getpage_wait_us_count;
+	metrics[i].value = (double) histogram->wait_us_count;
 	i++;
-	metrics[i].name = "getpage_wait_seconds_sum";
+	metrics[i].name = sum;
 	metrics[i].is_bucket = false;
-	metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0;
+	metrics[i].value = (double) histogram->wait_us_sum / 1000000.0;
 	i++;
-
-	bucket_accum = 0;
-	for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++)
+	for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++)
 	{
-		uint64		threshold = getpage_wait_bucket_thresholds[bucketno];
+		uint64		threshold = io_wait_bucket_thresholds[bucketno];

-		bucket_accum += counters->getpage_wait_us_bucket[bucketno];
+		bucket_accum += histogram->wait_us_bucket[bucketno];

-		metrics[i].name = "getpage_wait_seconds_bucket";
+		metrics[i].name = bucket;
 		metrics[i].is_bucket = true;
 		metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0;
 		metrics[i].value = (double) bucket_accum;
 		i++;
 	}
-	metrics[i].name = "getpage_prefetch_requests_total";
-	metrics[i].is_bucket = false;
-	metrics[i].value = (double) counters->getpage_prefetch_requests_total;
-	i++;
-	metrics[i].name = "getpage_sync_requests_total";
-	metrics[i].is_bucket = false;
-	metrics[i].value = (double) counters->getpage_sync_requests_total;
-	i++;
-	metrics[i].name = "getpage_prefetch_misses_total";
-	metrics[i].is_bucket = false;
-	metrics[i].value = (double) counters->getpage_prefetch_misses_total;
-	i++;
-	metrics[i].name = "getpage_prefetch_discards_total";
-	metrics[i].is_bucket = false;
-	metrics[i].value = (double) counters->getpage_prefetch_discards_total;
-	i++;
-	metrics[i].name = "pageserver_requests_sent_total";
-	metrics[i].is_bucket = false;
-	metrics[i].value = (double) counters->pageserver_requests_sent_total;
-	i++;
-	metrics[i].name = "pageserver_disconnects_total";
-	metrics[i].is_bucket = false;
-	metrics[i].value = (double) counters->pageserver_disconnects_total;
-	i++;
-	metrics[i].name = "pageserver_send_flushes_total";
-	metrics[i].is_bucket = false;
-	metrics[i].value = (double) counters->pageserver_send_flushes_total;
-	i++;
-	metrics[i].name = "file_cache_hits_total";
-	metrics[i].is_bucket = false;
-	metrics[i].value = (double) counters->file_cache_hits_total;
-	i++;
+
+	return i;
+}
+
+static metric_t *
+neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
+{
+#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10)
+	metric_t   *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
+	int			i = 0;
+
+#define APPEND_METRIC(_name) do { \
+		metrics[i].name = #_name; \
+		metrics[i].is_bucket = false; \
+		metrics[i].value = (double) counters->_name; \
+		i++; \
+	} while (false)
+
+	i += histogram_to_metrics(&counters->getpage_hist, &metrics[i],
+							  "getpage_wait_seconds_count",
+							  "getpage_wait_seconds_sum",
+							  "getpage_wait_seconds_bucket");
+
+	APPEND_METRIC(getpage_prefetch_requests_total);
+	APPEND_METRIC(getpage_sync_requests_total);
+	APPEND_METRIC(getpage_prefetch_misses_total);
+	APPEND_METRIC(getpage_prefetch_discards_total);
+	APPEND_METRIC(pageserver_requests_sent_total);
+	APPEND_METRIC(pageserver_disconnects_total);
+	APPEND_METRIC(pageserver_send_flushes_total);
+	APPEND_METRIC(pageserver_open_requests);
+	APPEND_METRIC(getpage_prefetches_buffered);
+
+	APPEND_METRIC(file_cache_hits_total);
+
+	i += histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i],
+							  "file_cache_read_wait_seconds_count",
+							  "file_cache_read_wait_seconds_sum",
+							  "file_cache_read_wait_seconds_bucket");
+	i += histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i],
+							  "file_cache_write_wait_seconds_count",
+							  "file_cache_write_wait_seconds_sum",
+							  "file_cache_write_wait_seconds_bucket");

 	Assert(i == NUM_METRICS);

+#undef APPEND_METRIC
+#undef NUM_METRICS
+
 	/* NULL entry marks end of array */
 	metrics[i].name = NULL;
 	metrics[i].value = 0;
@@ -216,6 +254,15 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS)
 	return (Datum) 0;
 }

+static inline void
+histogram_merge_into(IOHistogram into, IOHistogram from)
+{
+	into->wait_us_count += from->wait_us_count;
+	into->wait_us_sum += from->wait_us_sum;
+	for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++)
+		into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno];
+}
+
 PG_FUNCTION_INFO_V1(neon_get_perf_counters);
 Datum
 neon_get_perf_counters(PG_FUNCTION_ARGS)
@@ -234,10 +281,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
 	{
 		neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];

-		totals.getpage_wait_us_count += counters->getpage_wait_us_count;
-		totals.getpage_wait_us_sum += counters->getpage_wait_us_sum;
-		for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++)
-			totals.getpage_wait_us_bucket[bucketno] += counters->getpage_wait_us_bucket[bucketno];
+		histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist);
 		totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total;
 		totals.getpage_sync_requests_total += counters->getpage_sync_requests_total;
 		totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total;
@@ -245,7 +289,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
 		totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total;
 		totals.pageserver_disconnects_total += counters->pageserver_disconnects_total;
 		totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total;
+		totals.pageserver_open_requests += counters->pageserver_open_requests;
+		totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered;
 		totals.file_cache_hits_total += counters->file_cache_hits_total;
+		histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist);
+		histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist);
 	}

 	metrics = neon_perf_counters_to_metrics(&totals);
--- a/pgxn/neon/neon_perf_counters.h
+++ b/pgxn/neon/neon_perf_counters.h
@@ -15,17 +15,26 @@
 #include "storage/proc.h"
 #endif

-static const uint64 getpage_wait_bucket_thresholds[] = {
-	      20,       30,       60,       100,  /* 0      -  100 us */
+static const uint64 io_wait_bucket_thresholds[] = {
+	       2,        3,        6,        10,  /* 0 us   - 10 us */
+	      20,       30,       60,       100,  /* 10 us  - 100 us */
 	     200,      300,      600,	   1000,  /* 100 us - 1 ms */
 	    2000,     3000,     6000,     10000,  /* 1 ms   - 10 ms */
 	   20000,    30000,    60000,    100000,  /* 10 ms  - 100 ms */
 	  200000,   300000,   600000,   1000000,  /* 100 ms - 1 s */
 	 2000000,  3000000,  6000000,  10000000,  /* 1 s - 10 s */
-    20000000, 30000000, 60000000, 100000000,  /* 10 s - 100 s */
 	UINT64_MAX,
 };
-#define NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds))
+#define NUM_IO_WAIT_BUCKETS (lengthof(io_wait_bucket_thresholds))
+
+typedef struct IOHistogramData
+{
+	uint64		wait_us_count;	/* number of waits recorded */
+	uint64		wait_us_sum;	/* total of recorded latencies, in us */
+	uint64		wait_us_bucket[NUM_IO_WAIT_BUCKETS];	/* per-bucket (non-cumulative) counts, indexed like io_wait_bucket_thresholds */
+} IOHistogramData;
+
+typedef IOHistogramData *IOHistogram;

 typedef struct
 {
@@ -39,9 +48,7 @@ typedef struct
 	 * the backend, but the 'neon_backend_perf_counters' view will convert
 	 * them to seconds, to make them more idiomatic as prometheus metrics.
 	 */
-	uint64		getpage_wait_us_count;
-	uint64		getpage_wait_us_sum;
-	uint64		getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS];
+	IOHistogramData getpage_hist;

 	/*
 	 * Total number of speculative prefetch Getpage requests and synchronous
@@ -50,7 +57,11 @@ typedef struct
 	uint64		getpage_prefetch_requests_total;
 	uint64		getpage_sync_requests_total;

-	/* XXX: It's not clear to me when these misses happen. */
+	/*
+	 * Total number of readahead misses; consisting of either prefetches that
+	 * don't satisfy the LSN bounds, or cases where no readahead was issued
+	 * for the read.
+	 */
 	uint64		getpage_prefetch_misses_total;

 	/*
@@ -80,6 +91,16 @@ typedef struct
 	 * this can be smaller than pageserver_requests_sent_total.
 	 */
 	uint64		pageserver_send_flushes_total;
+	
+	/*
+	 * Number of open requests to PageServer.
+	 */
+	uint64		pageserver_open_requests;
+
+	/*
+	 * Number of unused prefetches currently cached in this backend.
+	 */
+	uint64		getpage_prefetches_buffered;

 	/*
 	 * Number of requests satisfied from the LFC.
@@ -91,6 +112,9 @@ typedef struct
 	 */
 	uint64		file_cache_hits_total;

+	/* LFC I/O time buckets */
+	IOHistogramData file_cache_read_hist;
+	IOHistogramData file_cache_write_hist;
 } neon_per_backend_counters;

 /* Pointer to the shared memory array of neon_per_backend_counters structs */
@@ -111,6 +135,8 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared;
 #endif

 extern void inc_getpage_wait(uint64 latency);
+extern void inc_page_cache_read_wait(uint64 latency);
+extern void inc_page_cache_write_wait(uint64 latency);

 extern Size NeonPerfCountersShmemSize(void);
 extern void NeonPerfCountersShmemInit(void);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -488,6 +488,11 @@ readahead_buffer_resize(int newsize, void *extra)
 		newPState->n_unused -= 1;
 	}

+	MyNeonCounters->getpage_prefetches_buffered =
+		MyPState->n_responses_buffered;
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->n_requests_inflight;
+
 	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
 	{
 		prefetch_set_unused(end);
@@ -621,6 +626,8 @@ prefetch_read(PrefetchRequest *slot)
 		MyPState->n_responses_buffered += 1;
 		MyPState->n_requests_inflight -= 1;
 		MyPState->ring_receive += 1;
+		MyNeonCounters->getpage_prefetches_buffered =
+			MyPState->n_responses_buffered;

 		/* update slot state */
 		slot->status = PRFS_RECEIVED;
@@ -674,6 +681,15 @@ prefetch_on_ps_disconnect(void)

 		prefetch_set_unused(ring_index);
 	}
+
+	/*
+	 * We may have gone into retry due to a network error, so update the stats
+	 * with the latest available values.
+	 */
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->n_requests_inflight;
+	MyNeonCounters->getpage_prefetches_buffered =
+		MyPState->n_responses_buffered;
 }

 /*
@@ -706,6 +722,9 @@ prefetch_set_unused(uint64 ring_index)

 		MyPState->n_responses_buffered -= 1;
 		MyPState->n_unused += 1;
+
+		MyNeonCounters->getpage_prefetches_buffered =
+			MyPState->n_responses_buffered;
 	}
 	else
 	{
@@ -820,6 +839,15 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
 	hashkey.buftag = tag;

 Retry:
+	/*
+	 * We may have gone into retry due to a network error, so update the stats
+	 * with the latest available values.
+	 */
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->ring_unused - MyPState->ring_receive;
+	MyNeonCounters->getpage_prefetches_buffered =
+		MyPState->n_responses_buffered;
+
 	min_ring_index = UINT64_MAX;
 	for (int i = 0; i < nblocks; i++)
 	{
@@ -1001,6 +1029,9 @@ Retry:
 		prefetch_do_request(slot, lsns);
 	}

+	MyNeonCounters->pageserver_open_requests =
+		MyPState->ring_unused - MyPState->ring_receive;
+
 	Assert(any_hits);

 	Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED ||
@@ -1076,8 +1107,10 @@ page_server_request(void const *req)
 			{
 				/* do nothing */
 			}
+			MyNeonCounters->pageserver_open_requests++;
 			consume_prefetch_responses();
 			resp = page_server->receive(shard_no);
+			MyNeonCounters->pageserver_open_requests--;
 		}
 		PG_CATCH();
 		{
@@ -1086,6 +1119,8 @@ page_server_request(void const *req)
 			 * point, but this currently seems fine for now.
 			 */
 			page_server->disconnect(shard_no);
+			MyNeonCounters->pageserver_open_requests = 0;
+
 			PG_RE_THROW();
 		}
 		PG_END_TRY();
--- a/poetry.lock
+++ b/poetry.lock
@@ -2095,6 +2095,7 @@ files = [
    {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"},
    {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"},
    {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"},
    {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"},
    {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"},
    {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"},
@@ -2103,6 +2104,8 @@ files = [
    {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"},
    {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"},
    {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"},
+    {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"},
    {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"},
    {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"},
    {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"},
@@ -2584,6 +2587,7 @@ files = [
    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2729,21 +2733,22 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]

 [[package]]
 name = "responses"
-version = "0.21.0"
+version = "0.25.3"
 description = "A utility library for mocking out the `requests` Python library."
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "responses-0.21.0-py3-none-any.whl", hash = "sha256:2dcc863ba63963c0c3d9ee3fa9507cbe36b7d7b0fccb4f0bdfd9e96c539b1487"},
-    {file = "responses-0.21.0.tar.gz", hash = "sha256:b82502eb5f09a0289d8e209e7bad71ef3978334f56d09b444253d5ad67bf5253"},
+    {file = "responses-0.25.3-py3-none-any.whl", hash = "sha256:521efcbc82081ab8daa588e08f7e8a64ce79b91c39f6e62199b19159bea7dbcb"},
+    {file = "responses-0.25.3.tar.gz", hash = "sha256:617b9247abd9ae28313d57a75880422d55ec63c29d33d629697590a034358dba"},
 ]

 [package.dependencies]
-requests = ">=2.0,<3.0"
-urllib3 = ">=1.25.10"
+pyyaml = "*"
+requests = ">=2.30.0,<3.0"
+urllib3 = ">=1.25.10,<3.0"

 [package.extras]
-tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"]
+tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli", "tomli-w", "types-PyYAML", "types-requests"]

 [[package]]
 name = "rfc3339-validator"
@@ -3137,6 +3142,16 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
+    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
+    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -39,7 +39,7 @@ http.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
 hyper0.workspace = true
-hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
+hyper = { workspace = true, features = ["server", "http1", "http2"] }
 hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
 http-body-util = { version = "0.1" }
 indexmap.workspace = true
@@ -77,7 +77,7 @@ subtle.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
-tokio-postgres.workspace = true
+tokio-postgres = { workspace = true, features = ["with-serde_json-1"] }
 tokio-postgres-rustls.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
@@ -101,7 +101,7 @@ jose-jwa = "0.1.2"
 jose-jwk = { version = "0.1.2", features = ["p256", "p384", "rsa"] }
 signature = "2"
 ecdsa = "0.16"
-p256 = "0.13"
+p256 = { version = "0.13", features = ["jwk"] }
 rsa = "0.9"

 workspace_hack.workspace = true
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -1,18 +1,24 @@
 use crate::{
-    auth, compute,
+    auth,
+    cache::Cached,
+    compute,
    config::AuthenticationConfig,
    context::RequestMonitoring,
-    control_plane::{self, provider::NodeInfo},
+    control_plane::{self, provider::NodeInfo, CachedNodeInfo},
    error::{ReportableError, UserFacingError},
+    proxy::connect_compute::ComputeConnectBackend,
    stream::PqStream,
    waiters,
 };
+use async_trait::async_trait;
 use pq_proto::BeMessage as Be;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_postgres::config::SslMode;
 use tracing::{info, info_span};

+use super::ComputeCredentialKeys;
+
 #[derive(Debug, Error)]
 pub(crate) enum WebAuthError {
    #[error(transparent)]
@@ -25,6 +31,11 @@ pub(crate) enum WebAuthError {
    Io(#[from] std::io::Error),
 }

+#[derive(Debug)]
+pub struct ConsoleRedirectBackend {
+    console_uri: reqwest::Url,
+}
+
 impl UserFacingError for WebAuthError {
    fn to_string_client(&self) -> String {
        "Internal error".to_string()
@@ -57,7 +68,40 @@ pub(crate) fn new_psql_session_id() -> String {
    hex::encode(rand::random::<[u8; 8]>())
 }

-pub(super) async fn authenticate(
+impl ConsoleRedirectBackend {
+    pub fn new(console_uri: reqwest::Url) -> Self {
+        Self { console_uri }
+    }
+
+    pub(crate) async fn authenticate(
+        &self,
+        ctx: &RequestMonitoring,
+        auth_config: &'static AuthenticationConfig,
+        client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    ) -> auth::Result<ConsoleRedirectNodeInfo> {
+        authenticate(ctx, auth_config, &self.console_uri, client)
+            .await
+            .map(ConsoleRedirectNodeInfo)
+    }
+}
+
+pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo);
+
+#[async_trait]
+impl ComputeConnectBackend for ConsoleRedirectNodeInfo {
+    async fn wake_compute(
+        &self,
+        _ctx: &RequestMonitoring,
+    ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
+        Ok(Cached::new_uncached(self.0.clone()))
+    }
+
+    fn get_keys(&self) -> &ComputeCredentialKeys {
+        &ComputeCredentialKeys::None
+    }
+}
+
+async fn authenticate(
    ctx: &RequestMonitoring,
    auth_config: &'static AuthenticationConfig,
    link_uri: &reqwest::Url,
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -17,6 +17,8 @@ use crate::{
    RoleName,
 };

+use super::ComputeCredentialKeys;
+
 // TODO(conrad): make these configurable.
 const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
 const MIN_RENEW: Duration = Duration::from_secs(30);
@@ -241,7 +243,7 @@ impl JwkCacheEntryLock {
        endpoint: EndpointId,
        role_name: &RoleName,
        fetch: &F,
-    ) -> Result<(), anyhow::Error> {
+    ) -> Result<ComputeCredentialKeys, anyhow::Error> {
        // JWT compact form is defined to be
        // <B64(Header)> || . || <B64(Payload)> || . || <B64(Signature)>
        // where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
@@ -300,9 +302,9 @@ impl JwkCacheEntryLock {
            key => bail!("unsupported key type {key:?}"),
        };

-        let payload = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)
+        let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)
            .context("Provided authentication token is not a valid JWT encoding")?;
-        let payload = serde_json::from_slice::<JwtPayload<'_>>(&payload)
+        let payload = serde_json::from_slice::<JwtPayload<'_>>(&payloadb)
            .context("Provided authentication token is not a valid JWT encoding")?;

        tracing::debug!(?payload, "JWT signature valid with claims");
@@ -327,7 +329,7 @@ impl JwkCacheEntryLock {
            );
        }

-        Ok(())
+        Ok(ComputeCredentialKeys::JwtPayload(payloadb))
    }
 }

@@ -339,7 +341,7 @@ impl JwkCache {
        role_name: &RoleName,
        fetch: &F,
        jwt: &str,
-    ) -> Result<(), anyhow::Error> {
+    ) -> Result<ComputeCredentialKeys, anyhow::Error> {
        // try with just a read lock first
        let key = (endpoint.clone(), role_name.clone());
        let entry = self.map.get(&key).as_deref().map(Arc::clone);
@@ -571,7 +573,7 @@ mod tests {
    use bytes::Bytes;
    use http::Response;
    use http_body_util::Full;
-    use hyper1::service::service_fn;
+    use hyper::service::service_fn;
    use hyper_util::rt::TokioIo;
    use rand::rngs::OsRng;
    use rsa::pkcs8::DecodePrivateKey;
@@ -736,7 +738,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
        });

        let listener = TcpListener::bind("0.0.0.0:0").await.unwrap();
-        let server = hyper1::server::conn::http1::Builder::new();
+        let server = hyper::server::conn::http1::Builder::new();
        let addr = listener.local_addr().unwrap();
        tokio::spawn(async move {
            loop {
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -8,6 +8,7 @@ use std::net::IpAddr;
 use std::sync::Arc;
 use std::time::Duration;

+pub use console_redirect::ConsoleRedirectBackend;
 pub(crate) use console_redirect::WebAuthError;
 use ipnet::{Ipv4Net, Ipv6Net};
 use local::LocalBackend;
@@ -21,7 +22,7 @@ use crate::cache::Cached;
 use crate::context::RequestMonitoring;
 use crate::control_plane::errors::GetAuthInfoError;
 use crate::control_plane::provider::{CachedRoleSecret, ControlPlaneBackend};
-use crate::control_plane::{AuthSecret, NodeInfo};
+use crate::control_plane::AuthSecret;
 use crate::intern::EndpointIdInt;
 use crate::metrics::Metrics;
 use crate::proxy::connect_compute::ComputeConnectBackend;
@@ -36,7 +37,7 @@ use crate::{
        provider::{CachedAllowedIps, CachedNodeInfo},
        Api,
    },
-    stream, url,
+    stream,
 };
 use crate::{scram, EndpointCacheKey, EndpointId, RoleName};

@@ -65,11 +66,9 @@ impl<T> std::ops::Deref for MaybeOwned<'_, T> {
 /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`],
 ///   this helps us provide the credentials only to those auth
 ///   backends which require them for the authentication process.
-pub enum Backend<'a, T, D> {
+pub enum Backend<'a, T> {
    /// Cloud API (V2).
    ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T),
-    /// Authentication via a web browser.
-    ConsoleRedirect(MaybeOwned<'a, url::ApiUrl>, D),
    /// Local proxy uses configured auth credentials and does not wake compute
    Local(MaybeOwned<'a, LocalBackend>),
 }
@@ -90,7 +89,7 @@ impl Clone for Box<dyn TestBackend> {
    }
 }

-impl std::fmt::Display for Backend<'_, (), ()> {
+impl std::fmt::Display for Backend<'_, ()> {
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::ControlPlane(api, ()) => match &**api {
@@ -106,46 +105,39 @@ impl std::fmt::Display for Backend<'_, (), ()> {
                #[cfg(test)]
                ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
            },
-            Self::ConsoleRedirect(url, ()) => fmt
-                .debug_tuple("ConsoleRedirect")
-                .field(&url.as_str())
-                .finish(),
            Self::Local(_) => fmt.debug_tuple("Local").finish(),
        }
    }
 }

-impl<T, D> Backend<'_, T, D> {
+impl<T> Backend<'_, T> {
    /// Very similar to [`std::option::Option::as_ref`].
    /// This helps us pass structured config to async tasks.
-    pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> {
+    pub(crate) fn as_ref(&self) -> Backend<'_, &T> {
        match self {
            Self::ControlPlane(c, x) => Backend::ControlPlane(MaybeOwned::Borrowed(c), x),
-            Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(MaybeOwned::Borrowed(c), x),
            Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)),
        }
    }
 }

-impl<'a, T, D> Backend<'a, T, D> {
+impl<'a, T> Backend<'a, T> {
    /// Very similar to [`std::option::Option::map`].
    /// Maps [`Backend<T>`] to [`Backend<R>`] by applying
    /// a function to a contained value.
-    pub(crate) fn map<R>(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> {
+    pub(crate) fn map<R>(self, f: impl FnOnce(T) -> R) -> Backend<'a, R> {
        match self {
            Self::ControlPlane(c, x) => Backend::ControlPlane(c, f(x)),
-            Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(c, x),
            Self::Local(l) => Backend::Local(l),
        }
    }
 }
-impl<'a, T, D, E> Backend<'a, Result<T, E>, D> {
+impl<'a, T, E> Backend<'a, Result<T, E>> {
    /// Very similar to [`std::option::Option::transpose`].
    /// This is most useful for error handling.
-    pub(crate) fn transpose(self) -> Result<Backend<'a, T, D>, E> {
+    pub(crate) fn transpose(self) -> Result<Backend<'a, T>, E> {
        match self {
            Self::ControlPlane(c, x) => x.map(|x| Backend::ControlPlane(c, x)),
-            Self::ConsoleRedirect(c, x) => Ok(Backend::ConsoleRedirect(c, x)),
            Self::Local(l) => Ok(Backend::Local(l)),
        }
    }
@@ -175,10 +167,12 @@ impl ComputeUserInfo {
    }
 }

+#[cfg_attr(test, derive(Debug))]
 pub(crate) enum ComputeCredentialKeys {
    #[cfg(any(test, feature = "testing"))]
    Password(Vec<u8>),
    AuthKeys(AuthKeys),
+    JwtPayload(Vec<u8>),
    None,
 }

@@ -239,7 +233,6 @@ impl AuthenticationConfig {
    pub(crate) fn check_rate_limit(
        &self,
        ctx: &RequestMonitoring,
-        config: &AuthenticationConfig,
        secret: AuthSecret,
        endpoint: &EndpointId,
        is_cleartext: bool,
@@ -263,7 +256,7 @@ impl AuthenticationConfig {
        let limit_not_exceeded = self.rate_limiter.check(
            (
                endpoint_int,
-                MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet),
+                MaskedIp::new(ctx.peer_addr(), self.rate_limit_ip_subnet),
            ),
            password_weight,
        );
@@ -337,7 +330,6 @@ async fn auth_quirks(
    let secret = if let Some(secret) = secret {
        config.check_rate_limit(
            ctx,
-            config,
            secret,
            &info.endpoint,
            unauthenticated_password.is_some() || allow_cleartext,
@@ -413,12 +405,11 @@ async fn authenticate_with_secret(
    classic::authenticate(ctx, info, client, config, secret).await
 }

-impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> {
+impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> {
    /// Get username from the credentials.
    pub(crate) fn get_user(&self) -> &str {
        match self {
            Self::ControlPlane(_, user_info) => &user_info.user,
-            Self::ConsoleRedirect(_, ()) => "web",
            Self::Local(_) => "local",
        }
    }
@@ -432,7 +423,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> {
        allow_cleartext: bool,
        config: &'static AuthenticationConfig,
        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-    ) -> auth::Result<Backend<'a, ComputeCredentials, NodeInfo>> {
+    ) -> auth::Result<Backend<'a, ComputeCredentials>> {
        let res = match self {
            Self::ControlPlane(api, user_info) => {
                info!(
@@ -453,14 +444,6 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> {
                .await?;
                Backend::ControlPlane(api, credentials)
            }
-            // NOTE: this auth backend doesn't use client credentials.
-            Self::ConsoleRedirect(url, ()) => {
-                info!("performing web authentication");
-
-                let info = console_redirect::authenticate(ctx, config, &url, client).await?;
-
-                Backend::ConsoleRedirect(url, info)
-            }
            Self::Local(_) => {
                return Err(auth::AuthError::bad_auth_method("invalid for local proxy"))
            }
@@ -471,14 +454,13 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> {
    }
 }

-impl Backend<'_, ComputeUserInfo, &()> {
+impl Backend<'_, ComputeUserInfo> {
    pub(crate) async fn get_role_secret(
        &self,
        ctx: &RequestMonitoring,
    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
        match self {
            Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await,
-            Self::ConsoleRedirect(_, ()) => Ok(Cached::new_uncached(None)),
            Self::Local(_) => Ok(Cached::new_uncached(None)),
        }
    }
@@ -491,21 +473,19 @@ impl Backend<'_, ComputeUserInfo, &()> {
            Self::ControlPlane(api, user_info) => {
                api.get_allowed_ips_and_secret(ctx, user_info).await
            }
-            Self::ConsoleRedirect(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
            Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
        }
    }
 }

 #[async_trait::async_trait]
-impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> {
+impl ComputeConnectBackend for Backend<'_, ComputeCredentials> {
    async fn wake_compute(
        &self,
        ctx: &RequestMonitoring,
    ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
        match self {
            Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Self::ConsoleRedirect(_, info) => Ok(Cached::new_uncached(info.clone())),
            Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
        }
    }
@@ -513,31 +493,6 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> {
    fn get_keys(&self) -> &ComputeCredentialKeys {
        match self {
            Self::ControlPlane(_, creds) => &creds.keys,
-            Self::ConsoleRedirect(_, _) => &ComputeCredentialKeys::None,
-            Self::Local(_) => &ComputeCredentialKeys::None,
-        }
-    }
-}
-
-#[async_trait::async_trait]
-impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> {
-    async fn wake_compute(
-        &self,
-        ctx: &RequestMonitoring,
-    ) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
-        match self {
-            Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await,
-            Self::ConsoleRedirect(_, ()) => {
-                unreachable!("web auth flow doesn't support waking the compute")
-            }
-            Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
-        }
-    }
-
-    fn get_keys(&self) -> &ComputeCredentialKeys {
-        match self {
-            Self::ControlPlane(_, creds) => &creds.keys,
-            Self::ConsoleRedirect(_, ()) => &ComputeCredentialKeys::None,
            Self::Local(_) => &ComputeCredentialKeys::None,
        }
    }
--- a/proxy/src/bin/local_proxy.rs
+++ b/proxy/src/bin/local_proxy.rs
@@ -6,9 +6,12 @@ use compute_api::spec::LocalProxySpec;
 use dashmap::DashMap;
 use futures::future::Either;
 use proxy::{
-    auth::backend::{
-        jwt::JwkCache,
-        local::{LocalBackend, JWKS_ROLE_MAP},
+    auth::{
+        self,
+        backend::{
+            jwt::JwkCache,
+            local::{LocalBackend, JWKS_ROLE_MAP},
+        },
    },
    cancellation::CancellationHandlerMain,
    config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig},
@@ -132,6 +135,7 @@ async fn main() -> anyhow::Result<()> {

    let args = LocalProxyCliArgs::parse();
    let config = build_config(&args)?;
+    let auth_backend = build_auth_backend(&args)?;

    // before we bind to any ports, write the process ID to a file
    // so that compute-ctl can find our process later
@@ -193,6 +197,7 @@ async fn main() -> anyhow::Result<()> {

    let task = serverless::task_main(
        config,
+        auth_backend,
        http_listener,
        shutdown.clone(),
        Arc::new(CancellationHandlerMain::new(
@@ -257,9 +262,6 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig

    Ok(Box::leak(Box::new(ProxyConfig {
        tls_config: None,
-        auth_backend: proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned(
-            LocalBackend::new(args.compute),
-        )),
        metric_collection: None,
        allow_self_signed_compute: false,
        http_config,
@@ -286,6 +288,17 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
    })))
 }

+/// Builds the [`auth::Backend`] once at proxy startup and leaks it so that it lives forever.
+fn build_auth_backend(
+    args: &LocalProxyCliArgs,
+) -> anyhow::Result<&'static auth::Backend<'static, ()>> {
+    let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned(
+        LocalBackend::new(args.compute),
+    ));
+
+    Ok(Box::leak(Box::new(auth_backend)))
+}
+
 async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc<Notify>) {
    loop {
        rx.notified().await;
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -10,6 +10,7 @@ use futures::future::Either;
 use proxy::auth;
 use proxy::auth::backend::jwt::JwkCache;
 use proxy::auth::backend::AuthRateLimiter;
+use proxy::auth::backend::ConsoleRedirectBackend;
 use proxy::auth::backend::MaybeOwned;
 use proxy::cancellation::CancelMap;
 use proxy::cancellation::CancellationHandler;
@@ -311,8 +312,12 @@ async fn main() -> anyhow::Result<()> {

    let args = ProxyCliArgs::parse();
    let config = build_config(&args)?;
+    let auth_backend = build_auth_backend(&args)?;

-    info!("Authentication backend: {}", config.auth_backend);
+    match auth_backend {
+        Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"),
+        Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"),
+    };
    info!("Using region: {}", args.aws_region);

    let region_provider =
@@ -459,24 +464,41 @@ async fn main() -> anyhow::Result<()> {
    // client facing tasks. these will exit on error or on cancellation
    // cancellation returns Ok(())
    let mut client_tasks = JoinSet::new();
-    if let Some(proxy_listener) = proxy_listener {
-        client_tasks.spawn(proxy::proxy::task_main(
-            config,
-            proxy_listener,
-            cancellation_token.clone(),
-            cancellation_handler.clone(),
-            endpoint_rate_limiter.clone(),
-        ));
-    }
+    match auth_backend {
+        Either::Left(auth_backend) => {
+            if let Some(proxy_listener) = proxy_listener {
+                client_tasks.spawn(proxy::proxy::task_main(
+                    config,
+                    auth_backend,
+                    proxy_listener,
+                    cancellation_token.clone(),
+                    cancellation_handler.clone(),
+                    endpoint_rate_limiter.clone(),
+                ));
+            }

-    if let Some(serverless_listener) = serverless_listener {
-        client_tasks.spawn(serverless::task_main(
-            config,
-            serverless_listener,
-            cancellation_token.clone(),
-            cancellation_handler.clone(),
-            endpoint_rate_limiter.clone(),
-        ));
+            if let Some(serverless_listener) = serverless_listener {
+                client_tasks.spawn(serverless::task_main(
+                    config,
+                    auth_backend,
+                    serverless_listener,
+                    cancellation_token.clone(),
+                    cancellation_handler.clone(),
+                    endpoint_rate_limiter.clone(),
+                ));
+            }
+        }
+        Either::Right(auth_backend) => {
+            if let Some(proxy_listener) = proxy_listener {
+                client_tasks.spawn(proxy::console_redirect_proxy::task_main(
+                    config,
+                    auth_backend,
+                    proxy_listener,
+                    cancellation_token.clone(),
+                    cancellation_handler.clone(),
+                ));
+            }
+        }
    }

    client_tasks.spawn(proxy::context::parquet::worker(
@@ -506,7 +528,7 @@ async fn main() -> anyhow::Result<()> {
        ));
    }

-    if let auth::Backend::ControlPlane(api, _) = &config.auth_backend {
+    if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend {
        if let proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api {
            match (redis_notifications_client, regional_redis_client.clone()) {
                (None, None) => {}
@@ -610,73 +632,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        bail!("dynamic rate limiter should be disabled");
    }

-    let auth_backend = match &args.auth_backend {
-        AuthBackendType::Console => {
-            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
-            let project_info_cache_config: ProjectInfoCacheOptions =
-                args.project_info_cache.parse()?;
-            let endpoint_cache_config: config::EndpointCacheConfig =
-                args.endpoint_cache_config.parse()?;
-
-            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
-            info!(
-                "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
-            );
-            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
-            let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
-                wake_compute_cache_config,
-                project_info_cache_config,
-                endpoint_cache_config,
-            )));
-
-            let config::ConcurrencyLockOptions {
-                shards,
-                limiter,
-                epoch,
-                timeout,
-            } = args.wake_compute_lock.parse()?;
-            info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
-            let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
-                "wake_compute_lock",
-                limiter,
-                shards,
-                timeout,
-                epoch,
-                &Metrics::get().wake_compute_lock,
-            )?));
-            tokio::spawn(locks.garbage_collect_worker());
-
-            let url = args.auth_endpoint.parse()?;
-            let endpoint = http::Endpoint::new(url, http::new_client());
-
-            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
-            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
-            let wake_compute_endpoint_rate_limiter =
-                Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
-            let api = control_plane::provider::neon::Api::new(
-                endpoint,
-                caches,
-                locks,
-                wake_compute_endpoint_rate_limiter,
-            );
-            let api = control_plane::provider::ControlPlaneBackend::Management(api);
-            auth::Backend::ControlPlane(MaybeOwned::Owned(api), ())
-        }
-
-        AuthBackendType::Web => {
-            let url = args.uri.parse()?;
-            auth::Backend::ConsoleRedirect(MaybeOwned::Owned(url), ())
-        }
-
-        #[cfg(feature = "testing")]
-        AuthBackendType::Postgres => {
-            let url = args.auth_endpoint.parse()?;
-            let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy);
-            let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api);
-            auth::Backend::ControlPlane(MaybeOwned::Owned(api), ())
-        }
-    };
-
    let config::ConcurrencyLockOptions {
        shards,
        limiter,
@@ -726,9 +681,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        webauth_confirmation_timeout: args.webauth_confirmation_timeout,
    };

-    let config = Box::leak(Box::new(ProxyConfig {
+    let config = ProxyConfig {
        tls_config,
-        auth_backend,
        metric_collection,
        allow_self_signed_compute: args.allow_self_signed_compute,
        http_config,
@@ -741,13 +695,100 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        connect_to_compute_retry_config: config::RetryConfig::parse(
            &args.connect_to_compute_retry,
        )?,
-    }));
+    };
+
+    let config = Box::leak(Box::new(config));

    tokio::spawn(config.connect_compute_locks.garbage_collect_worker());

    Ok(config)
 }

+/// Builds the regular or console-redirect auth backend once at proxy startup and leaks it.
+fn build_auth_backend(
+    args: &ProxyCliArgs,
+) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
+    match &args.auth_backend {
+        AuthBackendType::Console => {
+            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
+            let project_info_cache_config: ProjectInfoCacheOptions =
+                args.project_info_cache.parse()?;
+            let endpoint_cache_config: config::EndpointCacheConfig =
+                args.endpoint_cache_config.parse()?;
+
+            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
+            info!(
+                "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
+            );
+            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
+            let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
+                wake_compute_cache_config,
+                project_info_cache_config,
+                endpoint_cache_config,
+            )));
+
+            let config::ConcurrencyLockOptions {
+                shards,
+                limiter,
+                epoch,
+                timeout,
+            } = args.wake_compute_lock.parse()?;
+            info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
+            let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
+                "wake_compute_lock",
+                limiter,
+                shards,
+                timeout,
+                epoch,
+                &Metrics::get().wake_compute_lock,
+            )?));
+            tokio::spawn(locks.garbage_collect_worker());
+
+            let url = args.auth_endpoint.parse()?;
+            let endpoint = http::Endpoint::new(url, http::new_client());
+
+            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
+            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
+            let wake_compute_endpoint_rate_limiter =
+                Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
+            let api = control_plane::provider::neon::Api::new(
+                endpoint,
+                caches,
+                locks,
+                wake_compute_endpoint_rate_limiter,
+            );
+            let api = control_plane::provider::ControlPlaneBackend::Management(api);
+            let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
+
+            let config = Box::leak(Box::new(auth_backend));
+
+            Ok(Either::Left(config))
+        }
+
+        #[cfg(feature = "testing")]
+        AuthBackendType::Postgres => {
+            let url = args.auth_endpoint.parse()?;
+            let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy);
+            let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api);
+
+            let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
+
+            let config = Box::leak(Box::new(auth_backend));
+
+            Ok(Either::Left(config))
+        }
+
+        AuthBackendType::Web => {
+            let url = args.uri.parse()?;
+            let backend = ConsoleRedirectBackend::new(url);
+
+            let config = Box::leak(Box::new(backend));
+
+            Ok(Either::Right(config))
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use std::time::Duration;
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,8 +1,5 @@
 use crate::{
-    auth::{
-        self,
-        backend::{jwt::JwkCache, AuthRateLimiter},
-    },
+    auth::backend::{jwt::JwkCache, AuthRateLimiter},
    control_plane::locks::ApiLocks,
    rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig},
    scram::threadpool::ThreadPool,
@@ -29,7 +26,6 @@ use x509_parser::oid_registry;

 pub struct ProxyConfig {
    pub tls_config: Option<TlsConfig>,
-    pub auth_backend: auth::Backend<'static, (), ()>,
    pub metric_collection: Option<MetricCollectionConfig>,
    pub allow_self_signed_compute: bool,
    pub http_config: HttpConfig,
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -0,0 +1,217 @@
+use crate::auth::backend::ConsoleRedirectBackend;
+use crate::config::{ProxyConfig, ProxyProtocolV2};
+use crate::proxy::{
+    prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource,
+};
+use crate::{
+    cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal},
+    context::RequestMonitoring,
+    error::ReportableError,
+    metrics::{Metrics, NumClientConnectionsGuard},
+    protocol2::read_proxy_protocol,
+    proxy::handshake::{handshake, HandshakeData},
+};
+use futures::TryFutureExt;
+use std::sync::Arc;
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info, Instrument};
+
+use crate::proxy::{
+    connect_compute::{connect_to_compute, TcpMechanism},
+    passthrough::ProxyPassthrough,
+};
+
+pub async fn task_main(
+    config: &'static ProxyConfig,
+    backend: &'static ConsoleRedirectBackend,
+    listener: tokio::net::TcpListener,
+    cancellation_token: CancellationToken,
+    cancellation_handler: Arc<CancellationHandlerMain>,
+) -> anyhow::Result<()> {
+    scopeguard::defer! {
+        info!("proxy has shut down");
+    }
+
+    // When set for the server socket, the keepalive setting
+    // will be inherited by all accepted client sockets.
+    socket2::SockRef::from(&listener).set_keepalive(true)?;
+
+    let connections = tokio_util::task::task_tracker::TaskTracker::new();
+
+    while let Some(accept_result) =
+        run_until_cancelled(listener.accept(), &cancellation_token).await
+    {
+        let (socket, peer_addr) = accept_result?;
+
+        let conn_gauge = Metrics::get()
+            .proxy
+            .client_connections
+            .guard(crate::metrics::Protocol::Tcp);
+
+        let session_id = uuid::Uuid::new_v4();
+        let cancellation_handler = Arc::clone(&cancellation_handler);
+
+        tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
+
+        connections.spawn(async move {
+            let (socket, peer_addr) = match read_proxy_protocol(socket).await {
+                Err(e) => {
+                    error!("per-client task finished with an error: {e:#}");
+                    return;
+                }
+                Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
+                    error!("missing required proxy protocol header");
+                    return;
+                }
+                Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
+                    error!("proxy protocol header not supported");
+                    return;
+                }
+                Ok((socket, Some(addr))) => (socket, addr.ip()),
+                Ok((socket, None)) => (socket, peer_addr.ip()),
+            };
+
+            match socket.inner.set_nodelay(true) {
+                Ok(()) => {}
+                Err(e) => {
+                    error!("per-client task finished with an error: failed to set socket option: {e:#}");
+                    return;
+                }
+            };
+
+            let ctx = RequestMonitoring::new(
+                session_id,
+                peer_addr,
+                crate::metrics::Protocol::Tcp,
+                &config.region,
+            );
+            let span = ctx.span();
+
+            let startup = Box::pin(
+                handle_client(
+                    config,
+                    backend,
+                    &ctx,
+                    cancellation_handler,
+                    socket,
+                    conn_gauge,
+                )
+                .instrument(span.clone()),
+            );
+            let res = startup.await;
+
+            match res {
+                Err(e) => {
+                    // todo: log and push to ctx the error kind
+                    ctx.set_error_kind(e.get_error_kind());
+                    error!(parent: &span, "per-client task finished with an error: {e:#}");
+                }
+                Ok(None) => {
+                    ctx.set_success();
+                }
+                Ok(Some(p)) => {
+                    ctx.set_success();
+                    ctx.log_connect();
+                    match p.proxy_pass().instrument(span.clone()).await {
+                        Ok(()) => {}
+                        Err(ErrorSource::Client(e)) => {
+                            error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}");
+                        }
+                        Err(ErrorSource::Compute(e)) => {
+                            error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}");
+                        }
+                    }
+                }
+            }
+        });
+    }
+
+    connections.close();
+    drop(listener);
+
+    // Drain connections
+    connections.wait().await;
+
+    Ok(())
+}
+
+pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
+    config: &'static ProxyConfig,
+    backend: &'static ConsoleRedirectBackend,
+    ctx: &RequestMonitoring,
+    cancellation_handler: Arc<CancellationHandlerMain>,
+    stream: S,
+    conn_gauge: NumClientConnectionsGuard<'static>,
+) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
+    info!(
+        protocol = %ctx.protocol(),
+        "handling interactive connection from client"
+    );
+
+    let metrics = &Metrics::get().proxy;
+    let proto = ctx.protocol();
+    let request_gauge = metrics.connection_requests.guard(proto);
+
+    let tls = config.tls_config.as_ref();
+
+    let record_handshake_error = !ctx.has_private_peer_addr();
+    let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
+    let do_handshake = handshake(ctx, stream, tls, record_handshake_error);
+    let (mut stream, params) =
+        match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
+            HandshakeData::Startup(stream, params) => (stream, params),
+            HandshakeData::Cancel(cancel_key_data) => {
+                return Ok(cancellation_handler
+                    .cancel_session(cancel_key_data, ctx.session_id())
+                    .await
+                    .map(|()| None)?)
+            }
+        };
+    drop(pause);
+
+    ctx.set_db_options(params.clone());
+
+    let user_info = match backend
+        .authenticate(ctx, &config.authentication_config, &mut stream)
+        .await
+    {
+        Ok(auth_result) => auth_result,
+        Err(e) => {
+            return stream.throw_error(e).await?;
+        }
+    };
+
+    let mut node = connect_to_compute(
+        ctx,
+        &TcpMechanism {
+            params: &params,
+            locks: &config.connect_compute_locks,
+        },
+        &user_info,
+        config.allow_self_signed_compute,
+        config.wake_compute_retry_config,
+        config.connect_to_compute_retry_config,
+    )
+    .or_else(|e| stream.throw_error(e))
+    .await?;
+
+    let session = cancellation_handler.get_session();
+    prepare_client_connection(&node, &session, &mut stream).await?;
+
+    // Before proxy passing, forward to compute whatever data is left in the
+    // PqStream input buffer. Normally there is none, but our serverless npm
+    // driver in pipeline mode sends startup, password and first query
+    // immediately after opening the connection.
+    let (stream, read_buf) = stream.into_inner();
+    node.stream.write_all(&read_buf).await?;
+
+    Ok(Some(ProxyPassthrough {
+        client: stream,
+        aux: node.aux.clone(),
+        compute: node,
+        _req: request_gauge,
+        _conn: conn_gauge,
+        _cancel: session,
+    }))
+}
--- a/proxy/src/control_plane/provider/mod.rs
+++ b/proxy/src/control_plane/provider/mod.rs
@@ -81,12 +81,12 @@ pub(crate) mod errors {
                    Reason::EndpointNotFound => ErrorKind::User,
                    Reason::BranchNotFound => ErrorKind::User,
                    Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
-                    Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::User,
-                    Reason::ActiveTimeQuotaExceeded => ErrorKind::User,
-                    Reason::ComputeTimeQuotaExceeded => ErrorKind::User,
-                    Reason::WrittenDataQuotaExceeded => ErrorKind::User,
-                    Reason::DataTransferQuotaExceeded => ErrorKind::User,
-                    Reason::LogicalSizeQuotaExceeded => ErrorKind::User,
+                    Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota,
+                    Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota,
+                    Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota,
+                    Reason::WrittenDataQuotaExceeded => ErrorKind::Quota,
+                    Reason::DataTransferQuotaExceeded => ErrorKind::Quota,
+                    Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota,
                    Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
                    Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
                    Reason::RunningOperations => ErrorKind::ControlPlane,
@@ -103,7 +103,7 @@ pub(crate) mod errors {
                        } if error
                            .contains("compute time quota of non-primary branches is exceeded") =>
                        {
-                            crate::error::ErrorKind::User
+                            crate::error::ErrorKind::Quota
                        }
                        ControlPlaneError {
                            http_status_code: http::StatusCode::LOCKED,
@@ -112,7 +112,7 @@ pub(crate) mod errors {
                        } if error.contains("quota exceeded")
                            || error.contains("the limit for current plan reached") =>
                        {
-                            crate::error::ErrorKind::User
+                            crate::error::ErrorKind::Quota
                        }
                        ControlPlaneError {
                            http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
@@ -309,7 +309,7 @@ impl NodeInfo {
            #[cfg(any(test, feature = "testing"))]
            ComputeCredentialKeys::Password(password) => self.config.password(password),
            ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
-            ComputeCredentialKeys::None => &mut self.config,
+            ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => &mut self.config,
        };
    }
 }
--- a/proxy/src/control_plane/provider/neon.rs
+++ b/proxy/src/control_plane/provider/neon.rs
@@ -22,7 +22,7 @@ use futures::TryFutureExt;
 use std::{sync::Arc, time::Duration};
 use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
-use tracing::{debug, error, info, info_span, warn, Instrument};
+use tracing::{debug, info, info_span, warn, Instrument};

 const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");

@@ -456,7 +456,7 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
    });
    body.http_status_code = status;

-    error!("console responded with an error ({status}): {body:?}");
+    warn!("console responded with an error ({status}): {body:?}");
    Err(ApiError::ControlPlane(body))
 }

--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -49,6 +49,10 @@ pub enum ErrorKind {
    #[label(rename = "serviceratelimit")]
    ServiceRateLimit,

+    /// Quota exceeded, as reported by the control plane (e.g. compute time or data transfer)
+    #[label(rename = "quota")]
+    Quota,
+
    /// internal errors
    Service,

@@ -70,6 +74,7 @@ impl ErrorKind {
            ErrorKind::ClientDisconnect => "clientdisconnect",
            ErrorKind::RateLimit => "ratelimit",
            ErrorKind::ServiceRateLimit => "serviceratelimit",
+            ErrorKind::Quota => "quota",
            ErrorKind::Service => "service",
            ErrorKind::ControlPlane => "controlplane",
            ErrorKind::Postgres => "postgres",
--- a/proxy/src/http/health_server.rs
+++ b/proxy/src/http/health_server.rs
@@ -1,5 +1,5 @@
 use anyhow::{anyhow, bail};
-use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode};
+use hyper0::{header::CONTENT_TYPE, Body, Request, Response, StatusCode};
 use measured::{text::BufferedTextEncoder, MetricGroup};
 use metrics::NeonMetrics;
 use std::{
@@ -21,7 +21,7 @@ async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, "")
 }

-fn make_router(metrics: AppMetrics) -> RouterBuilder<hyper::Body, ApiError> {
+fn make_router(metrics: AppMetrics) -> RouterBuilder<hyper0::Body, ApiError> {
    let state = Arc::new(Mutex::new(PrometheusHandler {
        encoder: BufferedTextEncoder::new(),
        metrics,
@@ -45,7 +45,7 @@ pub async fn task_main(

    let service = || RouterService::new(make_router(metrics).build()?);

-    hyper::Server::from_tcp(http_listener)?
+    hyper0::Server::from_tcp(http_listener)?
        .serve(service().map_err(|e| anyhow!(e))?)
        .await?;

--- a/proxy/src/http/mod.rs
+++ b/proxy/src/http/mod.rs
@@ -9,7 +9,7 @@ use std::time::Duration;
 use anyhow::bail;
 use bytes::Bytes;
 use http_body_util::BodyExt;
-use hyper1::body::Body;
+use hyper::body::Body;
 use serde::de::DeserializeOwned;

 pub(crate) use reqwest::{Request, Response};
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -90,13 +90,12 @@ use tokio::task::JoinError;
 use tokio_util::sync::CancellationToken;
 use tracing::warn;

-extern crate hyper0 as hyper;
-
 pub mod auth;
 pub mod cache;
 pub mod cancellation;
 pub mod compute;
 pub mod config;
+pub mod console_redirect_proxy;
 pub mod context;
 pub mod control_plane;
 pub mod error;
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -35,7 +35,7 @@ use std::sync::Arc;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, Instrument};
+use tracing::{error, info, warn, Instrument};

 use self::{
    connect_compute::{connect_to_compute, TcpMechanism},
@@ -61,6 +61,7 @@ pub async fn run_until_cancelled<F: std::future::Future>(

 pub async fn task_main(
    config: &'static ProxyConfig,
+    auth_backend: &'static auth::Backend<'static, ()>,
    listener: tokio::net::TcpListener,
    cancellation_token: CancellationToken,
    cancellation_handler: Arc<CancellationHandlerMain>,
@@ -95,15 +96,15 @@ pub async fn task_main(
        connections.spawn(async move {
            let (socket, peer_addr) = match read_proxy_protocol(socket).await {
                Err(e) => {
-                    error!("per-client task finished with an error: {e:#}");
+                    warn!("per-client task finished with an error: {e:#}");
                    return;
                }
                Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
-                    error!("missing required proxy protocol header");
+                    warn!("missing required proxy protocol header");
                    return;
                }
                Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
-                    error!("proxy protocol header not supported");
+                    warn!("proxy protocol header not supported");
                    return;
                }
                Ok((socket, Some(addr))) => (socket, addr.ip()),
@@ -129,6 +130,7 @@ pub async fn task_main(
            let startup = Box::pin(
                handle_client(
                    config,
+                    auth_backend,
                    &ctx,
                    cancellation_handler,
                    socket,
@@ -144,7 +146,7 @@ pub async fn task_main(
                Err(e) => {
                    // todo: log and push to ctx the error kind
                    ctx.set_error_kind(e.get_error_kind());
-                    error!(parent: &span, "per-client task finished with an error: {e:#}");
+                    warn!(parent: &span, "per-client task finished with an error: {e:#}");
                }
                Ok(None) => {
                    ctx.set_success();
@@ -155,7 +157,7 @@ pub async fn task_main(
                    match p.proxy_pass().instrument(span.clone()).await {
                        Ok(()) => {}
                        Err(ErrorSource::Client(e)) => {
-                            error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}");
+                            warn!(parent: &span, "per-client task finished with an IO error from the client: {e:#}");
                        }
                        Err(ErrorSource::Compute(e)) => {
                            error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}");
@@ -243,8 +245,10 @@ impl ReportableError for ClientRequestError {
    }
 }

+#[allow(clippy::too_many_arguments)]
 pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
+    auth_backend: &'static auth::Backend<'static, ()>,
    ctx: &RequestMonitoring,
    cancellation_handler: Arc<CancellationHandlerMain>,
    stream: S,
@@ -285,8 +289,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let common_names = tls.map(|tls| &tls.common_names);

    // Extract credentials which we're going to use for auth.
-    let result = config
-        .auth_backend
+    let result = auth_backend
        .as_ref()
        .map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names))
        .transpose();
@@ -353,7 +356,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(

 /// Finish client connection initialization: confirm auth success, send params, etc.
 #[tracing::instrument(skip_all)]
-async fn prepare_client_connection<P>(
+pub(crate) async fn prepare_client_connection<P>(
    node: &compute::PostgresConnection,
    session: &cancellation::Session<P>,
    stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -71,7 +71,7 @@ impl<P, S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<P, S> {
    pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> {
        let res = proxy_pass(self.client, self.compute.stream, self.aux).await;
        if let Err(err) = self.compute.cancel_closure.try_cancel_query().await {
-            tracing::error!(?err, "could not cancel the query in the database");
+            tracing::warn!(?err, "could not cancel the query in the database");
        }
        res
    }
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -552,7 +552,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn

 fn helper_create_connect_info(
    mechanism: &TestConnectMechanism,
-) -> auth::Backend<'static, ComputeCredentials, &()> {
+) -> auth::Backend<'static, ComputeCredentials> {
    let user_info = auth::Backend::ControlPlane(
        MaybeOwned::Owned(ControlPlaneBackend::Test(Box::new(mechanism.clone()))),
        ComputeCredentials {
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -7,7 +7,7 @@ use crate::metrics::{
    WakeupFailureKind,
 };
 use crate::proxy::retry::{retry_after, should_retry};
-use hyper1::StatusCode;
+use hyper::StatusCode;
 use tracing::{error, info, warn};

 use super::connect_compute::ComputeConnectBackend;
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -6,7 +6,7 @@ use redis::{
    ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult,
 };
 use tokio::task::JoinHandle;
-use tracing::{debug, error, info};
+use tracing::{debug, error, info, warn};

 use super::elasticache::CredentialsProvider;

@@ -89,7 +89,7 @@ impl ConnectionWithCredentialsProvider {
                    return Ok(());
                }
                Err(e) => {
-                    error!("Error during PING: {e:?}");
+                    warn!("Error during PING: {e:?}");
                }
            }
        } else {
@@ -121,7 +121,7 @@ impl ConnectionWithCredentialsProvider {
                info!("Connection succesfully established");
            }
            Err(e) => {
-                error!("Connection is broken. Error during PING: {e:?}");
+                warn!("Connection is broken. Error during PING: {e:?}");
            }
        }
        self.con = Some(con);
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -146,7 +146,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                {
                    Ok(()) => {}
                    Err(e) => {
-                        tracing::error!("failed to cancel session: {e}");
+                        tracing::warn!("failed to cancel session: {e}");
                    }
                }
            }
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -3,15 +3,17 @@ use std::{io, sync::Arc, time::Duration};
 use async_trait::async_trait;
 use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer};
 use tokio::net::{lookup_host, TcpStream};
-use tracing::{field::display, info};
+use tokio_postgres::types::ToSql;
+use tracing::{debug, field::display, info};

 use crate::{
    auth::{
+        self,
        backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo},
        check_peer_addr_is_in_list, AuthError,
    },
    compute,
-    config::{AuthenticationConfig, ProxyConfig},
+    config::ProxyConfig,
    context::RequestMonitoring,
    control_plane::{
        errors::{GetAuthInfoError, WakeComputeError},
@@ -26,18 +28,21 @@ use crate::{
        retry::{CouldRetry, ShouldRetryWakeCompute},
    },
    rate_limiter::EndpointRateLimiter,
-    Host,
+    EndpointId, Host,
 };

 use super::{
    conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool},
    http_conn_pool::{self, poll_http2_client},
+    local_conn_pool::{self, LocalClient, LocalConnPool},
 };

 pub(crate) struct PoolingBackend {
    pub(crate) http_conn_pool: Arc<super::http_conn_pool::GlobalConnPool>,
+    pub(crate) local_pool: Arc<LocalConnPool<tokio_postgres::Client>>,
    pub(crate) pool: Arc<GlobalConnPool<tokio_postgres::Client>>,
    pub(crate) config: &'static ProxyConfig,
+    pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>,
    pub(crate) endpoint_rate_limiter: Arc<EndpointRateLimiter>,
 }

@@ -45,18 +50,13 @@ impl PoolingBackend {
    pub(crate) async fn authenticate_with_password(
        &self,
        ctx: &RequestMonitoring,
-        config: &AuthenticationConfig,
        user_info: &ComputeUserInfo,
        password: &[u8],
    ) -> Result<ComputeCredentials, AuthError> {
        let user_info = user_info.clone();
-        let backend = self
-            .config
-            .auth_backend
-            .as_ref()
-            .map(|()| user_info.clone());
+        let backend = self.auth_backend.as_ref().map(|()| user_info.clone());
        let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
-        if config.ip_allowlist_check_enabled
+        if self.config.authentication_config.ip_allowlist_check_enabled
            && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips)
        {
            return Err(AuthError::ip_address_not_allowed(ctx.peer_addr()));
@@ -75,7 +75,6 @@ impl PoolingBackend {
        let secret = match cached_secret.value.clone() {
            Some(secret) => self.config.authentication_config.check_rate_limit(
                ctx,
-                config,
                secret,
                &user_info.endpoint,
                true,
@@ -87,9 +86,13 @@ impl PoolingBackend {
            }
        };
        let ep = EndpointIdInt::from(&user_info.endpoint);
-        let auth_outcome =
-            crate::auth::validate_password_and_exchange(&config.thread_pool, ep, password, secret)
-                .await?;
+        let auth_outcome = crate::auth::validate_password_and_exchange(
+            &self.config.authentication_config.thread_pool,
+            ep,
+            password,
+            secret,
+        )
+        .await?;
        let res = match auth_outcome {
            crate::sasl::Outcome::Success(key) => {
                info!("user successfully authenticated");
@@ -109,13 +112,13 @@ impl PoolingBackend {
    pub(crate) async fn authenticate_with_jwt(
        &self,
        ctx: &RequestMonitoring,
-        config: &AuthenticationConfig,
        user_info: &ComputeUserInfo,
        jwt: String,
-    ) -> Result<(), AuthError> {
-        match &self.config.auth_backend {
+    ) -> Result<ComputeCredentials, AuthError> {
+        match &self.auth_backend {
            crate::auth::Backend::ControlPlane(console, ()) => {
-                config
+                self.config
+                    .authentication_config
                    .jwks_cache
                    .check_jwt(
                        ctx,
@@ -127,13 +130,15 @@ impl PoolingBackend {
                    .await
                    .map_err(|e| AuthError::auth_failed(e.to_string()))?;

-                Ok(())
+                Ok(ComputeCredentials {
+                    info: user_info.clone(),
+                    keys: crate::auth::backend::ComputeCredentialKeys::None,
+                })
            }
-            crate::auth::Backend::ConsoleRedirect(_, ()) => Err(AuthError::auth_failed(
-                "JWT login over web auth proxy is not supported",
-            )),
            crate::auth::Backend::Local(_) => {
-                config
+                let keys = self
+                    .config
+                    .authentication_config
                    .jwks_cache
                    .check_jwt(
                        ctx,
@@ -145,8 +150,10 @@ impl PoolingBackend {
                    .await
                    .map_err(|e| AuthError::auth_failed(e.to_string()))?;

-                // todo: rewrite JWT signature with key shared somehow between local proxy and postgres
-                Ok(())
+                Ok(ComputeCredentials {
+                    info: user_info.clone(),
+                    keys,
+                })
            }
        }
    }
@@ -176,7 +183,7 @@ impl PoolingBackend {
        let conn_id = uuid::Uuid::new_v4();
        tracing::Span::current().record("conn_id", display(conn_id));
        info!(%conn_id, "pool: opening a new connection '{conn_info}'");
-        let backend = self.config.auth_backend.as_ref().map(|()| keys);
+        let backend = self.auth_backend.as_ref().map(|()| keys);
        crate::proxy::connect_compute::connect_to_compute(
            ctx,
            &TokioMechanism {
@@ -208,14 +215,14 @@ impl PoolingBackend {
        let conn_id = uuid::Uuid::new_v4();
        tracing::Span::current().record("conn_id", display(conn_id));
        info!(%conn_id, "pool: opening a new connection '{conn_info}'");
-        let backend = self
-            .config
-            .auth_backend
-            .as_ref()
-            .map(|()| ComputeCredentials {
-                info: conn_info.user_info.clone(),
-                keys: crate::auth::backend::ComputeCredentialKeys::None,
-            });
+        let backend = self.auth_backend.as_ref().map(|()| ComputeCredentials {
+            info: ComputeUserInfo {
+                user: conn_info.user_info.user.clone(),
+                endpoint: EndpointId::from(format!("{}-local-proxy", conn_info.user_info.endpoint)),
+                options: conn_info.user_info.options.clone(),
+            },
+            keys: crate::auth::backend::ComputeCredentialKeys::None,
+        });
        crate::proxy::connect_compute::connect_to_compute(
            ctx,
            &HyperMechanism {
@@ -231,6 +238,77 @@ impl PoolingBackend {
        )
        .await
    }
+
+    /// Connect to postgres over localhost.
+    ///
+    /// A pooled connection matching `conn_info` is returned as-is when
+    /// `local_pool` has one. Otherwise a new connection is opened with a copy
+    /// of the local backend's `node_info` config (user and dbname taken from
+    /// `conn_info`), handed to `local_conn_pool::poll_client`, and its session
+    /// state is initialised by calling `auth.init` with the backend process id
+    /// as the key id and the verifying key of `handle.key()` encoded as a JWK.
+    ///
+    /// We expect postgres to be started here, so we won't do any retries.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the pool lookup fails, if the connection cannot be
+    /// established, or if the `auth.init` query fails.
+    ///
+    /// # Panics
+    ///
+    /// Panics if called with a non-local_proxy backend.
+    // `conn_id` has to be declared on the span up front: `Span::record` does
+    // nothing for a field the span was not created with.
+    #[tracing::instrument(
+        fields(pid = tracing::field::Empty, conn_id = tracing::field::Empty),
+        skip_all
+    )]
+    pub(crate) async fn connect_to_local_postgres(
+        &self,
+        ctx: &RequestMonitoring,
+        conn_info: ConnInfo,
+    ) -> Result<LocalClient<tokio_postgres::Client>, HttpConnError> {
+        // Fast path: reuse a pooled connection for this (db, user) if there is one.
+        if let Some(client) = self.local_pool.get(ctx, &conn_info)? {
+            return Ok(client);
+        }
+
+        let conn_id = uuid::Uuid::new_v4();
+        tracing::Span::current().record("conn_id", display(conn_id));
+        info!(%conn_id, "local_pool: opening a new connection '{conn_info}'");
+
+        // Work on a clone so that setting user/dbname below does not touch the
+        // config stored in the backend.
+        let mut node_info = match &self.auth_backend {
+            auth::Backend::ControlPlane(_, ()) => {
+                unreachable!("only local_proxy can connect to local postgres")
+            }
+            auth::Backend::Local(local) => local.node_info.clone(),
+        };
+
+        let config = node_info
+            .config
+            .user(&conn_info.user_info.user)
+            .dbname(&conn_info.dbname);
+
+        // Time spent connecting is accounted to `Waiting::Compute`.
+        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
+        let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
+        drop(pause);
+
+        tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
+
+        let handle = local_conn_pool::poll_client(
+            self.local_pool.clone(),
+            ctx,
+            conn_info,
+            client,
+            connection,
+            conn_id,
+            node_info.aux.clone(),
+        );
+
+        // The backend process id doubles as the key id passed to `auth.init`.
+        let kid = handle.get_client().get_process_id() as i64;
+        let jwk = p256::PublicKey::from(handle.key().verifying_key()).to_jwk();
+
+        debug!(kid, ?jwk, "setting up backend session state");
+
+        // initiates the auth session
+        handle
+            .get_client()
+            .query(
+                "select auth.init($1, $2);",
+                &[
+                    &kid as &(dyn ToSql + Sync),
+                    &tokio_postgres::types::Json(jwk),
+                ],
+            )
+            .await?;
+
+        info!(?kid, "backend session state init");
+
+        Ok(handle)
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -241,6 +319,8 @@ pub(crate) enum HttpConnError {
    PostgresConnectionError(#[from] tokio_postgres::Error),
    #[error("could not connection to local-proxy in compute")]
    LocalProxyConnectionError(#[from] LocalProxyConnError),
+    #[error("could not parse JWT payload")]
+    JwtPayloadError(serde_json::Error),

    #[error("could not get auth info")]
    GetAuthInfo(#[from] GetAuthInfoError),
@@ -257,7 +337,7 @@ pub(crate) enum LocalProxyConnError {
    #[error("error with connection to local-proxy")]
    Io(#[source] std::io::Error),
    #[error("could not establish h2 connection")]
-    H2(#[from] hyper1::Error),
+    H2(#[from] hyper::Error),
 }

 impl ReportableError for HttpConnError {
@@ -266,6 +346,7 @@ impl ReportableError for HttpConnError {
            HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute,
            HttpConnError::PostgresConnectionError(p) => p.get_error_kind(),
            HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute,
+            HttpConnError::JwtPayloadError(_) => ErrorKind::User,
            HttpConnError::GetAuthInfo(a) => a.get_error_kind(),
            HttpConnError::AuthError(a) => a.get_error_kind(),
            HttpConnError::WakeCompute(w) => w.get_error_kind(),
@@ -280,6 +361,7 @@ impl UserFacingError for HttpConnError {
            HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(),
            HttpConnError::PostgresConnectionError(p) => p.to_string(),
            HttpConnError::LocalProxyConnectionError(p) => p.to_string(),
+            HttpConnError::JwtPayloadError(p) => p.to_string(),
            HttpConnError::GetAuthInfo(c) => c.to_string_client(),
            HttpConnError::AuthError(c) => c.to_string_client(),
            HttpConnError::WakeCompute(c) => c.to_string_client(),
@@ -296,6 +378,7 @@ impl CouldRetry for HttpConnError {
            HttpConnError::PostgresConnectionError(e) => e.could_retry(),
            HttpConnError::LocalProxyConnectionError(e) => e.could_retry(),
            HttpConnError::ConnectionClosedAbruptly(_) => false,
+            HttpConnError::JwtPayloadError(_) => false,
            HttpConnError::GetAuthInfo(_) => false,
            HttpConnError::AuthError(_) => false,
            HttpConnError::WakeCompute(_) => false,
@@ -422,8 +505,12 @@ impl ConnectMechanism for HyperMechanism {

        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);

-        // let port = node_info.config.get_ports().first().unwrap_or_else(10432);
-        let res = connect_http2(&host, 10432, timeout).await;
+        let port = *node_info.config.get_ports().first().ok_or_else(|| {
+            HttpConnError::WakeCompute(WakeComputeError::BadComputeAddress(
+                "local-proxy port missing on compute address".into(),
+            ))
+        })?;
+        let res = connect_http2(&host, port, timeout).await;
        drop(pause);
        let (client, connection) = permit.release_result(res)?;

@@ -481,7 +568,7 @@ async fn connect_http2(
        };
    };

-    let (client, connection) = hyper1::client::conn::http2::Builder::new(TokioExecutor::new())
+    let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new())
        .timer(TokioTimer::new())
        .keep_alive_interval(Duration::from_secs(20))
        .keep_alive_while_idle(true)
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -1,5 +1,5 @@
 use dashmap::DashMap;
-use hyper1::client::conn::http2;
+use hyper::client::conn::http2;
 use hyper_util::rt::{TokioExecutor, TokioIo};
 use parking_lot::RwLock;
 use rand::Rng;
@@ -18,9 +18,9 @@ use tracing::{info, info_span, Instrument};

 use super::conn_pool::ConnInfo;

-pub(crate) type Send = http2::SendRequest<hyper1::body::Incoming>;
+pub(crate) type Send = http2::SendRequest<hyper::body::Incoming>;
 pub(crate) type Connect =
-    http2::Connection<TokioIo<TcpStream>, hyper1::body::Incoming, TokioExecutor>;
+    http2::Connection<TokioIo<TcpStream>, hyper::body::Incoming, TokioExecutor>;

 #[derive(Clone)]
 struct ConnPoolEntry {
--- a/proxy/src/serverless/http_util.rs
+++ b/proxy/src/serverless/http_util.rs
@@ -11,7 +11,7 @@ use serde::Serialize;
 use utils::http::error::ApiError;

 /// Like [`ApiError::into_response`]
-pub(crate) fn api_error_into_response(this: ApiError) -> Response<BoxBody<Bytes, hyper1::Error>> {
+pub(crate) fn api_error_into_response(this: ApiError) -> Response<BoxBody<Bytes, hyper::Error>> {
    match this {
        ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status(
            format!("{err:#?}"), // use debug printing so that we give the cause
@@ -41,6 +41,10 @@ pub(crate) fn api_error_into_response(this: ApiError) -> Response<BoxBody<Bytes,
            err.to_string(),
            StatusCode::SERVICE_UNAVAILABLE,
        ),
+        ApiError::TooManyRequests(err) => HttpErrorBody::response_from_msg_and_status(
+            err.to_string(),
+            StatusCode::TOO_MANY_REQUESTS,
+        ),
        ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
            err.to_string(),
            StatusCode::REQUEST_TIMEOUT,
@@ -67,12 +71,12 @@ impl HttpErrorBody {
    fn response_from_msg_and_status(
        msg: String,
        status: StatusCode,
-    ) -> Response<BoxBody<Bytes, hyper1::Error>> {
+    ) -> Response<BoxBody<Bytes, hyper::Error>> {
        HttpErrorBody { msg }.to_response(status)
    }

    /// Same as [`utils::http::error::HttpErrorBody::to_response`]
-    fn to_response(&self, status: StatusCode) -> Response<BoxBody<Bytes, hyper1::Error>> {
+    fn to_response(&self, status: StatusCode) -> Response<BoxBody<Bytes, hyper::Error>> {
        Response::builder()
            .status(status)
            .header(http::header::CONTENT_TYPE, "application/json")
@@ -90,7 +94,7 @@ impl HttpErrorBody {
 pub(crate) fn json_response<T: Serialize>(
    status: StatusCode,
    data: T,
-) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, ApiError> {
+) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> {
    let json = serde_json::to_string(&data)
        .context("Failed to serialize JSON response")
        .map_err(ApiError::InternalServerError)?;
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -0,0 +1,544 @@
+use futures::{future::poll_fn, Future};
+use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding};
+use p256::ecdsa::{Signature, SigningKey};
+use parking_lot::RwLock;
+use rand::rngs::OsRng;
+use serde_json::Value;
+use signature::Signer;
+use std::task::{ready, Poll};
+use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration};
+use tokio::time::Instant;
+use tokio_postgres::tls::NoTlsStream;
+use tokio_postgres::types::ToSql;
+use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
+use tokio_util::sync::CancellationToken;
+use typed_json::json;
+
+use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
+use crate::metrics::Metrics;
+use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
+use crate::{context::RequestMonitoring, DbName, RoleName};
+
+use tracing::{debug, error, warn, Span};
+use tracing::{info, info_span, Instrument};
+
+use super::backend::HttpConnError;
+use super::conn_pool::{ClientInnerExt, ConnInfo};
+
+struct ConnPoolEntry<C: ClientInnerExt> { // one connection stored in a `DbUserConnPool`
+    conn: ClientInner<C>, // the stored connection; handed back out by `get_conn_entry`
+    _last_access: std::time::Instant, // set to `Instant::now()` when `put` stores the entry
+}
+
+// /// key id for the pg_session_jwt state
+// static PG_SESSION_JWT_KID: AtomicU64 = AtomicU64::new(1);
+
+// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
+// Number of stored connections is limited by `max_conns` (filled from `max_conns_per_endpoint` in `LocalConnPool::new`).
+pub(crate) struct EndpointConnPool<C: ClientInnerExt> {
+    pools: HashMap<(DbName, RoleName), DbUserConnPool<C>>, // one sub-pool per (dbname, role) key
+    total_conns: usize, // connections currently stored across all sub-pools
+    max_conns: usize, // cap re-checked under the write lock in `put`
+    global_pool_size_max_conns: usize, // cap checked under a read lock at the top of `put`
+}
+
+impl<C: ClientInnerExt> EndpointConnPool<C> {
+    /// Takes a connection for `db_user` out of its sub-pool, if that sub-pool
+    /// exists and has one to give. `total_conns` is adjusted by the sub-pool.
+    fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry<C>> {
+        let sub_pool = self.pools.get_mut(&db_user)?;
+        sub_pool.get_conn_entry(&mut self.total_conns)
+    }
+
+    /// Drops every stored connection of `db_user` whose id equals `conn_id`.
+    ///
+    /// Returns `true` when at least one entry was removed; in that case the
+    /// opened-connections gauge and `total_conns` are lowered accordingly.
+    fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool {
+        let sub_pool = match self.pools.get_mut(&db_user) {
+            Some(sub_pool) => sub_pool,
+            None => return false,
+        };
+
+        let before = sub_pool.conns.len();
+        sub_pool.conns.retain(|entry| entry.conn.conn_id != conn_id);
+        let dropped = before - sub_pool.conns.len();
+        if dropped == 0 {
+            return false;
+        }
+
+        Metrics::get()
+            .proxy
+            .http_pool_opened_connections
+            .get_metric()
+            .dec_by(dropped as i64);
+        self.total_conns -= dropped;
+        true
+    }
+
+    /// Hands `client` back to the pool.
+    ///
+    /// The connection is thrown away (with an info log) when it is already
+    /// closed or when either size limit has been reached.
+    fn put(pool: &RwLock<Self>, conn_info: &ConnInfo, client: ClientInner<C>) {
+        let conn_id = client.conn_id;
+
+        if client.is_closed() {
+            info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because connection is closed");
+            return;
+        }
+
+        // cheap pre-check under read locks before taking the write lock
+        let global_cap = pool.read().global_pool_size_max_conns;
+        let current_total = pool.read().total_conns;
+        if current_total >= global_cap {
+            info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because pool is full");
+            return;
+        }
+
+        // `stored` carries the sub-pool size when the connection was accepted
+        let (stored, total_conns) = {
+            let mut guard = pool.write();
+
+            let stored = if guard.total_conns < guard.max_conns {
+                let sub_pool = guard.pools.entry(conn_info.db_and_user()).or_default();
+                sub_pool.conns.push(ConnPoolEntry {
+                    conn: client,
+                    _last_access: std::time::Instant::now(),
+                });
+                let sub_pool_len = sub_pool.conns.len();
+
+                guard.total_conns += 1;
+                Metrics::get()
+                    .proxy
+                    .http_pool_opened_connections
+                    .get_metric()
+                    .inc();
+
+                Some(sub_pool_len)
+            } else {
+                None
+            };
+
+            (stored, guard.total_conns)
+        };
+
+        // the write lock is released by now, so logging does not hold it
+        match stored {
+            Some(per_db_size) => {
+                info!(%conn_id, "local_pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
+            }
+            None => {
+                info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
+            }
+        }
+    }
+}
+
+impl<C: ClientInnerExt> Drop for EndpointConnPool<C> {
+    /// Subtracts the connections still stored in this pool from the
+    /// opened-connections gauge.
+    fn drop(&mut self) {
+        let remaining = self.total_conns;
+        if remaining == 0 {
+            return;
+        }
+        Metrics::get()
+            .proxy
+            .http_pool_opened_connections
+            .get_metric()
+            .dec_by(remaining as i64);
+    }
+}
+
+pub(crate) struct DbUserConnPool<C: ClientInnerExt> { // connections stored for a single (dbname, role) key
+    conns: Vec<ConnPoolEntry<C>>, // `put` pushes onto the end, `get_conn_entry` pops from the end
+}
+
+impl<C: ClientInnerExt> Default for DbUserConnPool<C> {
+    /// An empty sub-pool with no stored connections.
+    fn default() -> Self {
+        let conns = Vec::new();
+        Self { conns }
+    }
+}
+
+impl<C: ClientInnerExt> DbUserConnPool<C> {
+    /// Discards every stored connection that reports itself closed.
+    ///
+    /// `conns` is the caller's running total; it is lowered by the number of
+    /// discarded entries, which is also the return value.
+    fn clear_closed_clients(&mut self, conns: &mut usize) -> usize {
+        let before = self.conns.len();
+        self.conns.retain(|entry| !entry.conn.is_closed());
+        let discarded = before - self.conns.len();
+
+        *conns -= discarded;
+        discarded
+    }
+
+    /// Pops the most recently stored connection after purging closed ones.
+    ///
+    /// Both the purged entries and the popped one (if any) are subtracted from
+    /// `conns` and from the opened-connections gauge.
+    fn get_conn_entry(&mut self, conns: &mut usize) -> Option<ConnPoolEntry<C>> {
+        let purged = self.clear_closed_clients(conns);
+
+        let conn = self.conns.pop();
+        let taken = usize::from(conn.is_some());
+        *conns -= taken;
+
+        Metrics::get()
+            .proxy
+            .http_pool_opened_connections
+            .get_metric()
+            .dec_by((purged + taken) as i64);
+        conn
+    }
+}
+
+pub(crate) struct LocalConnPool<C: ClientInnerExt> { // connection pool wrapper built by `new`
+    global_pool: RwLock<EndpointConnPool<C>>, // the single pool instance behind a `parking_lot` RwLock
+
+    config: &'static crate::config::HttpConfig, // `pool_options` supplies the size limits and idle timeout
+}
+
+impl<C: ClientInnerExt> LocalConnPool<C> {
+    /// Builds an empty pool whose limits come from `config.pool_options`.
+    pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc<Self> {
+        let limits = &config.pool_options;
+        let global_pool = RwLock::new(EndpointConnPool {
+            pools: HashMap::new(),
+            total_conns: 0,
+            max_conns: limits.max_conns_per_endpoint,
+            global_pool_size_max_conns: limits.max_total_conns,
+        });
+
+        Arc::new(Self {
+            global_pool,
+            config,
+        })
+    }
+
+    /// Idle timeout configured in `pool_options`.
+    pub(crate) fn get_idle_timeout(&self) -> Duration {
+        let options = &self.config.pool_options;
+        options.idle_timeout
+    }
+
+    // pub(crate) fn shutdown(&self) {
+    //     let mut pool = self.global_pool.write();
+    //     pool.pools.clear();
+    //     pool.total_conns = 0;
+    // }
+
+    /// Looks for a reusable connection matching `conn_info`.
+    ///
+    /// Returns `Ok(None)` when nothing is stored for this (db, user) or the
+    /// stored connection turned out to be closed; the caller is expected to
+    /// open a new one. On a hit the current span gets `conn_id` and `pid`
+    /// recorded, the request's session id is sent to the connection, and the
+    /// request is marked as a pool hit.
+    ///
+    /// An error from `client.session.send` is propagated to the caller.
+    pub(crate) fn get(
+        self: &Arc<Self>,
+        ctx: &RequestMonitoring,
+        conn_info: &ConnInfo,
+    ) -> Result<Option<LocalClient<C>>, HttpConnError> {
+        let pooled = self
+            .global_pool
+            .write()
+            .get_conn_entry(conn_info.db_and_user())
+            .map(|entry| entry.conn);
+
+        // ok return cached connection if found and establish a new one otherwise
+        let client = match pooled {
+            Some(client) => client,
+            None => return Ok(None),
+        };
+
+        if client.is_closed() {
+            info!("local_pool: cached connection '{conn_info}' is closed, opening a new one");
+            return Ok(None);
+        }
+
+        let span = tracing::Span::current();
+        span.record("conn_id", tracing::field::display(client.conn_id));
+        span.record(
+            "pid",
+            tracing::field::display(client.inner.get_process_id()),
+        );
+        info!(
+            cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
+            "local_pool: reusing connection '{conn_info}'"
+        );
+
+        client.session.send(ctx.session_id())?;
+        ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
+        ctx.success();
+
+        let reused = LocalClient::new(client, conn_info.clone(), Arc::downgrade(self));
+        Ok(Some(reused))
+    }
+}
+
+/// Spawns the background task that drives `connection` and returns a
+/// [`LocalClient`] wrapping `client`.
+///
+/// The spawned task runs in its own root `connection` span and finishes when
+/// any of the following happens:
+/// - the client's cancellation token fires (it is cancelled when the
+///   `ClientInner` is dropped),
+/// - the session watch channel's sender is gone,
+/// - the connection reports an error or yields `None`.
+///
+/// When the idle timer expires the task asks the pool to remove this client;
+/// when the connection itself ends, it removes the client from the pool as
+/// well. The returned client holds only a weak reference to `global_pool`.
+pub(crate) fn poll_client(
+    global_pool: Arc<LocalConnPool<tokio_postgres::Client>>,
+    ctx: &RequestMonitoring,
+    conn_info: ConnInfo,
+    client: tokio_postgres::Client,
+    mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
+    conn_id: uuid::Uuid,
+    aux: MetricsAuxInfo,
+) -> LocalClient<tokio_postgres::Client> {
+    let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol());
+    let mut session_id = ctx.session_id();
+    // `tx` ends up in `ClientInner::session`; the task below observes updates via `rx`.
+    let (tx, mut rx) = tokio::sync::watch::channel(session_id);
+
+    let span = info_span!(parent: None, "connection", %conn_id);
+    let cold_start_info = ctx.cold_start_info();
+    span.in_scope(|| {
+        info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection");
+    });
+    // One weak handle moves into the task, the other goes to the returned client.
+    let pool = Arc::downgrade(&global_pool);
+    let pool_clone = pool.clone();
+
+    let db_user = conn_info.db_and_user();
+    let idle = global_pool.get_idle_timeout();
+    let cancel = CancellationToken::new();
+    let cancelled = cancel.clone().cancelled_owned();
+
+    tokio::spawn(
+    async move {
+        // Held for the lifetime of the task.
+        let _conn_gauge = conn_gauge;
+        let mut idle_timeout = pin!(tokio::time::sleep(idle));
+        let mut cancelled = pin!(cancelled);
+
+        poll_fn(move |cx| {
+            // 1. Stop as soon as the owning client has been dropped.
+            if cancelled.as_mut().poll(cx).is_ready() {
+                info!("connection dropped");
+                return Poll::Ready(())
+            }
+
+            // 2. Pick up a new session id, which also restarts the idle timer.
+            match rx.has_changed() {
+                Ok(true) => {
+                    session_id = *rx.borrow_and_update();
+                    info!(%session_id, "changed session");
+                    idle_timeout.as_mut().reset(Instant::now() + idle);
+                }
+                Err(_) => {
+                    info!("connection dropped");
+                    return Poll::Ready(())
+                }
+                _ => {}
+            }
+
+            // idle connection timeout; the duration is `pool_options.idle_timeout`
+            // as returned by `get_idle_timeout()`
+            if idle_timeout.as_mut().poll(cx).is_ready() {
+                idle_timeout.as_mut().reset(Instant::now() + idle);
+                info!("connection idle");
+                if let Some(pool) = pool.clone().upgrade() {
+                    // remove client from pool - should close the connection if it's idle.
+                    // does nothing if the client is currently checked-out and in-use
+                    if pool.global_pool.write().remove_client(db_user.clone(), conn_id) {
+                        info!("idle connection removed");
+                    }
+                }
+            }
+
+            // 3. Drain messages; `ready!` returns `Poll::Pending` from the
+            //    closure when the connection has nothing ready yet.
+            loop {
+                let message = ready!(connection.poll_message(cx));
+
+                match message {
+                    Some(Ok(AsyncMessage::Notice(notice))) => {
+                        info!(%session_id, "notice: {}", notice);
+                    }
+                    Some(Ok(AsyncMessage::Notification(notif))) => {
+                        warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received");
+                    }
+                    Some(Ok(_)) => {
+                        warn!(%session_id, "unknown message");
+                    }
+                    Some(Err(e)) => {
+                        error!(%session_id, "connection error: {}", e);
+                        break
+                    }
+                    None => {
+                        info!("connection closed");
+                        break
+                    }
+                }
+            }
+
+            // remove from connection pool
+            if let Some(pool) = pool.clone().upgrade() {
+                if pool.global_pool.write().remove_client(db_user.clone(), conn_id) {
+                    info!("closed connection removed");
+                }
+            }
+
+            Poll::Ready(())
+        }).await;
+
+    }
+    .instrument(span));
+
+    // Fresh signing key for this connection; used by `set_jwt_session`.
+    let key = SigningKey::random(&mut OsRng);
+
+    let inner = ClientInner {
+        inner: client,
+        session: tx,
+        cancel,
+        aux,
+        conn_id,
+        key,
+        jti: 0,
+    };
+    LocalClient::new(inner, conn_info, pool_clone)
+}
+
+/// State kept for one pooled connection: the client handle plus everything
+/// needed to talk to its background connection task.
+struct ClientInner<C: ClientInnerExt> {
+    // The underlying client used to run queries.
+    inner: C,
+    // Publishes the current session id to the connection task.
+    session: tokio::sync::watch::Sender<uuid::Uuid>,
+    // Cancelled when this struct is dropped.
+    cancel: CancellationToken,
+    // Endpoint/branch ids used when registering usage metrics.
+    aux: MetricsAuxInfo,
+    // Identifier of this connection within the pool.
+    conn_id: uuid::Uuid,
+
+    // needed for pg_session_jwt state
+    // `key` signs the JWTs; `jti` is bumped on every `set_jwt_session` call.
+    key: SigningKey,
+    jti: u64,
+}
+
+/// Dropping the client state cancels its token, which the connection task
+/// polls in order to stop.
+impl<C: ClientInnerExt> Drop for ClientInner<C> {
+    fn drop(&mut self) {
+        // on client drop, tell the conn to shut down
+        self.cancel.cancel();
+    }
+}
+
+impl<C: ClientInnerExt> ClientInner<C> {
+    /// Reports whether the underlying client is closed; delegates to
+    /// `ClientInnerExt::is_closed` on the wrapped client.
+    pub(crate) fn is_closed(&self) -> bool {
+        self.inner.is_closed()
+    }
+}
+
+impl<C: ClientInnerExt> LocalClient<C> {
+    /// Registers (or looks up) the usage-metrics counter for this client's
+    /// endpoint and branch and returns it.
+    ///
+    /// # Panics
+    /// Panics if the inner client has already been taken out, which only
+    /// happens while the client is being dropped.
+    pub(crate) fn metrics(&self) -> Arc<MetricCounter> {
+        // Same invariant message as the other accessors on `LocalClient`, so a
+        // violation is diagnosable instead of a bare `unwrap` panic.
+        let aux = &self
+            .inner
+            .as_ref()
+            .expect("client inner should not be removed")
+            .aux;
+        USAGE_METRICS.register(Ids {
+            endpoint_id: aux.endpoint_id,
+            branch_id: aux.branch_id,
+        })
+    }
+}
+
+/// A checked-out connection. On drop it is handed back to its pool if the
+/// pool is still alive and the client has not been detached from it.
+pub(crate) struct LocalClient<C: ClientInnerExt> {
+    // Span captured when the client was created; entered while returning
+    // the connection to the pool.
+    span: Span,
+    // `None` only after the connection state has been taken out on drop.
+    inner: Option<ClientInner<C>>,
+    // Identifies which pool entry this connection belongs to.
+    conn_info: ConnInfo,
+    // Weak link to the owning pool; replaced with an empty `Weak` to discard
+    // the connection instead of returning it.
+    pool: Weak<LocalConnPool<C>>,
+}
+
+/// Borrowed handle that lets a caller detach a [`LocalClient`] from its pool
+/// while still holding a mutable borrow of the underlying client.
+pub(crate) struct Discard<'a, C: ClientInnerExt> {
+    // Used only for log messages.
+    conn_info: &'a ConnInfo,
+    // The client's pool link; resetting it prevents the return-to-pool on drop.
+    pool: &'a mut Weak<LocalConnPool<C>>,
+}
+
+impl<C: ClientInnerExt> LocalClient<C> {
+    /// Wraps connection state into a client bound to `pool`, remembering the
+    /// span that is current at construction time.
+    pub(self) fn new(
+        inner: ClientInner<C>,
+        conn_info: ConnInfo,
+        pool: Weak<LocalConnPool<C>>,
+    ) -> Self {
+        let span = Span::current();
+        Self {
+            span,
+            inner: Some(inner),
+            conn_info,
+            pool,
+        }
+    }
+
+    /// Borrows the underlying client together with a [`Discard`] handle that
+    /// can detach this connection from its pool.
+    ///
+    /// # Panics
+    /// Panics if the inner client has already been taken out.
+    pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) {
+        // Disjoint field borrows: the client state, the conn info and the pool link.
+        let state = self
+            .inner
+            .as_mut()
+            .expect("client inner should not be removed");
+        let discard = Discard {
+            conn_info: &self.conn_info,
+            pool: &mut self.pool,
+        };
+        (&mut state.inner, discard)
+    }
+
+    /// Signing key associated with this connection.
+    ///
+    /// # Panics
+    /// Panics if the inner client has already been taken out.
+    pub(crate) fn key(&self) -> &SigningKey {
+        let state = self
+            .inner
+            .as_ref()
+            .expect("client inner should not be removed");
+        &state.key
+    }
+}
+
+impl LocalClient<tokio_postgres::Client> {
+    /// Starts a new JWT-backed session on this connection.
+    ///
+    /// `payload` must be a JSON object. A fresh `jti` is inserted into it, the
+    /// result is signed with this connection's key, and the token is passed to
+    /// `auth.jwt_session_init` after a `discard all`.
+    ///
+    /// # Errors
+    /// Returns `HttpConnError::JwtPayloadError` when `payload` is not a JSON
+    /// object, or the converted error of either database call.
+    ///
+    /// # Panics
+    /// Panics if the inner client has already been taken out.
+    pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> {
+        let inner = self
+            .inner
+            .as_mut()
+            .expect("client inner should not be removed");
+        // Incremented before anything fallible, so a failed call still consumes a `jti`.
+        inner.jti += 1;
+
+        // The backend process id is used as the key id in the JWT header.
+        let kid = inner.inner.get_process_id();
+        let header = json!({"kid":kid}).to_string();
+
+        let mut payload = serde_json::from_slice::<serde_json::Map<String, Value>>(payload)
+            .map_err(HttpConnError::JwtPayloadError)?;
+        payload.insert("jti".to_string(), Value::Number(inner.jti.into()));
+        let payload = Value::Object(payload).to_string();
+
+        debug!(
+            kid,
+            jti = inner.jti,
+            ?header,
+            ?payload,
+            "signing new ephemeral JWT"
+        );
+
+        let token = sign_jwt(&inner.key, header, payload);
+
+        // initiates the auth session
+        inner.inner.simple_query("discard all").await?;
+        inner
+            .inner
+            .query(
+                "select auth.jwt_session_init($1)",
+                &[&token as &(dyn ToSql + Sync)],
+            )
+            .await?;
+
+        info!(kid, jti = inner.jti, "user session state init");
+
+        Ok(())
+    }
+}
+
+/// Builds a compact JWT string: `base64url(header).base64url(payload).base64url(signature)`,
+/// all unpadded, where the signature is computed by `sk` over the first two
+/// dot-joined segments.
+fn sign_jwt(sk: &SigningKey, header: String, payload: String) -> String {
+    let encoded_header = Base64UrlUnpadded::encode_string(header.as_bytes());
+    let encoded_payload = Base64UrlUnpadded::encode_string(payload.as_bytes());
+    let signing_input = [encoded_header, encoded_payload].join(".");
+
+    let signature: Signature = sk.sign(signing_input.as_bytes());
+    let encoded_signature = Base64UrlUnpadded::encode_string(&signature.to_bytes());
+
+    format!("{signing_input}.{encoded_signature}")
+}
+
+impl<C: ClientInnerExt> Discard<'_, C> {
+    /// Detaches the connection from its pool unless `status` is
+    /// `ReadyForQueryStatus::Idle`. Logs only if the pool was still alive.
+    pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) {
+        let conn_info = &self.conn_info;
+        if status == ReadyForQueryStatus::Idle {
+            return;
+        }
+        // Leaving an empty `Weak` behind is what prevents the return-to-pool on drop.
+        let detached = std::mem::take(self.pool);
+        if detached.strong_count() > 0 {
+            info!(
+                "local_pool: throwing away connection '{conn_info}' because connection is not idle"
+            );
+        }
+    }
+
+    /// Unconditionally detaches the connection from its pool. Logs only if
+    /// the pool was still alive.
+    pub(crate) fn discard(&mut self) {
+        let conn_info = &self.conn_info;
+        let detached = std::mem::take(self.pool);
+        if detached.strong_count() > 0 {
+            info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state");
+        }
+    }
+}
+
+impl<C: ClientInnerExt> LocalClient<C> {
+    /// Shared reference to the underlying client.
+    ///
+    /// # Panics
+    /// Panics if the inner client has already been taken out.
+    pub fn get_client(&self) -> &C {
+        let state = self
+            .inner
+            .as_ref()
+            .expect("client inner should not be removed");
+        &state.inner
+    }
+
+    /// Takes the connection state out of `self` and, if the pool is still
+    /// reachable, returns a closure that puts the connection back into it.
+    /// Returns `None` (dropping the connection state) when the client was
+    /// detached or the pool is gone.
+    fn do_drop(&mut self) -> Option<impl FnOnce()> {
+        let conn_info = self.conn_info.clone();
+        let client = self
+            .inner
+            .take()
+            .expect("client inner should not be removed");
+
+        // An unreachable pool ends here; `client` is dropped on return.
+        let conn_pool = std::mem::take(&mut self.pool).upgrade()?;
+        let current_span = self.span.clone();
+
+        // return connection to the pool
+        Some(move || {
+            let _span = current_span.enter();
+            EndpointConnPool::put(&conn_pool.global_pool, &conn_info, client);
+        })
+    }
+}
+
+/// Returning the connection to the pool takes the pool's write lock, so the
+/// work is handed to `spawn_blocking` rather than done inline in `drop`.
+impl<C: ClientInnerExt> Drop for LocalClient<C> {
+    fn drop(&mut self) {
+        // NOTE(review): `spawn_blocking` presumably needs an active Tokio
+        // runtime here - confirm clients are never dropped outside of one.
+        if let Some(drop) = self.do_drop() {
+            tokio::task::spawn_blocking(drop);
+        }
+    }
+}
--- a/proxy/src/serverless/mod.rs
+++ b/proxy/src/serverless/mod.rs
@@ -8,6 +8,7 @@ mod conn_pool;
 mod http_conn_pool;
 mod http_util;
 mod json;
+mod local_conn_pool;
 mod sql_over_http;
 mod websocket;

@@ -22,7 +23,7 @@ use futures::TryFutureExt;
 use http::{Method, Response, StatusCode};
 use http_body_util::combinators::BoxBody;
 use http_body_util::{BodyExt, Empty};
-use hyper1::body::Incoming;
+use hyper::body::Incoming;
 use hyper_util::rt::TokioExecutor;
 use hyper_util::server::conn::auto::Builder;
 use rand::rngs::StdRng;
@@ -47,13 +48,14 @@ use std::pin::{pin, Pin};
 use std::sync::Arc;
 use tokio::net::{TcpListener, TcpStream};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, warn, Instrument};
+use tracing::{info, warn, Instrument};
 use utils::http::error::ApiError;

 pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api";

 pub async fn task_main(
    config: &'static ProxyConfig,
+    auth_backend: &'static crate::auth::Backend<'static, ()>,
    ws_listener: TcpListener,
    cancellation_token: CancellationToken,
    cancellation_handler: Arc<CancellationHandlerMain>,
@@ -63,6 +65,7 @@ pub async fn task_main(
        info!("websocket server has shut down");
    }

+    let local_pool = local_conn_pool::LocalConnPool::new(&config.http_config);
    let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config);
    {
        let conn_pool = Arc::clone(&conn_pool);
@@ -105,8 +108,10 @@ pub async fn task_main(

    let backend = Arc::new(PoolingBackend {
        http_conn_pool: Arc::clone(&http_conn_pool),
+        local_pool,
        pool: Arc::clone(&conn_pool),
        config,
+        auth_backend,
        endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter),
    });
    let tls_acceptor: Arc<dyn MaybeTlsAcceptor> = match config.tls_config.as_ref() {
@@ -238,7 +243,7 @@ async fn connection_startup(
    let (conn, peer) = match read_proxy_protocol(conn).await {
        Ok(c) => c,
        Err(e) => {
-            tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
+            tracing::warn!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
            return None;
        }
    };
@@ -302,7 +307,7 @@ async fn connection_handler(
    let server = Builder::new(TokioExecutor::new());
    let conn = server.serve_connection_with_upgrades(
        hyper_util::rt::TokioIo::new(conn),
-        hyper1::service::service_fn(move |req: hyper1::Request<Incoming>| {
+        hyper::service::service_fn(move |req: hyper::Request<Incoming>| {
            // First HTTP request shares the same session ID
            let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4);

@@ -355,7 +360,7 @@ async fn connection_handler(

 #[allow(clippy::too_many_arguments)]
 async fn request_handler(
-    mut request: hyper1::Request<Incoming>,
+    mut request: hyper::Request<Incoming>,
    config: &'static ProxyConfig,
    backend: Arc<PoolingBackend>,
    ws_connections: TaskTracker,
@@ -365,7 +370,7 @@ async fn request_handler(
    // used to cancel in-flight HTTP requests. not used to cancel websockets
    http_cancellation_token: CancellationToken,
    endpoint_rate_limiter: Arc<EndpointRateLimiter>,
-) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, ApiError> {
+) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> {
    let host = request
        .headers()
        .get("host")
@@ -394,6 +399,7 @@ async fn request_handler(
            async move {
                if let Err(e) = websocket::serve_websocket(
                    config,
+                    backend.auth_backend,
                    ctx,
                    websocket,
                    cancellation_handler,
@@ -402,7 +408,7 @@ async fn request_handler(
                )
                .await
                {
-                    error!("error in websocket connection: {e:#}");
+                    warn!("error in websocket connection: {e:#}");
                }
            }
            .instrument(span),
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -12,14 +12,14 @@ use http::Method;
 use http_body_util::combinators::BoxBody;
 use http_body_util::BodyExt;
 use http_body_util::Full;
-use hyper1::body::Body;
-use hyper1::body::Incoming;
-use hyper1::header;
-use hyper1::http::HeaderName;
-use hyper1::http::HeaderValue;
-use hyper1::Response;
-use hyper1::StatusCode;
-use hyper1::{HeaderMap, Request};
+use hyper::body::Body;
+use hyper::body::Incoming;
+use hyper::header;
+use hyper::http::HeaderName;
+use hyper::http::HeaderValue;
+use hyper::Response;
+use hyper::StatusCode;
+use hyper::{HeaderMap, Request};
 use pq_proto::StartupMessageParamsBuilder;
 use serde::Serialize;
 use serde_json::Value;
@@ -40,11 +40,12 @@ use url::Url;
 use urlencoding;
 use utils::http::error::ApiError;

-use crate::auth::backend::ComputeCredentials;
+use crate::auth::backend::ComputeCredentialKeys;
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::endpoint_sni;
 use crate::auth::ComputeUserInfoParseError;
 use crate::config::AuthenticationConfig;
+use crate::config::HttpConfig;
 use crate::config::ProxyConfig;
 use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
@@ -56,20 +57,22 @@ use crate::metrics::Metrics;
 use crate::proxy::run_until_cancelled;
 use crate::proxy::NeonOptions;
 use crate::serverless::backend::HttpConnError;
+use crate::usage_metrics::MetricCounter;
 use crate::usage_metrics::MetricCounterRecorder;
 use crate::DbName;
 use crate::RoleName;

 use super::backend::LocalProxyConnError;
 use super::backend::PoolingBackend;
+use super::conn_pool;
 use super::conn_pool::AuthData;
-use super::conn_pool::Client;
 use super::conn_pool::ConnInfo;
 use super::conn_pool::ConnInfoWithAuth;
 use super::http_util::json_response;
 use super::json::json_to_pg_text;
 use super::json::pg_text_row_to_json;
 use super::json::JsonConversionError;
+use super::local_conn_pool;

 #[derive(serde::Deserialize)]
 #[serde(rename_all = "camelCase")]
@@ -272,7 +275,7 @@ pub(crate) async fn handle(
    request: Request<Incoming>,
    backend: Arc<PoolingBackend>,
    cancel: CancellationToken,
-) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, ApiError> {
+) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> {
    let result = handle_inner(cancel, config, &ctx, request, backend).await;

    let mut response = match result {
@@ -435,7 +438,7 @@ impl UserFacingError for SqlOverHttpError {
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum ReadPayloadError {
    #[error("could not read the HTTP request body: {0}")]
-    Read(#[from] hyper1::Error),
+    Read(#[from] hyper::Error),
    #[error("could not parse the HTTP request body: {0}")]
    Parse(#[from] serde_json::Error),
 }
@@ -476,7 +479,7 @@ struct HttpHeaders {
 }

 impl HttpHeaders {
-    fn try_parse(headers: &hyper1::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
+    fn try_parse(headers: &hyper::http::HeaderMap) -> Result<Self, SqlOverHttpError> {
        // Determine the output options. Default behaviour is 'false'. Anything that is not
        // strictly 'true' assumed to be false.
        let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
@@ -529,7 +532,7 @@ async fn handle_inner(
    ctx: &RequestMonitoring,
    request: Request<Incoming>,
    backend: Arc<PoolingBackend>,
-) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, SqlOverHttpError> {
+) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
    let _requeset_gauge = Metrics::get()
        .proxy
        .connection_requests
@@ -552,7 +555,7 @@ async fn handle_inner(

    match conn_info.auth {
        AuthData::Jwt(jwt) if config.authentication_config.is_auth_broker => {
-            handle_auth_broker_inner(config, ctx, request, conn_info.conn_info, jwt, backend).await
+            handle_auth_broker_inner(ctx, request, conn_info.conn_info, jwt, backend).await
        }
        auth => {
            handle_db_inner(
@@ -577,7 +580,7 @@ async fn handle_db_inner(
    conn_info: ConnInfo,
    auth: AuthData,
    backend: Arc<PoolingBackend>,
-) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, SqlOverHttpError> {
+) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
    //
    // Determine the destination and connection params
    //
@@ -620,37 +623,35 @@ async fn handle_db_inner(

    let authenticate_and_connect = Box::pin(
        async {
+            let is_local_proxy = matches!(backend.auth_backend, crate::auth::Backend::Local(_));
+
            let keys = match auth {
                AuthData::Password(pw) => {
                    backend
-                        .authenticate_with_password(
-                            ctx,
-                            &config.authentication_config,
-                            &conn_info.user_info,
-                            &pw,
-                        )
+                        .authenticate_with_password(ctx, &conn_info.user_info, &pw)
                        .await?
                }
                AuthData::Jwt(jwt) => {
                    backend
-                        .authenticate_with_jwt(
-                            ctx,
-                            &config.authentication_config,
-                            &conn_info.user_info,
-                            jwt,
-                        )
-                        .await?;
-
-                    ComputeCredentials {
-                        info: conn_info.user_info.clone(),
-                        keys: crate::auth::backend::ComputeCredentialKeys::None,
-                    }
+                        .authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
+                        .await?
+                }
+            };
+
+            let client = match keys.keys {
+                ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => {
+                    let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?;
+                    client.set_jwt_session(&payload).await?;
+                    Client::Local(client)
+                }
+                _ => {
+                    let client = backend
+                        .connect_to_compute(ctx, conn_info, keys, !allow_pool)
+                        .await?;
+                    Client::Remote(client)
                }
            };

-            let client = backend
-                .connect_to_compute(ctx, conn_info, keys, !allow_pool)
-                .await?;
            // not strictly necessary to mark success here,
            // but it's just insurance for if we forget it somewhere else
            ctx.success();
@@ -680,7 +681,7 @@ async fn handle_db_inner(
    // Now execute the query and return the result.
    let json_output = match payload {
        Payload::Single(stmt) => {
-            stmt.process(config, cancel, &mut client, parsed_headers)
+            stmt.process(&config.http_config, cancel, &mut client, parsed_headers)
                .await?
        }
        Payload::Batch(statements) => {
@@ -698,7 +699,7 @@ async fn handle_db_inner(
            }

            statements
-                .process(config, cancel, &mut client, parsed_headers)
+                .process(&config.http_config, cancel, &mut client, parsed_headers)
                .await?
        }
    };
@@ -738,20 +739,14 @@ static HEADERS_TO_FORWARD: &[&HeaderName] = &[
 ];

 async fn handle_auth_broker_inner(
-    config: &'static ProxyConfig,
    ctx: &RequestMonitoring,
    request: Request<Incoming>,
    conn_info: ConnInfo,
    jwt: String,
    backend: Arc<PoolingBackend>,
-) -> Result<Response<BoxBody<Bytes, hyper1::Error>>, SqlOverHttpError> {
+) -> Result<Response<BoxBody<Bytes, hyper::Error>>, SqlOverHttpError> {
    backend
-        .authenticate_with_jwt(
-            ctx,
-            &config.authentication_config,
-            &conn_info.user_info,
-            jwt,
-        )
+        .authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
        .await
        .map_err(HttpConnError::from)?;

@@ -789,9 +784,9 @@ async fn handle_auth_broker_inner(
 impl QueryData {
    async fn process(
        self,
-        config: &'static ProxyConfig,
+        config: &'static HttpConfig,
        cancel: CancellationToken,
-        client: &mut Client<tokio_postgres::Client>,
+        client: &mut Client,
        parsed_headers: HttpHeaders,
    ) -> Result<String, SqlOverHttpError> {
        let (inner, mut discard) = client.inner();
@@ -820,7 +815,7 @@ impl QueryData {
            Either::Right((_cancelled, query)) => {
                tracing::info!("cancelling query");
                if let Err(err) = cancel_token.cancel_query(NoTls).await {
-                    tracing::error!(?err, "could not cancel query");
+                    tracing::warn!(?err, "could not cancel query");
                }
                // wait for the query cancellation
                match time::timeout(time::Duration::from_millis(100), query).await {
@@ -863,9 +858,9 @@ impl QueryData {
 impl BatchQueryData {
    async fn process(
        self,
-        config: &'static ProxyConfig,
+        config: &'static HttpConfig,
        cancel: CancellationToken,
-        client: &mut Client<tokio_postgres::Client>,
+        client: &mut Client,
        parsed_headers: HttpHeaders,
    ) -> Result<String, SqlOverHttpError> {
        info!("starting transaction");
@@ -909,7 +904,7 @@ impl BatchQueryData {
            }
            Err(SqlOverHttpError::Cancelled(_)) => {
                if let Err(err) = cancel_token.cancel_query(NoTls).await {
-                    tracing::error!(?err, "could not cancel query");
+                    tracing::warn!(?err, "could not cancel query");
                }
                // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe.
                discard.discard();
@@ -933,7 +928,7 @@ impl BatchQueryData {
 }

 async fn query_batch(
-    config: &'static ProxyConfig,
+    config: &'static HttpConfig,
    cancel: CancellationToken,
    transaction: &Transaction<'_>,
    queries: BatchQueryData,
@@ -972,7 +967,7 @@ async fn query_batch(
 }

 async fn query_to_json<T: GenericClient>(
-    config: &'static ProxyConfig,
+    config: &'static HttpConfig,
    client: &T,
    data: QueryData,
    current_size: &mut usize,
@@ -993,9 +988,9 @@ async fn query_to_json<T: GenericClient>(
        rows.push(row);
        // we don't have a streaming response support yet so this is to prevent OOM
        // from a malicious query (eg a cross join)
-        if *current_size > config.http_config.max_response_size_bytes {
+        if *current_size > config.max_response_size_bytes {
            return Err(SqlOverHttpError::ResponseTooLarge(
-                config.http_config.max_response_size_bytes,
+                config.max_response_size_bytes,
            ));
        }
    }
@@ -1058,3 +1053,50 @@ async fn query_to_json<T: GenericClient>(

    Ok((ready, results))
 }
+
+/// A database client obtained from either of the two pools, so that query
+/// processing can treat both uniformly.
+enum Client {
+    // Client checked out of `conn_pool`.
+    Remote(conn_pool::Client<tokio_postgres::Client>),
+    // Client checked out of `local_conn_pool`.
+    Local(local_conn_pool::LocalClient<tokio_postgres::Client>),
+}
+
+/// The discard handle matching a [`Client`] variant; forwards to the
+/// corresponding pool's own `Discard` type.
+enum Discard<'a> {
+    // Handle for a `Client::Remote`.
+    Remote(conn_pool::Discard<'a, tokio_postgres::Client>),
+    // Handle for a `Client::Local`.
+    Local(local_conn_pool::Discard<'a, tokio_postgres::Client>),
+}
+
+impl Client {
+    /// Usage-metrics counter of whichever client is wrapped.
+    fn metrics(&self) -> Arc<MetricCounter> {
+        match self {
+            Self::Remote(remote) => remote.metrics(),
+            Self::Local(local) => local.metrics(),
+        }
+    }
+
+    /// Borrows the underlying `tokio_postgres::Client` together with the
+    /// discard handle of the matching variant.
+    fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) {
+        match self {
+            Self::Remote(remote) => {
+                let (pg_client, handle) = remote.inner();
+                (pg_client, Discard::Remote(handle))
+            }
+            Self::Local(local) => {
+                let (pg_client, handle) = local.inner();
+                (pg_client, Discard::Local(handle))
+            }
+        }
+    }
+}
+
+impl Discard<'_> {
+    /// Forwards `check_idle` to the wrapped pool-specific handle.
+    fn check_idle(&mut self, status: ReadyForQueryStatus) {
+        match self {
+            Self::Remote(handle) => handle.check_idle(status),
+            Self::Local(handle) => handle.check_idle(status),
+        }
+    }
+
+    /// Forwards `discard` to the wrapped pool-specific handle.
+    fn discard(&mut self) {
+        match self {
+            Self::Remote(handle) => handle.discard(),
+            Self::Local(handle) => handle.discard(),
+        }
+    }
+}
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -12,7 +12,7 @@ use anyhow::Context as _;
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use framed_websockets::{Frame, OpCode, WebSocketServer};
 use futures::{Sink, Stream};
-use hyper1::upgrade::OnUpgrade;
+use hyper::upgrade::OnUpgrade;
 use hyper_util::rt::TokioIo;
 use pin_project_lite::pin_project;

@@ -129,6 +129,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {

 pub(crate) async fn serve_websocket(
    config: &'static ProxyConfig,
+    auth_backend: &'static crate::auth::Backend<'static, ()>,
    ctx: RequestMonitoring,
    websocket: OnUpgrade,
    cancellation_handler: Arc<CancellationHandlerMain>,
@@ -145,6 +146,7 @@ pub(crate) async fn serve_websocket(

    let res = Box::pin(handle_client(
        config,
+        auth_backend,
        &ctx,
        cancellation_handler,
        WebSocketRw::new(websocket),
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -27,7 +27,7 @@ use std::{
 };
 use tokio::io::AsyncWriteExt;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, instrument, trace};
+use tracing::{error, info, instrument, trace, warn};
 use utils::backoff;
 use uuid::{NoContext, Timestamp};

@@ -346,7 +346,7 @@ async fn collect_metrics_iteration(
            error!("metrics endpoint refused the sent metrics: {:?}", res);
            for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) {
                // Report if the metric value is suspiciously large
-                error!("potentially abnormal metric value: {:?}", metric);
+                warn!("potentially abnormal metric value: {:?}", metric);
            }
        }
    }
@@ -485,49 +485,51 @@ async fn upload_events_chunk(

 #[cfg(test)]
 mod tests {
-    use std::{
-        net::TcpListener,
-        sync::{Arc, Mutex},
-    };
+    use super::*;

+    use crate::{http, BranchId, EndpointId};
    use anyhow::Error;
    use chrono::Utc;
    use consumption_metrics::{Event, EventChunk};
-    use hyper::{
-        service::{make_service_fn, service_fn},
-        Body, Response,
-    };
+    use http_body_util::BodyExt;
+    use hyper::{body::Incoming, server::conn::http1, service::service_fn, Request, Response};
+    use hyper_util::rt::TokioIo;
+    use std::sync::{Arc, Mutex};
+    use tokio::net::TcpListener;
    use url::Url;

-    use super::*;
-    use crate::{http, BranchId, EndpointId};
-
    #[tokio::test]
    async fn metrics() {
-        let listener = TcpListener::bind("0.0.0.0:0").unwrap();
+        type Report = EventChunk<'static, Event<Ids, String>>;
+        let reports: Arc<Mutex<Vec<Report>>> = Arc::default();

-        let reports = Arc::new(Mutex::new(vec![]));
-        let reports2 = reports.clone();
-
-        let server = hyper::server::Server::from_tcp(listener)
-            .unwrap()
-            .serve(make_service_fn(move |_| {
-                let reports = reports.clone();
-                async move {
-                    Ok::<_, Error>(service_fn(move |req| {
+        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let addr = listener.local_addr().unwrap();
+        tokio::spawn({
+            let reports = reports.clone();
+            async move {
+                loop {
+                    if let Ok((stream, _addr)) = listener.accept().await {
                        let reports = reports.clone();
-                        async move {
-                            let bytes = hyper::body::to_bytes(req.into_body()).await?;
-                            let events: EventChunk<'static, Event<Ids, String>> =
-                                serde_json::from_slice(&bytes)?;
-                            reports.lock().unwrap().push(events);
-                            Ok::<_, Error>(Response::new(Body::from(vec![])))
-                        }
-                    }))
+                        http1::Builder::new()
+                            .serve_connection(
+                                TokioIo::new(stream),
+                                service_fn(move |req: Request<Incoming>| {
+                                    let reports = reports.clone();
+                                    async move {
+                                        let bytes = req.into_body().collect().await?.to_bytes();
+                                        let events = serde_json::from_slice(&bytes)?;
+                                        reports.lock().unwrap().push(events);
+                                        Ok::<_, Error>(Response::new(String::new()))
+                                    }
+                                }),
+                            )
+                            .await
+                            .unwrap();
+                    }
                }
-            }));
-        let addr = server.local_addr();
-        tokio::spawn(server);
+            }
+        });

        let metrics = Metrics::default();
        let client = http::new_client();
@@ -536,7 +538,7 @@ mod tests {

        // no counters have been registered
        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
-        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        let r = std::mem::take(&mut *reports.lock().unwrap());
        assert!(r.is_empty());

        // register a new counter
@@ -548,7 +550,7 @@ mod tests {

        // the counter should be observed despite 0 egress
        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
-        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        let r = std::mem::take(&mut *reports.lock().unwrap());
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].events.len(), 1);
        assert_eq!(r[0].events[0].value, 0);
@@ -558,7 +560,7 @@ mod tests {

        // egress should be observered
        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
-        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        let r = std::mem::take(&mut *reports.lock().unwrap());
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].events.len(), 1);
        assert_eq!(r[0].events[0].value, 1);
@@ -568,7 +570,7 @@ mod tests {

        // we do not observe the counter
        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
-        let r = std::mem::take(&mut *reports2.lock().unwrap());
+        let r = std::mem::take(&mut *reports.lock().unwrap());
        assert!(r.is_empty());

        // counter is unregistered
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -15,15 +15,20 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
            }
            Ok(())
        }
-        (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => {
-            Err(AuthError(
-                format!(
-                    "JWT scope '{:?}' is ineligible for Safekeeper auth",
-                    claims.scope
-                )
-                .into(),
-            ))
-        }
+        (
+            Scope::Admin
+            | Scope::PageServerApi
+            | Scope::GenerationsApi
+            | Scope::Infra
+            | Scope::Scrubber,
+            _,
+        ) => Err(AuthError(
+            format!(
+                "JWT scope '{:?}' is ineligible for Safekeeper auth",
+                claims.scope
+            )
+            .into(),
+        )),
        (Scope::SafekeeperData, _) => Ok(()),
    }
 }
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -12,8 +12,8 @@ use metrics::{
    core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
    proto::MetricFamily,
    register_histogram_vec, register_int_counter, register_int_counter_pair,
-    register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter,
-    IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec,
+    register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge,
+    HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec,
 };
 use once_cell::sync::Lazy;

@@ -231,6 +231,14 @@ pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy<IntCounterVec> = Lazy::new(||
    .expect("Failed to register metric")
 });

+/// Gauge exported as `safekeeper_evicted_timelines`: the number of timelines
+/// that are currently evicted.
+pub static NUM_EVICTED_TIMELINES: Lazy<IntGauge> = Lazy::new(|| {
+    let gauge = register_int_gauge!(
+        "safekeeper_evicted_timelines",
+        "Number of currently evicted timelines"
+    );
+    gauge.expect("Failed to register metric")
+});
+
 pub const LABEL_UNKNOWN: &str = "unknown";

 /// Labels for traffic metrics.
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -631,13 +631,19 @@ impl Timeline {

            return Err(e);
        }
-        self.bootstrap(conf, broker_active_set, partial_backup_rate_limiter);
+        self.bootstrap(
+            shared_state,
+            conf,
+            broker_active_set,
+            partial_backup_rate_limiter,
+        );
        Ok(())
    }

    /// Bootstrap new or existing timeline starting background tasks.
    pub fn bootstrap(
        self: &Arc<Timeline>,
+        _shared_state: &mut WriteGuardSharedState<'_>,
        conf: &SafeKeeperConf,
        broker_active_set: Arc<TimelinesSet>,
        partial_backup_rate_limiter: RateLimiter,
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -15,7 +15,9 @@ use tracing::{debug, info, instrument, warn};
 use utils::crashsafe::durable_rename;

 use crate::{
-    metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED},
+    metrics::{
+        EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES,
+    },
    rate_limit::rand_duration,
    timeline_manager::{Manager, StateSnapshot},
    wal_backup,
@@ -93,6 +95,7 @@ impl Manager {
        }

        info!("successfully evicted timeline");
+        NUM_EVICTED_TIMELINES.inc();
    }

    /// Attempt to restore evicted timeline from remote storage; it must be
@@ -128,6 +131,7 @@ impl Manager {
            tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident);

        info!("successfully restored evicted timeline");
+        NUM_EVICTED_TIMELINES.dec();
    }
 }

--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -25,7 +25,10 @@ use utils::lsn::Lsn;

 use crate::{
    control_file::{FileStorage, Storage},
-    metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS},
+    metrics::{
+        MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS,
+        NUM_EVICTED_TIMELINES,
+    },
    rate_limit::{rand_duration, RateLimiter},
    recovery::recovery_main,
    remove_wal::calc_horizon_lsn,
@@ -251,6 +254,11 @@ pub async fn main_task(
        mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone())));
    }

+    // If timeline is evicted, reflect that in the metric.
+    if mgr.is_offloaded {
+        NUM_EVICTED_TIMELINES.inc();
+    }
+
    let last_state = 'outer: loop {
        MANAGER_ITERATIONS_TOTAL.inc();

@@ -367,6 +375,11 @@ pub async fn main_task(
        mgr.update_wal_removal_end(res);
    }

+    // If timeline is deleted while evicted decrement the gauge.
+    if mgr.tli.is_cancelled() && mgr.is_offloaded {
+        NUM_EVICTED_TIMELINES.dec();
+    }
+
    mgr.set_status(Status::Finished);
 }

--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -165,12 +165,14 @@ impl GlobalTimelines {
                        match Timeline::load_timeline(&conf, ttid) {
                            Ok(timeline) => {
                                let tli = Arc::new(timeline);
+                                let mut shared_state = tli.write_shared_state().await;
                                TIMELINES_STATE
                                    .lock()
                                    .unwrap()
                                    .timelines
                                    .insert(ttid, tli.clone());
                                tli.bootstrap(
+                                    &mut shared_state,
                                    &conf,
                                    broker_active_set.clone(),
                                    partial_backup_rate_limiter.clone(),
@@ -213,6 +215,7 @@ impl GlobalTimelines {
        match Timeline::load_timeline(&conf, ttid) {
            Ok(timeline) => {
                let tli = Arc::new(timeline);
+                let mut shared_state = tli.write_shared_state().await;

                // TODO: prevent concurrent timeline creation/loading
                {
@@ -227,8 +230,13 @@ impl GlobalTimelines {
                    state.timelines.insert(ttid, tli.clone());
                }

-                tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter);
-
+                tli.bootstrap(
+                    &mut shared_state,
+                    &conf,
+                    broker_active_set,
+                    partial_backup_rate_limiter,
+                );
+                drop(shared_state);
                Ok(tli)
            }
            // If we can't load a timeline, it's bad. Caller will figure it out.
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -17,7 +17,9 @@ use std::time::Duration;
 use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::{XLogSegNo, PG_TLI};
-use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata};
+use remote_storage::{
+    DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata,
+};
 use tokio::fs::File;

 use tokio::select;
@@ -503,8 +505,12 @@ pub async fn read_object(

    let cancel = CancellationToken::new();

+    let opts = DownloadOpts {
+        byte_start: std::ops::Bound::Included(offset),
+        ..Default::default()
+    };
    let download = storage
-        .download_storage_object(Some((offset, None)), file_path, &cancel)
+        .download(file_path, &opts, &cancel)
        .await
        .with_context(|| {
            format!("Failed to open WAL segment download stream for remote path {file_path:?}")
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -636,7 +636,7 @@ async fn handle_tenant_list(
 }

 async fn handle_node_register(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
+    check_permissions(&req, Scope::Infra)?;

    let mut req = match maybe_forward(req).await {
        ForwardOutcome::Forwarded(res) => {
@@ -1182,7 +1182,7 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api
 /// Assumes information is only relayed to storage controller after first selecting an unique id on
 /// control plane database, which means we have an id field in the request and payload.
 async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
+    check_permissions(&req, Scope::Infra)?;

    let body = json_request::<SafekeeperPersistence>(&mut req).await?;
    let id = parse_request_param::<i64>(&req, "id")?;
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -22,7 +22,7 @@ use utils::sync::gate::GateGuard;

 use crate::compute_hook::{ComputeHook, NotifyError};
 use crate::node::Node;
-use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
+use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation};

 const DEFAULT_HEATMAP_PERIOD: &str = "60s";

@@ -45,8 +45,15 @@ pub(super) struct Reconciler {
    pub(crate) reconciler_config: ReconcilerConfig,

    pub(crate) config: TenantConfig,
+
+    /// Observed state from the point of view of the reconciler.
+    /// This gets updated as the reconciliation makes progress.
    pub(crate) observed: ObservedState,

+    /// Snapshot of the observed state at the point when the reconciler
+    /// was spawned.
+    pub(crate) original_observed: ObservedState,
+
    pub(crate) service_config: service::Config,

    /// A hook to notify the running postgres instances when we change the location
@@ -846,6 +853,39 @@ impl Reconciler {
        }
    }

+    /// Diff the reconciler's current view of observed state against the snapshot
+    /// captured when it was created, yielding the deltas that describe the change.
+    ///
+    /// A location that is new, or whose `conf` differs from the snapshot, produces
+    /// an `Upsert`; a location present only in the snapshot produces a `Delete`.
+    /// All upserts precede all deletes in the returned vector.
+    pub(crate) fn observed_deltas(&self) -> Vec<ObservedStateDelta> {
+        let before = &self.original_observed.locations;
+        let after = &self.observed.locations;
+
+        // Nodes that appeared, or whose location config no longer matches the snapshot.
+        let upserts = after
+            .iter()
+            .filter(|(node_id, location)| match before.get(*node_id) {
+                Some(prev) => location.conf != prev.conf,
+                None => true,
+            })
+            .map(|(node_id, location)| {
+                ObservedStateDelta::Upsert(Box::new((*node_id, location.clone())))
+            });
+
+        // Nodes that were in the snapshot but are no longer present.
+        let deletes = before
+            .keys()
+            .filter(|node_id| !after.contains_key(*node_id))
+            .map(|node_id| ObservedStateDelta::Delete(*node_id));
+
+        upserts.chain(deletes).collect()
+    }
+
    /// Keep trying to notify the compute indefinitely, only dropping out if:
    /// - the node `origin` becomes unavailable -> Ok(())
    /// - the node `origin` no longer has our tenant shard attached -> Ok(())
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -28,8 +28,8 @@ use crate::{
    reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
    scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode},
    tenant_shard::{
-        MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
-        ScheduleOptimizationAction,
+        MigrateAttachment, ObservedStateDelta, ReconcileNeeded, ReconcilerStatus,
+        ScheduleOptimization, ScheduleOptimizationAction,
    },
 };
 use anyhow::Context;
@@ -246,6 +246,11 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError {
            // storage controller's auth configuration.
            ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}"))
        }
+        mgmt_api::Error::ApiError(status @ StatusCode::TOO_MANY_REQUESTS, msg) => {
+            // Pass through 429 errors: if pageserver is asking us to wait + retry, we in
+            // turn ask our clients to wait + retry
+            ApiError::Conflict(format!("{node} {status}: {status} {msg}"))
+        }
        mgmt_api::Error::ApiError(status, msg) => {
            // Presume general case of pageserver API errors is that we tried to do something
            // that can't be done right now.
@@ -1072,7 +1077,7 @@ impl Service {
        tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(),
        sequence=%result.sequence
    ))]
-    fn process_result(&self, mut result: ReconcileResult) {
+    fn process_result(&self, result: ReconcileResult) {
        let mut locked = self.inner.write().unwrap();
        let (nodes, tenants, _scheduler) = locked.parts_mut();
        let Some(tenant) = tenants.get_mut(&result.tenant_shard_id) else {
@@ -1094,22 +1099,27 @@ impl Service {

        // In case a node was deleted while this reconcile is in flight, filter it out of the update we will
        // make to the tenant
-        result
-            .observed
-            .locations
-            .retain(|node_id, _loc| nodes.contains_key(node_id));
+        let deltas = result.observed_deltas.into_iter().flat_map(|delta| {
+            // In case a node was deleted while this reconcile is in flight, filter it out of the update we will
+            // make to the tenant
+            let node = nodes.get(delta.node_id())?;
+
+            if node.is_available() {
+                return Some(delta);
+            }
+
+            // In case a node became unavailable concurrently with the reconcile, observed
+            // locations on it are now uncertain. By convention, set them to None in order
+            // for them to get refreshed when the node comes back online.
+            Some(ObservedStateDelta::Upsert(Box::new((
+                node.get_id(),
+                ObservedStateLocation { conf: None },
+            ))))
+        });

        match result.result {
            Ok(()) => {
-                for (node_id, loc) in &result.observed.locations {
-                    if let Some(conf) = &loc.conf {
-                        tracing::info!("Updating observed location {}: {:?}", node_id, conf);
-                    } else {
-                        tracing::info!("Setting observed location {} to None", node_id,)
-                    }
-                }
-
-                tenant.observed = result.observed;
+                tenant.apply_observed_deltas(deltas);
                tenant.waiter.advance(result.sequence);
            }
            Err(e) => {
@@ -1131,9 +1141,10 @@ impl Service {
                // so that waiters will see the correct error after waiting.
                tenant.set_last_error(result.sequence, e);

-                for (node_id, o) in result.observed.locations {
-                    tenant.observed.locations.insert(node_id, o);
-                }
+                // Skip deletions on reconcile failures
+                let upsert_deltas =
+                    deltas.filter(|delta| matches!(delta, ObservedStateDelta::Upsert(_)));
+                tenant.apply_observed_deltas(upsert_deltas);
            }
        }

--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -425,6 +425,22 @@ pub(crate) enum ReconcileNeeded {
    Yes,
 }

+/// A single change to apply to a tenant shard's observed state.
+///
+/// Reconcilers produce these via [`Reconciler::observed_deltas`]; they are applied in
+/// [`crate::service::Service::process_result`].
+pub(crate) enum ObservedStateDelta {
+    /// Insert or overwrite the observed location recorded for a node.
+    Upsert(Box<(NodeId, ObservedStateLocation)>),
+    /// Remove the observed location recorded for a node.
+    Delete(NodeId),
+}
+
+impl ObservedStateDelta {
+    /// Borrow the id of the node this delta refers to, regardless of variant.
+    pub(crate) fn node_id(&self) -> &NodeId {
+        match self {
+            Self::Delete(node_id) => node_id,
+            Self::Upsert(entry) => {
+                let (node_id, _location) = &**entry;
+                node_id
+            }
+        }
+    }
+}
+
 /// When a reconcile task completes, it sends this result object
 /// to be applied to the primary TenantShard.
 pub(crate) struct ReconcileResult {
@@ -437,7 +453,7 @@ pub(crate) struct ReconcileResult {

    pub(crate) tenant_shard_id: TenantShardId,
    pub(crate) generation: Option<Generation>,
-    pub(crate) observed: ObservedState,
+    pub(crate) observed_deltas: Vec<ObservedStateDelta>,

    /// Set [`TenantShard::pending_compute_notification`] from this flag
    pub(crate) pending_compute_notification: bool,
@@ -1123,7 +1139,7 @@ impl TenantShard {
            result,
            tenant_shard_id: reconciler.tenant_shard_id,
            generation: reconciler.generation,
-            observed: reconciler.observed,
+            observed_deltas: reconciler.observed_deltas(),
            pending_compute_notification: reconciler.compute_notify_failure,
        }
    }
@@ -1177,6 +1193,7 @@ impl TenantShard {
            reconciler_config,
            config: self.config.clone(),
            observed: self.observed.clone(),
+            original_observed: self.observed.clone(),
            compute_hook: compute_hook.clone(),
            service_config: service_config.clone(),
            _gate_guard: gate_guard,
@@ -1437,6 +1454,62 @@ impl TenantShard {
            .map(|(node_id, gen)| (node_id, Generation::new(gen)))
            .collect()
    }
+
+    /// Fold a sequence of observed-state deltas into `self.observed`.
+    ///
+    /// The deltas originate from [`Reconciler::observed_deltas`] and are filtered by
+    /// [`crate::service::Service::process_result`] before they are applied here.
+    ///
+    /// An upsert whose generation is older than the one already recorded for that
+    /// node is not applied; the node's observed location is set to `None` instead.
+    pub(crate) fn apply_observed_deltas(
+        &mut self,
+        deltas: impl Iterator<Item = ObservedStateDelta>,
+    ) {
+        for delta in deltas {
+            // Deletions need no generation check: handle them first and move on.
+            let (node_id, loc) = match delta {
+                ObservedStateDelta::Delete(node_id) => {
+                    tracing::info!("Deleting observed location {}", node_id);
+                    self.observed.locations.remove(&node_id);
+                    continue;
+                }
+                ObservedStateDelta::Upsert(ups) => *ups,
+            };
+
+            // Generation currently recorded for this node (if any) and the one
+            // carried by the incoming delta.
+            let crnt_gen = self
+                .observed
+                .locations
+                .get(&node_id)
+                .and_then(|existing| existing.conf.as_ref())
+                .and_then(|conf| conf.generation);
+            let new_gen = loc.conf.as_ref().and_then(|conf| conf.generation);
+
+            // A delta that lags behind the recorded generation means we raced with
+            // another update and cannot be certain about the true observed state:
+            // record `None` rather than trusting either side.
+            if let (Some(crnt), Some(new)) = (crnt_gen, new_gen) {
+                if crnt > new {
+                    tracing::warn!(
+                        "Skipping observed state update {}: {:?} and using None due to stale generation ({} > {})",
+                        node_id, loc, crnt, new
+                    );
+
+                    self.observed
+                        .locations
+                        .insert(node_id, ObservedStateLocation { conf: None });
+
+                    continue;
+                }
+            }
+
+            match &loc.conf {
+                Some(conf) => {
+                    tracing::info!("Updating observed location {}: {:?}", node_id, conf)
+                }
+                None => tracing::info!("Setting observed location {} to None", node_id,),
+            }
+
+            self.observed.locations.insert(node_id, loc);
+        }
+    }
 }

 #[cfg(test)]
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -317,9 +317,8 @@ pub async fn scan_pageserver_metadata(
        tenant_timeline_results.push((ttid, data));
    }

-    let tenant_id = tenant_id.expect("Must be set if results are present");
-
    if !tenant_timeline_results.is_empty() {
+        let tenant_id = tenant_id.expect("Must be set if results are present");
        analyze_tenant(
            &remote_client,
            tenant_id,
--- a/Show More
+++ b/Show More