docker compose with compute-node build commands

2026-06-01 12:30:38 +00:00 · 2024-12-24 21:12:25 +00:00
233 changed files with 5348 additions and 11891 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -3,16 +3,6 @@
 # by the RUSTDOCFLAGS env var in CI.
 rustdocflags = ["-Arustdoc::private_intra_doc_links"]

-# Enable frame pointers. This may have a minor performance overhead, but makes it easier and more
-# efficient to obtain stack traces (and thus CPU/heap profiles). It may also avoid seg faults that
-# we've seen with libunwind-based profiling. See also:
-#
-# * <https://www.brendangregg.com/blog/2024-03-17/the-return-of-the-frame-pointers.html>
-# * <https://github.com/rust-lang/rust/pull/122646>
-#
-# NB: the RUSTFLAGS envvar will replace this. Make sure to update e.g. Dockerfile as well.
-rustflags = ["-Cforce-frame-pointers=yes"]
-
 [alias]
 build_testing = ["build", "--features", "testing"]
 neon = ["run", "--bin", "neon_local"]
--- a/.github/file-filters.yaml
+++ b/.github/file-filters.yaml
@@ -1,12 +0,0 @@
-rust_code: ['**/*.rs', '**/Cargo.toml', '**/Cargo.lock']
-
-v14: ['vendor/postgres-v14/**', 'Makefile', 'pgxn/**']
-v15: ['vendor/postgres-v15/**', 'Makefile', 'pgxn/**']
-v16: ['vendor/postgres-v16/**', 'Makefile', 'pgxn/**']
-v17: ['vendor/postgres-v17/**', 'Makefile', 'pgxn/**']
-
-rebuild_neon_extra:
-    - .github/workflows/neon_extra_builds.yml
-
-rebuild_macos:
-    - .github/workflows/build-macos.yml
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -33,7 +33,7 @@ jobs:
          # SC2086 - Double quote to prevent globbing and word splitting. - https://www.shellcheck.net/wiki/SC2086
          SHELLCHECK_OPTS: --exclude=SC2046,SC2086
        with:
-          fail_level: error
+          fail_on_error: true
          filter_mode: nofilter
          level: error

--- a/.github/workflows/build-macos.yml
+++ b/.github/workflows/build-macos.yml
@@ -1,241 +0,0 @@
-name: Check neon with MacOS builds
-
-on:
-  workflow_call:
-    inputs:
-      pg_versions:
-        description: "Array of the pg versions to build for, for example: ['v14', 'v17']"
-        type: string
-        default: '[]'
-        required: false
-      rebuild_rust_code:
-        description: "Rebuild Rust code"
-        type: boolean
-        default: false
-        required: false
-      rebuild_everything:
-        description: "If true, rebuild for all versions"
-        type: boolean
-        default: false
-        required: false
-
-env:
-  RUST_BACKTRACE: 1
-  COPT: '-Werror'
-
-# TODO: move `check-*` and `files-changed` jobs to the "Caller" Workflow
-# We should care about that as Github has limitations:
-# - You can connect up to four levels of workflows
-# - You can call a maximum of 20 unique reusable workflows from a single workflow file.
-# https://docs.github.com/en/actions/sharing-automations/reusing-workflows#limitations
-jobs:
-  build-pgxn:
-    if: |
-      (inputs.pg_versions != '[]' || inputs.rebuild_everything) && (
-        contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
-        contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
-        github.ref_name == 'main'
-      )
-    timeout-minutes: 30
-    runs-on: macos-15
-    strategy:
-      matrix:
-        postgres-version: ${{ inputs.rebuild_everything && fromJson('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }}
-    env:
-      # Use release build only, to have less debug info around
-      # Hence keeping target/ (and general cache size) smaller
-      BUILD_TYPE: release
-    steps:
-      - name: Checkout main repo
-        uses: actions/checkout@v4
-
-      - name: Set pg ${{ matrix.postgres-version }} for caching
-        id: pg_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-${{ matrix.postgres-version }}) | tee -a "${GITHUB_OUTPUT}"
-
-      - name: Cache postgres ${{ matrix.postgres-version }} build
-        id: cache_pg
-        uses: actions/cache@v4
-        with:
-          path: pg_install/${{ matrix.postgres-version }}
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
-      - name: Checkout submodule vendor/postgres-${{ matrix.postgres-version }}
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          git submodule init vendor/postgres-${{ matrix.postgres-version }}
-          git submodule update --depth 1 --recursive
-
-      - name: Install build dependencies
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          brew install flex bison openssl protobuf icu4c
-
-      - name: Set extra env for macOS
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
-          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
-
-      - name: Build Postgres ${{ matrix.postgres-version }}
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          make postgres-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
-
-      - name: Build Neon Pg Ext ${{ matrix.postgres-version }}
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu)
-
-      - name: Get postgres headers ${{ matrix.postgres-version }}
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
-
-  build-walproposer-lib:
-    if: |
-      (inputs.pg_versions != '[]' || inputs.rebuild_everything) && (
-        contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
-        contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
-        github.ref_name == 'main'
-      )
-    timeout-minutes: 30
-    runs-on: macos-15
-    needs: [build-pgxn]
-    env:
-      # Use release build only, to have less debug info around
-      # Hence keeping target/ (and general cache size) smaller
-      BUILD_TYPE: release
-    steps:
-      - name: Checkout main repo
-        uses: actions/checkout@v4
-
-      - name: Set pg v17 for caching
-        id: pg_rev
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}"
-
-      - name: Cache postgres v17 build
-        id: cache_pg
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
-      - name: Cache walproposer-lib
-        id: cache_walproposer_lib
-        uses: actions/cache@v4
-        with:
-          path: pg_install/build/walproposer-lib
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
-      - name: Checkout submodule vendor/postgres-v17
-        if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
-        run: |
-          git submodule init vendor/postgres-v17
-          git submodule update --depth 1 --recursive
-
-      - name: Install build dependencies
-        if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
-        run: |
-          brew install flex bison openssl protobuf icu4c
-
-      - name: Set extra env for macOS
-        if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
-        run: |
-          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
-          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
-
-      - name: Build walproposer-lib (only for v17)
-        if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
-        run:
-          make walproposer-lib -j$(sysctl -n hw.ncpu)
-
-  cargo-build:
-    if: |
-      (inputs.pg_versions != '[]' || inputs.rebuild_rust_code || inputs.rebuild_everything) && (
-        contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
-        contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
-        github.ref_name == 'main'
-      )
-    timeout-minutes: 30
-    runs-on: macos-15
-    needs: [build-pgxn, build-walproposer-lib]
-    env:
-      # Use release build only, to have less debug info around
-      # Hence keeping target/ (and general cache size) smaller
-      BUILD_TYPE: release
-    steps:
-      - name: Checkout main repo
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-
-      - name: Set pg v14 for caching
-        id: pg_rev_v14
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) | tee -a "${GITHUB_OUTPUT}"
-      - name: Set pg v15 for caching
-        id: pg_rev_v15
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) | tee -a "${GITHUB_OUTPUT}"
-      - name: Set pg v16 for caching
-        id: pg_rev_v16
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) | tee -a "${GITHUB_OUTPUT}"
-      - name: Set pg v17 for caching
-        id: pg_rev_v17
-        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) | tee -a "${GITHUB_OUTPUT}"
-
-      - name: Cache postgres v14 build
-        id: cache_pg
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-      - name: Cache postgres v15 build
-        id: cache_pg_v15
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-      - name: Cache postgres v16 build
-        id: cache_pg_v16
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-      - name: Cache postgres v17 build
-        id: cache_pg_v17
-        uses: actions/cache@v4
-        with:
-          path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
-      - name: Cache cargo deps (only for v17)
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cargo/registry
-            !~/.cargo/registry/src
-            ~/.cargo/git
-            target
-          key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
-
-      - name: Cache walproposer-lib
-        id: cache_walproposer_lib
-        uses: actions/cache@v4
-        with:
-          path: pg_install/build/walproposer-lib
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
-
-      - name: Install build dependencies
-        run: |
-          brew install flex bison openssl protobuf icu4c
-
-      - name: Set extra env for macOS
-        run: |
-          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
-          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
-
-      - name: Run cargo build (only for v17)
-        run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu)
-
-      - name: Check that no warnings are produced (only for v17)
-        run: ./run_clippy.sh
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -212,7 +212,7 @@ jobs:
          fi
          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
      - name: Run cargo clippy (debug)
-        run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS

      - name: Check documentation generation
        run: cargo doc --workspace --no-deps --document-private-items
@@ -538,7 +538,7 @@ jobs:

  trigger-e2e-tests:
    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }}
-    needs: [ check-permissions, promote-images-dev, tag ]
+    needs: [ check-permissions, promote-images, tag ]
    uses: ./.github/workflows/trigger-e2e-tests.yml
    secrets: inherit

@@ -728,6 +728,30 @@ jobs:
          tags: |
            neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}

+      - name: Build compute-tools image
+        # compute-tools are Postgres independent, so build it only once
+        # We pick 16, because that builds on debian 11 with older glibc (and is
+        # thus compatible with newer glibc), rather than 17 on Debian 12, as
+        # that isn't guaranteed to be compatible with Debian 11
+        if: matrix.version.pg == 'v16'
+        uses: docker/build-push-action@v6
+        with:
+          target: compute-tools-image
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
+            DEBIAN_VERSION=${{ matrix.version.debian }}
+          provenance: false
+          push: true
+          pull: true
+          file: compute/compute-node.Dockerfile
+          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
+          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
+          tags: |
+            neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
+
  compute-node-image:
    needs: [ compute-node-image-arch, tag ]
    permissions:
@@ -770,6 +794,14 @@ jobs:
                                             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
                                             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

+      - name: Create multi-arch compute-tools image
+        if: matrix.version.pg == 'v16'
+        run: |
+          docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
+                                          -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
+                                             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
+
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
@@ -785,6 +817,12 @@ jobs:
          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
                                                                                neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}

+      - name: Push multi-arch compute-tools image to ECR
+        if: matrix.version.pg == 'v16'
+        run: |
+          docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
+                                                                                neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
+
  vm-compute-node-image:
    needs: [ check-permissions, tag, compute-node-image ]
    runs-on: [ self-hosted, large ]
@@ -892,8 +930,8 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml logs || 0
          docker compose -f ./docker-compose/docker-compose.yml down

-  promote-images-dev:
-    needs: [ check-permissions, tag, vm-compute-node-image ]
+  promote-images:
+    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04

    permissions:
@@ -927,35 +965,6 @@ jobs:
                                               neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
          done

-  promote-images-prod:
-    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
-    runs-on: ubuntu-22.04
-    if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
-
-    permissions:
-      id-token: write # aws-actions/configure-aws-credentials
-      statuses: write
-      contents: read
-
-    env:
-      VERSIONS: v14 v15 v16 v17
-
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-region: eu-central-1
-          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
-          role-duration-seconds: 3600
-
-      - name: Login to Amazon Dev ECR
-        uses: aws-actions/amazon-ecr-login@v2
-
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
      - name: Add latest tag to images
        if: github.ref_name == 'main'
        run: |
@@ -963,6 +972,9 @@ jobs:
            docker buildx imagetools create -t $repo/neon:latest \
                                               $repo/neon:${{ needs.tag.outputs.build-tag }}

+            docker buildx imagetools create -t $repo/compute-tools:latest \
+                                               $repo/compute-tools:${{ needs.tag.outputs.build-tag }}
+
            for version in ${VERSIONS}; do
              docker buildx imagetools create -t $repo/compute-node-${version}:latest \
                                                 $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
@@ -991,31 +1003,31 @@ jobs:
      - name: Copy all images to prod ECR
        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
        run: |
-          for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do
+          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
            docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
                                               369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
          done

  push-to-acr-dev:
    if: github.ref_name == 'main'
-    needs: [ tag, promote-images-dev ]
+    needs: [ tag, promote-images ]
    uses: ./.github/workflows/_push-to-acr.yml
    with:
      client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
      registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
      subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
      tenant_id: ${{ vars.AZURE_TENANT_ID }}

  push-to-acr-prod:
    if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
-    needs: [ tag, promote-images-prod ]
+    needs: [ tag, promote-images ]
    uses: ./.github/workflows/_push-to-acr.yml
    with:
      client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
      image_tag: ${{ needs.tag.outputs.build-tag }}
-      images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
+      images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
      registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
      subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
      tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -1100,7 +1112,7 @@ jobs:
          exit 1

  deploy:
-    needs: [ check-permissions, promote-images-prod, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
+    needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()
    permissions:
@@ -1321,7 +1333,7 @@ jobs:
          done

  pin-build-tools-image:
-    needs: [ build-build-tools-image, promote-images-prod, build-and-test-locally ]
+    needs: [ build-build-tools-image, promote-images, build-and-test-locally ]
    if: github.ref_name == 'main'
    uses: ./.github/workflows/pin-build-tools-image.yml
    with:
@@ -1344,7 +1356,7 @@ jobs:
      - build-and-test-locally
      - check-codestyle-python
      - check-codestyle-rust
-      - promote-images-dev
+      - promote-images
      - test-images
      - trigger-custom-extensions-build-and-wait
    runs-on: ubuntu-22.04
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -31,15 +31,19 @@ jobs:
    uses: ./.github/workflows/build-build-tools-image.yml
    secrets: inherit

-  files-changed:
-    name: Detect what files changed
-    runs-on: ubuntu-22.04
-    timeout-minutes: 3
-    outputs:
-      v17: ${{ steps.files_changed.outputs.v17 }}
-      postgres_changes: ${{ steps.postgres_changes.outputs.changes }}
-      rebuild_rust_code: ${{ steps.files_changed.outputs.rust_code }}
-      rebuild_everything: ${{ steps.files_changed.outputs.rebuild_neon_extra || steps.files_changed.outputs.rebuild_macos }}
+  check-macos-build:
+    needs: [ check-permissions ]
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
+    timeout-minutes: 90
+    runs-on: macos-15
+
+    env:
+      # Use release build only, to have less debug info around
+      # Hence keeping target/ (and general cache size) smaller
+      BUILD_TYPE: release

    steps:
      - name: Checkout
@@ -47,45 +51,106 @@ jobs:
        with:
          submodules: true

-      - name: Check for Postgres changes
-        uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242  #v3
-        id: files_changed
+      - name: Install macOS postgres dependencies
+        run: brew install flex bison openssl protobuf icu4c
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      - name: Set pg 17 revision for caching
+        id: pg_v17_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v4
        with:
-          token: ${{ github.token }}
-          filters: .github/file-filters.yaml
-          base: ${{ github.event_name != 'pull_request' && (github.event.merge_group.base_ref || github.ref_name) || '' }}
-          ref: ${{ github.event_name != 'pull_request' && (github.event.merge_group.head_ref || github.ref) || '' }}
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

-      - name: Filter out only v-string for build matrix
-        id: postgres_changes
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v17 build
+        id: cache_pg_17
+        uses: actions/cache@v4
+        with:
+          path: pg_install/v17
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Set extra env for macOS
        run: |
-          v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.chnages }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c)
-          echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}"
+          echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
+          echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV

-  check-macos-build:
-    needs: [ check-permissions, files-changed ]
-    if: |
-      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
-      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
-      github.ref_name == 'main'
-    uses: ./.github/workflows/build-macos.yml
-    with:
-      pg_versions: ${{ needs.files-changed.outputs.postgres_changes }}
-      rebuild_rust_code: ${{ needs.files-changed.outputs.rebuild_rust_code }}
-      rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }}
+      - name: Cache cargo deps
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            !~/.cargo/registry/src
+            ~/.cargo/git
+            target
+          key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: make postgres-v14 -j$(sysctl -n hw.ncpu)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: make postgres-v15 -j$(sysctl -n hw.ncpu)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: make postgres-v16 -j$(sysctl -n hw.ncpu)
+
+      - name: Build postgres v17
+        if: steps.cache_pg_17.outputs.cache-hit != 'true'
+        run: make postgres-v17 -j$(sysctl -n hw.ncpu)
+
+      - name: Build neon extensions
+        run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
+
+      - name: Build walproposer-lib
+        run: make walproposer-lib -j$(sysctl -n hw.ncpu)
+
+      - name: Run cargo build
+        run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release
+
+      - name: Check that no warnings are produced
+        run: ./run_clippy.sh

  gather-rust-build-stats:
-    needs: [ check-permissions, build-build-tools-image, files-changed ]
+    needs: [ check-permissions, build-build-tools-image ]
    permissions:
      id-token: write # aws-actions/configure-aws-credentials
      statuses: write
      contents: write
    if: |
-      (needs.files-changed.outputs.v17 == 'true' || needs.files-changed.outputs.rebuild_everything == 'true') && (
-        contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
-        contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
-        github.ref_name == 'main'
-      )
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    runs-on: [ self-hosted, large ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -68,7 +68,7 @@ jobs:
      GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      TAG: ${{ needs.tag.outputs.build-tag }}
    steps:
-      - name: Wait for `promote-images-dev` job to finish
+      - name: Wait for `promote-images` job to finish
        # It's important to have a timeout here, the script in the step can run infinitely
        timeout-minutes: 60
        run: |
@@ -79,17 +79,17 @@ jobs:
          # For PRs we use the run id as the tag
          BUILD_AND_TEST_RUN_ID=${TAG}
          while true; do
-            conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images-dev") | .conclusion')
+            conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
            case "$conclusion" in
              success)
                break
                ;;
              failure | cancelled | skipped)
-                echo "The 'promote-images-dev' job didn't succeed: '${conclusion}'. Exiting..."
+                echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
                exit 1
                ;;
              *)
-                echo "The 'promote-images-dev' hasn't succeed yet. Waiting..."
+                echo "The 'promote-images' hasn't succeed yet. Waiting..."
                sleep 60
                ;;
            esac
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,5 @@ compaction-suite-results.*

 # pgindent typedef lists
 *.list
+
+venv/
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -718,13 +718,13 @@ dependencies = [

 [[package]]
 name = "axum"
-version = "0.7.9"
+version = "0.7.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
+checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf"
 dependencies = [
 "async-trait",
 "axum-core",
- "base64 0.22.1",
+ "base64 0.21.1",
 "bytes",
 "futures-util",
 "http 1.1.0",
@@ -746,8 +746,8 @@ dependencies = [
 "sha1",
 "sync_wrapper 1.0.1",
 "tokio",
- "tokio-tungstenite 0.24.0",
- "tower 0.5.2",
+ "tokio-tungstenite",
+ "tower",
 "tower-layer",
 "tower-service",
 "tracing",
@@ -1267,7 +1267,6 @@ dependencies = [
 "aws-config",
 "aws-sdk-kms",
 "aws-sdk-s3",
- "axum",
 "base64 0.13.1",
 "bytes",
 "camino",
@@ -1275,10 +1274,9 @@ dependencies = [
 "chrono",
 "clap",
 "compute_api",
- "fail",
 "flate2",
 "futures",
- "http 1.1.0",
+ "hyper 0.14.30",
 "metrics",
 "nix 0.27.1",
 "notify",
@@ -1304,8 +1302,6 @@ dependencies = [
 "tokio-postgres",
 "tokio-stream",
 "tokio-util",
- "tower 0.5.2",
- "tower-http",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -1605,32 +1601,6 @@ dependencies = [
 "typenum",
 ]

-[[package]]
-name = "curve25519-dalek"
-version = "4.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be"
-dependencies = [
- "cfg-if",
- "cpufeatures",
- "curve25519-dalek-derive",
- "digest",
- "fiat-crypto",
- "rustc_version",
- "subtle",
-]
-
-[[package]]
-name = "curve25519-dalek-derive"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.90",
-]
-
 [[package]]
 name = "darling"
 version = "0.20.1"
@@ -1679,20 +1649,6 @@ dependencies = [
 "parking_lot_core 0.9.8",
 ]

-[[package]]
-name = "dashmap"
-version = "6.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
-dependencies = [
- "cfg-if",
- "crossbeam-utils",
- "hashbrown 0.14.5",
- "lock_api",
- "once_cell",
- "parking_lot_core 0.9.8",
-]
-
 [[package]]
 name = "data-encoding"
 version = "2.4.0"
@@ -1776,9 +1732,9 @@ checksum = "ab03c107fafeb3ee9f5925686dbb7a73bc76e3932abb0d2b365cb64b169cf04c"

 [[package]]
 name = "diesel"
-version = "2.2.6"
+version = "2.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ccf1bedf64cdb9643204a36dd15b19a6ce8e7aa7f7b105868e9f1fad5ffa7d12"
+checksum = "65e13bab2796f412722112327f3e575601a3e9cdcbe426f0d30dbf43f3f5dc71"
 dependencies = [
 "bitflags 2.4.1",
 "byteorder",
@@ -1901,28 +1857,6 @@ dependencies = [
 "spki 0.7.3",
 ]

-[[package]]
-name = "ed25519"
-version = "2.2.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53"
-dependencies = [
- "signature 2.2.0",
-]
-
-[[package]]
-name = "ed25519-dalek"
-version = "2.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4a3daa8e81a3963a60642bcc1f90a670680bd4a77535faa384e9d1c79d620871"
-dependencies = [
- "curve25519-dalek",
- "ed25519",
- "rand_core 0.6.4",
- "sha2",
- "subtle",
-]
-
 [[package]]
 name = "either"
 version = "1.8.1"
@@ -2014,15 +1948,6 @@ dependencies = [
 "syn 2.0.90",
 ]

-[[package]]
-name = "env_filter"
-version = "0.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0"
-dependencies = [
- "log",
-]
-
 [[package]]
 name = "env_logger"
 version = "0.10.2"
@@ -2036,16 +1961,6 @@ dependencies = [
 "termcolor",
 ]

-[[package]]
-name = "env_logger"
-version = "0.11.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d"
-dependencies = [
- "env_filter",
- "log",
-]
-
 [[package]]
 name = "equator"
 version = "0.2.2"
@@ -2161,12 +2076,6 @@ dependencies = [
 "subtle",
 ]

-[[package]]
-name = "fiat-crypto"
-version = "0.2.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
-
 [[package]]
 name = "filetime"
 version = "0.2.22"
@@ -2810,7 +2719,7 @@ dependencies = [
 "pin-project-lite",
 "socket2",
 "tokio",
- "tower 0.4.13",
+ "tower",
 "tower-service",
 "tracing",
 ]
@@ -3035,28 +2944,6 @@ dependencies = [
 "str_stack",
 ]

-[[package]]
-name = "inferno"
-version = "0.12.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75a5d75fee4d36809e6b021e4b96b686e763d365ffdb03af2bd00786353f84fe"
-dependencies = [
- "ahash",
- "clap",
- "crossbeam-channel",
- "crossbeam-utils",
- "dashmap 6.1.0",
- "env_logger 0.11.2",
- "indexmap 2.0.1",
- "itoa",
- "log",
- "num-format",
- "once_cell",
- "quick-xml 0.37.1",
- "rgb",
- "str_stack",
-]
-
 [[package]]
 name = "inotify"
 version = "0.9.6"
@@ -3264,7 +3151,7 @@ version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
 dependencies = [
- "dashmap 5.5.0",
+ "dashmap",
 "hashbrown 0.13.2",
 ]

@@ -3372,9 +3259,9 @@ checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40"

 [[package]]
 name = "matchit"
-version = "0.8.4"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
+checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed"

 [[package]]
 name = "md-5"
@@ -3802,23 +3689,23 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"

 [[package]]
 name = "opentelemetry"
-version = "0.27.1"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7"
+checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17"
 dependencies = [
 "futures-core",
 "futures-sink",
 "js-sys",
+ "once_cell",
 "pin-project-lite",
 "thiserror",
- "tracing",
 ]

 [[package]]
 name = "opentelemetry-http"
-version = "0.27.0"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80"
+checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99"
 dependencies = [
 "async-trait",
 "bytes",
@@ -3829,9 +3716,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-otlp"
-version = "0.27.0"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76"
+checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd"
 dependencies = [
 "async-trait",
 "futures-core",
@@ -3847,9 +3734,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-proto"
-version = "0.27.0"
+version = "0.26.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
+checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34"
 dependencies = [
 "opentelemetry",
 "opentelemetry_sdk",
@@ -3859,21 +3746,22 @@ dependencies = [

 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.27.0"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52"
+checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09"

 [[package]]
 name = "opentelemetry_sdk"
-version = "0.27.1"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8"
+checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3"
 dependencies = [
 "async-trait",
 "futures-channel",
 "futures-executor",
 "futures-util",
 "glob",
+ "once_cell",
 "opentelemetry",
 "percent-encoding",
 "rand 0.8.5",
@@ -3881,7 +3769,6 @@ dependencies = [
 "thiserror",
 "tokio",
 "tokio-stream",
- "tracing",
 ]

 [[package]]
@@ -4044,7 +3931,6 @@ dependencies = [
 "postgres_connection",
 "postgres_ffi",
 "postgres_initdb",
- "pprof",
 "pq_proto",
 "procfs",
 "rand 0.8.5",
@@ -4531,7 +4417,7 @@ dependencies = [
 "bytes",
 "crc32c",
 "criterion",
- "env_logger 0.10.2",
+ "env_logger",
 "log",
 "memoffset 0.9.0",
 "once_cell",
@@ -4572,7 +4458,7 @@ dependencies = [
 "cfg-if",
 "criterion",
 "findshlibs",
- "inferno 0.11.21",
+ "inferno",
 "libc",
 "log",
 "nix 0.26.4",
@@ -4607,9 +4493,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"

 [[package]]
 name = "pq-sys"
-version = "0.6.3"
+version = "0.4.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6cc05d7ea95200187117196eee9edd0644424911821aeb28a18ce60ea0b8793"
+checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd"
 dependencies = [
 "vcpkg",
 ]
@@ -4798,10 +4684,9 @@ dependencies = [
 "clap",
 "compute_api",
 "consumption_metrics",
- "dashmap 5.5.0",
+ "dashmap",
 "ecdsa 0.16.9",
- "ed25519-dalek",
- "env_logger 0.10.2",
+ "env_logger",
 "fallible-iterator",
 "flate2",
 "framed-websockets",
@@ -4872,7 +4757,7 @@ dependencies = [
 "tokio-postgres",
 "tokio-postgres2",
 "tokio-rustls 0.26.0",
- "tokio-tungstenite 0.21.0",
+ "tokio-tungstenite",
 "tokio-util",
 "tracing",
 "tracing-subscriber",
@@ -4908,15 +4793,6 @@ dependencies = [
 "serde",
 ]

-[[package]]
-name = "quick-xml"
-version = "0.37.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f22f29bdff3987b4d8632ef95fd6424ec7e4e0a57e2f4fc63e489e75357f6a03"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "quote"
 version = "1.0.37"
@@ -5301,15 +5177,15 @@ dependencies = [

 [[package]]
 name = "reqwest-tracing"
-version = "0.5.5"
+version = "0.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2"
+checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2"
 dependencies = [
 "anyhow",
 "async-trait",
 "getrandom 0.2.11",
 "http 1.1.0",
- "matchit 0.8.4",
+ "matchit 0.8.2",
 "opentelemetry",
 "reqwest",
 "reqwest-middleware",
@@ -6923,19 +6799,7 @@ dependencies = [
 "futures-util",
 "log",
 "tokio",
- "tungstenite 0.21.0",
-]
-
-[[package]]
-name = "tokio-tungstenite"
-version = "0.24.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edc5f74e248dc973e0dbb7b74c7e0d6fcc301c694ff50049504004ef4d0cdcd9"
-dependencies = [
- "futures-util",
- "log",
- "tokio",
- "tungstenite 0.24.0",
+ "tungstenite",
 ]

 [[package]]
@@ -7016,7 +6880,7 @@ dependencies = [
 "tokio",
 "tokio-rustls 0.26.0",
 "tokio-stream",
- "tower 0.4.13",
+ "tower",
 "tower-layer",
 "tower-service",
 "tracing",
@@ -7056,50 +6920,17 @@ dependencies = [
 "tracing",
 ]

-[[package]]
-name = "tower"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
-dependencies = [
- "futures-core",
- "futures-util",
- "pin-project-lite",
- "sync_wrapper 1.0.1",
- "tokio",
- "tower-layer",
- "tower-service",
- "tracing",
-]
-
-[[package]]
-name = "tower-http"
-version = "0.6.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
-dependencies = [
- "bitflags 2.4.1",
- "bytes",
- "http 1.1.0",
- "http-body 1.0.0",
- "pin-project-lite",
- "tower-layer",
- "tower-service",
- "tracing",
- "uuid",
-]
-
 [[package]]
 name = "tower-layer"
-version = "0.3.3"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0"

 [[package]]
 name = "tower-service"
-version = "0.3.3"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"

 [[package]]
 name = "tracing"
@@ -7168,9 +6999,9 @@ dependencies = [

 [[package]]
 name = "tracing-opentelemetry"
-version = "0.28.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053"
+checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b"
 dependencies = [
 "js-sys",
 "once_cell",
@@ -7254,24 +7085,6 @@ dependencies = [
 "utf-8",
 ]

-[[package]]
-name = "tungstenite"
-version = "0.24.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "18e5b8366ee7a95b16d32197d0b2604b43a0be89dc5fac9f8e96ccafbaedda8a"
-dependencies = [
- "byteorder",
- "bytes",
- "data-encoding",
- "http 1.1.0",
- "httparse",
- "log",
- "rand 0.8.5",
- "sha1",
- "thiserror",
- "utf-8",
-]
-
 [[package]]
 name = "twox-hash"
 version = "1.6.3"
@@ -7439,7 +7252,6 @@ dependencies = [
 "hex-literal",
 "humantime",
 "hyper 0.14.30",
- "inferno 0.12.0",
 "itertools 0.10.5",
 "jemalloc_pprof",
 "jsonwebtoken",
@@ -7543,7 +7355,7 @@ dependencies = [
 "anyhow",
 "camino-tempfile",
 "clap",
- "env_logger 0.10.2",
+ "env_logger",
 "log",
 "postgres",
 "postgres_ffi",
@@ -8054,8 +7866,7 @@ dependencies = [
 "tokio-util",
 "toml_edit",
 "tonic",
- "tower 0.4.13",
- "tower 0.5.2",
+ "tower",
 "tracing",
 "tracing-core",
 "url",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -65,7 +65,7 @@ aws-smithy-types = "1.2"
 aws-credential-types = "1.2.0"
 aws-sigv4 = { version = "1.2", features = ["sign-http"] }
 aws-types = "1.3"
-axum = { version = "0.7.9", features = ["ws"] }
+axum = { version = "0.7.5", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.70"
@@ -110,7 +110,6 @@ hyper-util = "0.1"
 tokio-tungstenite = "0.21.0"
 indexmap = "2"
 indoc = "2"
-inferno = "0.12.0"
 ipnet = "2.10.0"
 itertools = "0.10"
 itoa = "1.0.11"
@@ -127,16 +126,16 @@ notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.27"
-opentelemetry_sdk = "0.27"
-opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.27"
+opentelemetry = "0.26"
+opentelemetry_sdk = "0.26"
+opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.26"
 parking_lot = "0.12"
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
-pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "protobuf", "protobuf-codec"] }
+pprof = { version = "0.14", features = ["criterion", "flamegraph", "protobuf", "protobuf-codec"] }
 procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.13"
@@ -144,7 +143,7 @@ rand = "0.8"
 redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] }
+reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] }
 reqwest-middleware = "0.4"
 reqwest-retry = "0.7"
 routerify = "3"
@@ -188,12 +187,10 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.8"
 toml_edit = "0.22"
 tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
-tower = { version = "0.5.2", default-features = false }
-tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
-tower-service = "0.3.3"
+tower-service = "0.3.2"
 tracing = "0.1"
 tracing-error = "0.2"
-tracing-opentelemetry = "0.28"
+tracing-opentelemetry = "0.27"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
 twox-hash = { version = "1.6.3", default-features = false }
@@ -269,8 +266,6 @@ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", br
 [profile.release]
 # This is useful for profiling and, to some extent, debug.
 # Besides, debug info should not affect the performance.
-#
-# NB: we also enable frame pointers for improved profiling, see .cargo/config.toml.
 debug = true

 # disable debug symbols for all packages except this one to decrease binaries size
--- a/10
+++ b/10
@@ -45,7 +45,7 @@ COPY --chown=nonroot . .

 ARG ADDITIONAL_RUSTFLAGS
 RUN set -e \
-    && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
+    && PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
      --bin pg_sni_router  \
      --bin pageserver  \
      --bin pagectl  \
@@ -69,9 +69,6 @@ RUN set -e \
        libreadline-dev \
        libseccomp-dev \
        ca-certificates \
-	# System postgres for use with client libraries (e.g. in storage controller)
-        postgresql-15 \
-        openssl \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && useradd -d /data neon \
    && chown -R neon:neon /data
@@ -104,6 +101,11 @@ RUN mkdir -p /data/.neon/ && \
  > /data/.neon/pageserver.toml && \
  chown -R neon:neon /data/.neon

+# When running a binary that links with libpq, default to using our most recent postgres version.  Binaries
+# that want a particular postgres version will select it explicitly: this is just a default.
+ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib
+
+
 VOLUME ["/data"]
 USER neon
 EXPOSE 6400
--- a/3
+++ b/3
@@ -3,6 +3,7 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 # Where to install Postgres, default is ./pg_install, maybe useful for package managers
 POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/

+OPENSSL_PREFIX_DIR := /usr/local/openssl
 ICU_PREFIX_DIR := /usr/local/icu

 #
@@ -25,9 +26,11 @@ endif
 ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
 	# Exclude static build openssl, icu for local build (MacOS, Linux)
 	# Only keep for build type release and debug
+	PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
 	PG_CONFIGURE_OPTS += --with-icu
 	PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
 	PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
+	PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
 endif

 UNAME_S := $(shell uname -s)
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -115,7 +115,7 @@ RUN set -e \

 # Keep the version the same as in compute/compute-node.Dockerfile and
 # test_runner/regress/test_compute_metrics.py.
-ENV SQL_EXPORTER_VERSION=0.17.0
+ENV SQL_EXPORTER_VERSION=0.16.0
 RUN curl -fsSL \
    "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
    --output sql_exporter.tar.gz \
@@ -190,6 +190,21 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
    && make install \
    && rm -rf ../lcov.tar.gz

+# Compile and install the static OpenSSL library
+ENV OPENSSL_VERSION=1.1.1w
+ENV OPENSSL_PREFIX=/usr/local/openssl
+RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
+    echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
+    cd /tmp && \
+    tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
+    rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
+    cd /tmp/openssl-${OPENSSL_VERSION} && \
+    ./config --prefix=${OPENSSL_PREFIX}  -static --static no-shared -fPIC && \
+    make -j "$(nproc)" && \
+    make install && \
+    cd /tmp && \
+    rm -rf /tmp/openssl-${OPENSSL_VERSION}
+
 # Use the same version of libicu as the compute nodes so that
 # clusters created using initdb on pageserver can be used by computes.
 #
@@ -243,7 +258,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.84.0
+ENV RUSTC_VERSION=1.83.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -104,18 +104,16 @@ RUN cd postgres && \
        esac; \
    done;

-# Set PATH for all the subsequent build steps
-ENV PATH="/usr/local/pgsql/bin:$PATH"
-
 #########################################################################################
 #
 # Layer "postgis-build"
 # Build PostGIS from the upstream PostGIS mirror.
 #
 #########################################################################################
-FROM pg-build AS postgis-build
+FROM build-deps AS postgis-build
 ARG DEBIAN_VERSION
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
    apt install --no-install-recommends --no-install-suggests -y \
    gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
@@ -153,6 +151,8 @@ RUN case "${DEBIAN_VERSION}" in \
    DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \
    ninja clean && cp -R /sfcgal/* /

+ENV PATH="/usr/local/pgsql/bin:$PATH"
+
 # Postgis 3.5.0 supports v17
 RUN case "${PG_VERSION}" in \
    "v17") \
@@ -170,6 +170,7 @@ RUN case "${PG_VERSION}" in \
    wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \
    echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \
    mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
    ./autogen.sh && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -219,7 +220,11 @@ RUN case "${PG_VERSION}" in \
    cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \
    ninja -j $(getconf _NPROCESSORS_ONLN) && \
    ninja -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
+    cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
+    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
+    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -

 #########################################################################################
 #
@@ -227,8 +232,9 @@ RUN case "${PG_VERSION}" in \
 # Build plv8
 #
 #########################################################################################
-FROM pg-build AS plv8-build
+FROM build-deps AS plv8-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch

@@ -263,6 +269,7 @@ RUN case "${PG_VERSION}" in \
    # generate and copy upgrade scripts
    mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \
    cp upgrade/* /usr/local/pgsql/share/extension/ && \
+    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
    rm -rf /plv8-* && \
    find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
@@ -289,8 +296,9 @@ RUN case "${PG_VERSION}" in \
 # Build h3_pg
 #
 #########################################################################################
-FROM pg-build AS h3-pg-build
+FROM build-deps AS h3-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # not version-specific
 # last release v4.1.0 - Jan 18, 2023
@@ -311,6 +319,7 @@ RUN mkdir -p /h3/usr/ && \
 RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
    echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
    mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
+    export PATH="/usr/local/pgsql/bin:$PATH" && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
@@ -322,16 +331,17 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3
 # compile unit extension
 #
 #########################################################################################
-FROM pg-build AS unit-pg-build
+FROM build-deps AS unit-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # not version-specific
 # last release 7.9 - Sep 15, 2024
 RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \
    echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \
    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    # unit extension's "create extension" script relies on absolute install path to fill some reference tables.
    # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is built. So we need to adjust the path.
    # This one-liner removes pgsql/ part of the path.
@@ -345,8 +355,9 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -
 # compile pgvector extension
 #
 #########################################################################################
-FROM pg-build AS vector-pg-build
+FROM build-deps AS vector-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 COPY compute/patches/pgvector.patch /pgvector.patch

@@ -360,8 +371,8 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
    echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
    patch -p1 < /pgvector.patch && \
-    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \
-    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \
+    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control

 #########################################################################################
@@ -370,15 +381,16 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
 # compile pgjwt extension
 #
 #########################################################################################
-FROM pg-build AS pgjwt-pg-build
+FROM build-deps AS pgjwt-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # not version-specific
 # doesn't use releases, last commit f3d82fd - Mar 2, 2023
 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \
    echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \
    mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control

 #########################################################################################
@@ -387,16 +399,17 @@ RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71
 # compile hypopg extension
 #
 #########################################################################################
-FROM pg-build AS hypopg-pg-build
+FROM build-deps AS hypopg-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # HypoPG 1.4.1 supports v17
 # last release 1.4.1 - Apr 28, 2024
 RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \
    echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \
    mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control

 #########################################################################################
@@ -405,16 +418,17 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypo
 # compile pg_hashids extension
 #
 #########################################################################################
-FROM pg-build AS pg-hashids-pg-build
+FROM build-deps AS pg-hashids-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # not version-specific
 # last release v1.2.1 - Jan 12, 2018
 RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
    echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
    mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
-    make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control

 #########################################################################################
@@ -423,8 +437,9 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
 # compile rum extension
 #
 #########################################################################################
-FROM pg-build AS rum-pg-build
+FROM build-deps AS rum-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 COPY compute/patches/rum.patch /rum.patch

@@ -435,8 +450,8 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
    echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \
    mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
    patch -p1 < /rum.patch && \
-    make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
-    make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control

 #########################################################################################
@@ -445,16 +460,17 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
 # compile pgTAP extension
 #
 #########################################################################################
-FROM pg-build AS pgtap-pg-build
+FROM build-deps AS pgtap-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # pgtap 1.3.3 supports v17
 # last release v1.3.3 - Apr 8, 2024
 RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \
    echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \
    mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control

 #########################################################################################
@@ -463,16 +479,17 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgta
 # compile ip4r extension
 #
 #########################################################################################
-FROM pg-build AS ip4r-pg-build
+FROM build-deps AS ip4r-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # not version-specific
 # last release v2.4.2 - Jul 29, 2023
 RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
    echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
    mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control

 #########################################################################################
@@ -481,16 +498,17 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i
 # compile Prefix extension
 #
 #########################################################################################
-FROM pg-build AS prefix-pg-build
+FROM build-deps AS prefix-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # not version-specific
 # last release v1.2.10  - Jul 5, 2023
 RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
    echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
    mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control

 #########################################################################################
@@ -499,16 +517,17 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p
 # compile hll extension
 #
 #########################################################################################
-FROM pg-build AS hll-pg-build
+FROM build-deps AS hll-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # not version-specific
 # last release v2.18 - Aug 29, 2023
 RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
    echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
    mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control

 #########################################################################################
@@ -517,16 +536,17 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
 # compile plpgsql_check extension
 #
 #########################################################################################
-FROM pg-build AS plpgsql-check-pg-build
+FROM build-deps AS plpgsql-check-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # plpgsql_check v2.7.11 supports v17
 # last release v2.7.11 - Sep 16, 2024
 RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \
    echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
-    make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control

 #########################################################################################
@@ -535,8 +555,11 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz
 # compile timescaledb extension
 #
 #########################################################################################
-FROM pg-build AS timescaledb-pg-build
+FROM build-deps AS timescaledb-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
 ARG PG_VERSION
+ENV PATH="/usr/local/pgsql/bin:$PATH"

 RUN case "${PG_VERSION}" in \
      "v14" | "v15") \
@@ -567,8 +590,11 @@ RUN case "${PG_VERSION}" in \
 # compile pg_hint_plan extension
 #
 #########################################################################################
-FROM pg-build AS pg-hint-plan-pg-build
+FROM build-deps AS pg-hint-plan-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
 ARG PG_VERSION
+ENV PATH="/usr/local/pgsql/bin:$PATH"

 # version-specific, has separate releases for each version
 RUN case "${PG_VERSION}" in \
@@ -606,12 +632,14 @@ RUN case "${PG_VERSION}" in \
 # compile pg_cron extension
 #
 #########################################################################################
-FROM pg-build AS pg-cron-pg-build
+FROM build-deps AS pg-cron-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # This is an experimental extension that we do not support on prod yet.
 # !Do not remove!
 # We set it in shared_preload_libraries and computes will fail to start if library is not found.
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
    echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
    mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
@@ -625,8 +653,9 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O
 # compile rdkit extension
 #
 #########################################################################################
-FROM pg-build AS rdkit-pg-build
+FROM build-deps AS rdkit-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN apt update && \
    apt install --no-install-recommends --no-install-suggests -y \
@@ -644,13 +673,7 @@ RUN apt update && \
 # Use new version only for v17
 # because Release_2024_09_1 has some backward incompatible changes
 # https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
-
-# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find
-# pg_config. For some reason the rdkit cmake script doesn't work with just that,
-# however. By also adding /usr/local/pgsql, it works, which is weird because there
-# are no executables in that directory.
-ENV PATH="/usr/local/pgsql:$PATH"
-
+ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
 RUN case "${PG_VERSION}" in \
    "v17") \
        export RDKIT_VERSION=Release_2024_09_1 \
@@ -703,11 +726,13 @@ RUN case "${PG_VERSION}" in \
 # compile pg_uuidv7 extension
 #
 #########################################################################################
-FROM pg-build AS pg-uuidv7-pg-build
+FROM build-deps AS pg-uuidv7-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # not version-specific
 # last release v1.6.0 - Oct 9, 2024
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \
    echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \
    mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
@@ -721,11 +746,13 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz
 # compile pg_roaringbitmap extension
 #
 #########################################################################################
-FROM pg-build AS pg-roaringbitmap-pg-build
+FROM build-deps AS pg-roaringbitmap-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # not version-specific
 # last release v0.5.4 - Jun 28, 2022
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
    echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
    mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
@@ -739,14 +766,16 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
 # compile pg_semver extension
 #
 #########################################################################################
-FROM pg-build AS pg-semver-pg-build
+FROM build-deps AS pg-semver-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # Release 0.40.0 breaks backward compatibility with previous versions
 # see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0
 # Use new version only for v17
 #
 # last release v0.40.0 - Jul 22, 2024
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in \
    "v17") \
        export SEMVER_VERSION=0.40.0 \
@@ -773,11 +802,13 @@ RUN case "${PG_VERSION}" in \
 # compile pg_embedding extension
 #
 #########################################################################################
-FROM pg-build AS pg-embedding-pg-build
+FROM build-deps AS pg-embedding-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # This is our extension, support stopped in favor of pgvector
 # TODO: deprecate it
 ARG PG_VERSION
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in \
      "v14" | "v15") \
        export PG_EMBEDDING_VERSION=0.3.5 \
@@ -798,19 +829,26 @@ RUN case "${PG_VERSION}" in \
 # compile anon extension
 #
 #########################################################################################
-FROM pg-build AS pg-anon-pg-build
+FROM build-deps AS pg-anon-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # This is an experimental extension, never got to real production.
 # !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN case "${PG_VERSION}" in "v17") \
    echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
    esac && \
    wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
+    find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
+    mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
+    sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
+    comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -

 #########################################################################################
 #
@@ -818,8 +856,9 @@ RUN case "${PG_VERSION}" in "v17") \
 # This layer is used to build `pgrx` deps
 #
 #########################################################################################
-FROM pg-build AS rust-extensions-build
+FROM build-deps AS rust-extensions-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN apt update && \
    apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
@@ -827,7 +866,7 @@ RUN apt update && \
    useradd -ms /bin/bash nonroot -b /home

 ENV HOME=/home/nonroot
-ENV PATH="/home/nonroot/.cargo/bin:$PATH"
+ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
 USER nonroot
 WORKDIR /home/nonroot

@@ -854,8 +893,9 @@ USER root
 # and eventually get merged with `rust-extensions-build`
 #
 #########################################################################################
-FROM pg-build AS rust-extensions-build-pgrx12
+FROM build-deps AS rust-extensions-build-pgrx12
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 RUN apt update && \
    apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
@@ -863,7 +903,7 @@ RUN apt update && \
    useradd -ms /bin/bash nonroot -b /home

 ENV HOME=/home/nonroot
-ENV PATH="/home/nonroot/.cargo/bin:$PATH"
+ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
 USER nonroot
 WORKDIR /home/nonroot

@@ -936,9 +976,22 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p

 FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build
 ARG PG_VERSION
+# version 0.3.3 supports v17
 # last release v0.3.3 - Oct 16, 2024
-RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \
-    echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \
+#
+# there were no breaking changes
+# so we can use the same version for all postgres versions
+RUN case "${PG_VERSION}" in \
+    "v14" | "v15" | "v16" | "v17") \
+        export PG_JSONSCHEMA_VERSION=0.3.3 \
+        export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \
+    ;; \
+    *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+    ;; \
+    esac && \
+    wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \
+    echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \
    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
    # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
    # `unsafe-postgres` feature allows to build pgx extensions
@@ -959,9 +1012,22 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.
 FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build
 ARG PG_VERSION

+# version 1.5.9 supports v17
 # last release v1.5.9 - Oct 16, 2024
-RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \
-    echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \
+#
+# there were no breaking changes
+# so we can use the same version for all postgres versions
+RUN case "${PG_VERSION}" in \
+    "v14" | "v15" | "v16" | "v17") \
+        export PG_GRAPHQL_VERSION=1.5.9 \
+        export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \
+    ;; \
+    *) \
+        echo "unexpected PostgreSQL version" && exit 1 \
+    ;; \
+    esac && \
+    wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \
+    echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \
    mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release && \
@@ -1025,8 +1091,8 @@ ARG PG_VERSION
 # NOTE: local_proxy depends on the version of pg_session_jwt
 # Do not update without approve from proxy team
 # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
-RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \
    mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgrx install --release
@@ -1038,11 +1104,13 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0
 #
 #########################################################################################

-FROM pg-build AS wal2json-pg-build
+FROM build-deps AS wal2json-pg-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # wal2json wal2json_2_6 supports v17
 # last release wal2json_2_6 - Apr 25, 2024
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \
    echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \
    mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \
@@ -1055,11 +1123,13 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.
 # compile pg_ivm extension
 #
 #########################################################################################
-FROM pg-build AS pg-ivm-build
+FROM build-deps AS pg-ivm-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # pg_ivm v1.9 supports v17
 # last release v1.9 - Jul 31
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \
    echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \
    mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
@@ -1073,11 +1143,13 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv
 # compile pg_partman extension
 #
 #########################################################################################
-FROM pg-build AS pg-partman-build
+FROM build-deps AS pg-partman-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 # should support v17 https://github.com/pgpartman/pg_partman/discussions/693
 # last release 5.1.0  Apr 2, 2024
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
 RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \
    echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \
    mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
@@ -1093,12 +1165,24 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
 #########################################################################################
 FROM rust-extensions-build AS pg-mooncake-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \
-    echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \
-    mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
-    make release -j $(getconf _NPROCESSORS_ONLN) && \
-    make install -j $(getconf _NPROCESSORS_ONLN) && \
+# The topmost commit in the `neon` branch at the time of writing this
+# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
+# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af
+ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af
+ENV PATH="/usr/local/pgsql/bin/:$PATH"
+
+RUN case "${PG_VERSION}" in \
+        'v14') \
+            echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
+    esac && \
+    git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \
+    cd pg_mooncake-src && \
+    git checkout "${PG_MOONCAKE_VERSION}" && \
+    git submodule update --init --depth 1 --recursive && \
+    make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
+    make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control

 #########################################################################################
@@ -1108,8 +1192,11 @@ RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/p
 #
 #########################################################################################

-FROM pg-build AS pg-repack-build
+FROM build-deps AS pg-repack-build
 ARG PG_VERSION
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH="/usr/local/pgsql/bin/:$PATH"

 RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \
    echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \
@@ -1180,11 +1267,25 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
    make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon_rmgr \
+        -s install && \
+    case "${PG_VERSION}" in \
+        "v14" | "v15") \
+        ;; \
+        "v16" | "v17") \
+            echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \
+        ;; \
+        *) \
+            echo "unexpected PostgreSQL version" && exit 1 \
+        ;; \
+        esac && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/hnsw \
        -s install

 #########################################################################################
 #
-# Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries
+# Compile the Neon-specific `compute_ctl` and `fast_import` binaries
 #
 #########################################################################################
 FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
@@ -1194,7 +1295,18 @@ ENV BUILD_TAG=$BUILD_TAG
 USER nonroot
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
-RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy
+RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto
+
+#########################################################################################
+#
+# Final compute-tools image
+#
+#########################################################################################
+
+FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
+
+COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
+COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import

 #########################################################################################
 #
@@ -1226,17 +1338,31 @@ RUN set -e \
    && make -j $(nproc) dist_man_MANS= \
    && make install dist_man_MANS=

+#########################################################################################
+#
+# Compile the Neon-specific `local_proxy` binary
+#
+#########################################################################################
+FROM $REPOSITORY/$IMAGE:$TAG AS local_proxy
+ARG BUILD_TAG
+ENV BUILD_TAG=$BUILD_TAG
+
+USER nonroot
+# Copy entire project to get Cargo.* files with proper dependencies for the whole project
+COPY --chown=nonroot . .
+RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin local_proxy
+
 #########################################################################################
 #
 # Layers "postgres-exporter" and "sql-exporter"
 #
 #########################################################################################

-FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter
+FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter

 # Keep the version the same as in build-tools.Dockerfile and
 # test_runner/regress/test_compute_metrics.py.
-FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter
+FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter

 #########################################################################################
 #
@@ -1365,7 +1491,7 @@ COPY --from=pgbouncer         /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/
 COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini

 # local_proxy and its config
-COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy
+COPY --from=local_proxy --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy
 RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy

 # Metrics exporter binaries and  configuration files
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -7,7 +7,7 @@ license.workspace = true
 [features]
 default = []
 # Enables test specific features.
-testing = ["fail/failpoints"]
+testing = []

 [dependencies]
 base64.workspace = true
@@ -15,15 +15,13 @@ aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-sdk-kms.workspace = true
 anyhow.workspace = true
-axum = { workspace = true, features = [] }
 camino.workspace = true
 chrono.workspace = true
 cfg-if.workspace = true
 clap.workspace = true
-fail.workspace = true
 flate2.workspace = true
 futures.workspace = true
-http.workspace = true
+hyper0 = { workspace = true, features = ["full"] }
 metrics.workspace = true
 nix.workspace = true
 notify.workspace = true
@@ -38,8 +36,6 @@ serde_with.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 tar.workspace = true
-tower.workspace = true
-tower-http.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -60,22 +60,19 @@ use compute_tools::compute::{
 };
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version_string;
-use compute_tools::http::launch_http_server;
+use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
 use compute_tools::swap::resize_swap;
 use rlimit::{setrlimit, Resource};
-use utils::failpoint_support;

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
 const BUILD_TAG_DEFAULT: &str = "latest";

 fn main() -> Result<()> {
-    let scenario = failpoint_support::init();
-
    let (build_tag, clap_args) = init()?;

    // enable core dumping for all child processes
@@ -103,14 +100,17 @@ fn main() -> Result<()> {

    maybe_delay_exit(delay_exit);

-    scenario.teardown();
-
    deinit_and_exit(wait_pg_result);
 }

 fn init() -> Result<(String, clap::ArgMatches)> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

+    opentelemetry::global::set_error_handler(|err| {
+        tracing::info!("OpenTelemetry error: {err}");
+    })
+    .expect("global error handler lock poisoned");
+
    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
    thread::spawn(move || {
        for sig in signals.forever() {
@@ -419,14 +419,9 @@ fn start_postgres(
        "running compute with features: {:?}",
        state.pspec.as_ref().unwrap().spec.features
    );
-    // before we release the mutex, fetch some parameters for later.
-    let &ComputeSpec {
-        swap_size_bytes,
-        disk_quota_bytes,
-        #[cfg(target_os = "linux")]
-        disable_lfc_resizing,
-        ..
-    } = &state.pspec.as_ref().unwrap().spec;
+    // before we release the mutex, fetch the swap size and disk quota (if any) for later.
+    let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
+    let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes;
    drop(state);

    // Launch remaining service threads
@@ -488,10 +483,7 @@ fn start_postgres(
    let mut pg = None;
    if !prestartup_failed {
        pg = match compute.start_compute() {
-            Ok(pg) => {
-                info!(postmaster_pid = %pg.0.id(), "Postgres was started");
-                Some(pg)
-            }
+            Ok(pg) => Some(pg),
            Err(err) => {
                error!("could not start the compute node: {:#}", err);
                compute.set_failed_status(err);
@@ -534,18 +526,11 @@ fn start_postgres(
            // This token is used internally by the monitor to clean up all threads
            let token = CancellationToken::new();

-            // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC
-            let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
-                None
-            } else {
-                file_cache_connstr.cloned()
-            };
-
            let vm_monitor = rt.as_ref().map(|rt| {
                rt.spawn(vm_monitor::start(
                    Box::leak(Box::new(vm_monitor::Args {
                        cgroup: cgroup.cloned(),
-                        pgconnstr,
+                        pgconnstr: file_cache_connstr.cloned(),
                        addr: vm_monitor_addr.clone(),
                    })),
                    token.clone(),
@@ -589,8 +574,6 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
    // propagate to Postgres and it will be shut down as well.
    let mut exit_code = None;
    if let Some((mut pg, logs_handle)) = pg {
-        info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
-
        let ecode = pg
            .wait()
            .expect("failed to start waiting on Postgres process");
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -17,7 +17,7 @@
 //!
 //! # Local Testing
 //!
-//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build.
+//! - Comment out most of the pgxns in Dockerfile.compute-tools to speed up the build.
 //! - Build the image with the following command:
 //!
 //! ```bash
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -36,11 +36,11 @@ pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<Cat

 #[derive(Debug, thiserror::Error)]
 pub enum SchemaDumpError {
-    #[error("database does not exist")]
+    #[error("Database does not exist.")]
    DatabaseDoesNotExist,
-    #[error("failed to execute pg_dump")]
+    #[error("Failed to execute pg_dump.")]
    IO(#[from] std::io::Error),
-    #[error("unexpected I/O error")]
+    #[error("Unexpected error.")]
    Unexpected,
 }

--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -15,7 +15,7 @@ use std::time::Instant;

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
-use compute_api::spec::{Database, PgIdent, Role};
+use compute_api::spec::{PgIdent, Role};
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
@@ -45,10 +45,8 @@ use crate::spec_apply::ApplySpecPhase::{
    DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions,
    RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
 };
-use crate::spec_apply::PerDatabasePhase;
 use crate::spec_apply::PerDatabasePhase::{
-    ChangeSchemaPerms, DeleteDBRoleReferences, DropSubscriptionsForDeletedDatabases,
-    HandleAnonExtension,
+    ChangeSchemaPerms, DeleteDBRoleReferences, HandleAnonExtension,
 };
 use crate::spec_apply::{apply_operations, MutableApplyContext, DB};
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -836,7 +834,7 @@ impl ComputeNode {
        conf
    }

-    pub async fn get_maintenance_client(
+    async fn get_maintenance_client(
        conf: &tokio_postgres::Config,
    ) -> Result<tokio_postgres::Client> {
        let mut conf = conf.clone();
@@ -945,78 +943,6 @@ impl ComputeNode {
                dbs: databases,
            }));

-            // Apply special pre drop database phase.
-            // NOTE: we use the code of RunInEachDatabase phase for parallelism
-            // and connection management, but we don't really run it in *each* database,
-            // only in databases, we're about to drop.
-            info!("Applying PerDatabase (pre-dropdb) phase");
-            let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
-
-            // Run the phase for each database that we're about to drop.
-            let db_processes = spec
-                .delta_operations
-                .iter()
-                .flatten()
-                .filter_map(move |op| {
-                    if op.action.as_str() == "delete_db" {
-                        Some(op.name.clone())
-                    } else {
-                        None
-                    }
-                })
-                .map(|dbname| {
-                    let spec = spec.clone();
-                    let ctx = ctx.clone();
-                    let jwks_roles = jwks_roles.clone();
-                    let mut conf = conf.as_ref().clone();
-                    let concurrency_token = concurrency_token.clone();
-                    // We only need dbname field for this phase, so set other fields to dummy values
-                    let db = DB::UserDB(Database {
-                        name: dbname.clone(),
-                        owner: "cloud_admin".to_string(),
-                        options: None,
-                        restrict_conn: false,
-                        invalid: false,
-                    });
-
-                    debug!("Applying per-database phases for Database {:?}", &db);
-
-                    match &db {
-                        DB::SystemDB => {}
-                        DB::UserDB(db) => {
-                            conf.dbname(db.name.as_str());
-                        }
-                    }
-
-                    let conf = Arc::new(conf);
-                    let fut = Self::apply_spec_sql_db(
-                        spec.clone(),
-                        conf,
-                        ctx.clone(),
-                        jwks_roles.clone(),
-                        concurrency_token.clone(),
-                        db,
-                        [DropSubscriptionsForDeletedDatabases].to_vec(),
-                    );
-
-                    Ok(spawn(fut))
-                })
-                .collect::<Vec<Result<_, anyhow::Error>>>();
-
-            for process in db_processes.into_iter() {
-                let handle = process?;
-                if let Err(e) = handle.await? {
-                    // Handle the error case where the database does not exist
-                    // We do not check whether the DB exists or not in the deletion phase,
-                    // so we shouldn't be strict about it in pre-deletion cleanup as well.
-                    if e.to_string().contains("does not exist") {
-                        warn!("Error dropping subscription: {}", e);
-                    } else {
-                        return Err(e);
-                    }
-                };
-            }
-
            for phase in [
                CreateSuperUser,
                DropInvalidDatabases,
@@ -1036,7 +962,7 @@ impl ComputeNode {
                .await?;
            }

-            info!("Applying RunInEachDatabase2 phase");
+            info!("Applying RunInEachDatabase phase");
            let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));

            let db_processes = spec
@@ -1071,12 +997,6 @@ impl ComputeNode {
                        jwks_roles.clone(),
                        concurrency_token.clone(),
                        db,
-                        [
-                            DeleteDBRoleReferences,
-                            ChangeSchemaPerms,
-                            HandleAnonExtension,
-                        ]
-                        .to_vec(),
                    );

                    Ok(spawn(fut))
@@ -1123,13 +1043,16 @@ impl ComputeNode {
        jwks_roles: Arc<HashSet<String>>,
        concurrency_token: Arc<tokio::sync::Semaphore>,
        db: DB,
-        subphases: Vec<PerDatabasePhase>,
    ) -> Result<()> {
        let _permit = concurrency_token.acquire().await?;

        let mut client_conn = None;

-        for subphase in subphases {
+        for subphase in [
+            DeleteDBRoleReferences,
+            ChangeSchemaPerms,
+            HandleAnonExtension,
+        ] {
            apply_operations(
                spec.clone(),
                ctx.clone(),
@@ -1258,19 +1181,8 @@ impl ComputeNode {
            let mut conf = postgres::config::Config::from(conf);
            conf.application_name("compute_ctl:migrations");

-            match conf.connect(NoTls) {
-                Ok(mut client) => {
-                    if let Err(e) = handle_migrations(&mut client) {
-                        error!("Failed to run migrations: {}", e);
-                    }
-                }
-                Err(e) => {
-                    error!(
-                        "Failed to connect to the compute for running migrations: {}",
-                        e
-                    );
-                }
-            };
+            let mut client = conf.connect(NoTls)?;
+            handle_migrations(&mut client).context("apply_config handle_migrations")
        });

        Ok::<(), anyhow::Error>(())
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -0,0 +1,591 @@
+use std::convert::Infallible;
+use std::net::IpAddr;
+use std::net::Ipv6Addr;
+use std::net::SocketAddr;
+use std::sync::Arc;
+use std::thread;
+
+use crate::catalog::SchemaDumpError;
+use crate::catalog::{get_database_schema, get_dbs_and_roles};
+use crate::compute::forward_termination_signal;
+use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
+use crate::installed_extensions;
+use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
+use compute_api::responses::{
+    ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
+    SetRoleGrantsResponse,
+};
+
+use anyhow::Result;
+use hyper::header::CONTENT_TYPE;
+use hyper::service::{make_service_fn, service_fn};
+use hyper::{Body, Method, Request, Response, Server, StatusCode};
+use metrics::proto::MetricFamily;
+use metrics::Encoder;
+use metrics::TextEncoder;
+use tokio::task;
+use tracing::{debug, error, info, warn};
+use tracing_utils::http::OtelName;
+use utils::http::request::must_get_query_param;
+
+fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
+    ComputeStatusResponse {
+        start_time: state.start_time,
+        tenant: state
+            .pspec
+            .as_ref()
+            .map(|pspec| pspec.tenant_id.to_string()),
+        timeline: state
+            .pspec
+            .as_ref()
+            .map(|pspec| pspec.timeline_id.to_string()),
+        status: state.status,
+        last_active: state.last_active,
+        error: state.error.clone(),
+    }
+}
+
+// Service function to handle all available routes.
+async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
+    //
+    // NOTE: The URI path is currently included in traces. That's OK because
+    // it doesn't contain any variable parts or sensitive information. But
+    // please keep that in mind if you change the routing here.
+    //
+    match (req.method(), req.uri().path()) {
+        // Serialized compute state.
+        (&Method::GET, "/status") => {
+            debug!("serving /status GET request");
+            let state = compute.state.lock().unwrap();
+            let status_response = status_response_from_state(&state);
+            Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
+        }
+
+        // Startup metrics in JSON format. The Prometheus-format metrics are
+        // served separately by the /metrics route.
+        (&Method::GET, "/metrics.json") => {
+            info!("serving /metrics.json GET request");
+            let metrics = compute.state.lock().unwrap().metrics.clone();
+            Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
+        }
+
+        // Prometheus metrics
+        (&Method::GET, "/metrics") => {
+            debug!("serving /metrics GET request");
+
+            // When we call TextEncoder::encode() below, it will immediately
+            // return an error if a metric family has no metrics, so we need to
+            // preemptively filter out metric families with no metrics.
+            let metrics = installed_extensions::collect()
+                .into_iter()
+                .filter(|m| !m.get_metric().is_empty())
+                .collect::<Vec<MetricFamily>>();
+
+            let encoder = TextEncoder::new();
+            let mut buffer = vec![];
+
+            if let Err(err) = encoder.encode(&metrics, &mut buffer) {
+                let msg = format!("error handling /metrics request: {err}");
+                error!(msg);
+                return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR);
+            }
+
+            match Response::builder()
+                .status(StatusCode::OK)
+                .header(CONTENT_TYPE, encoder.format_type())
+                .body(Body::from(buffer))
+            {
+                Ok(response) => response,
+                Err(err) => {
+                    let msg = format!("error handling /metrics request: {err}");
+                    error!(msg);
+                    render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
+        // Collect Postgres current usage insights
+        (&Method::GET, "/insights") => {
+            info!("serving /insights GET request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!("compute is not running, current status: {:?}", status);
+                error!(msg);
+                return Response::new(Body::from(msg));
+            }
+
+            let insights = compute.collect_insights().await;
+            Response::new(Body::from(insights))
+        }
+
+        (&Method::POST, "/check_writability") => {
+            info!("serving /check_writability POST request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!(
+                    "invalid compute status for check_writability request: {:?}",
+                    status
+                );
+                error!(msg);
+                return Response::new(Body::from(msg));
+            }
+
+            let res = crate::checker::check_writability(compute).await;
+            match res {
+                Ok(_) => Response::new(Body::from("true")),
+                Err(e) => {
+                    error!("check_writability failed: {}", e);
+                    Response::new(Body::from(e.to_string()))
+                }
+            }
+        }
+
+        (&Method::POST, "/extensions") => {
+            info!("serving /extensions POST request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!(
+                    "invalid compute status for extensions request: {:?}",
+                    status
+                );
+                error!(msg);
+                return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
+            }
+
+            let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
+            let request = serde_json::from_slice::<ExtensionInstallRequest>(&request).unwrap();
+            let res = compute
+                .install_extension(&request.extension, &request.database, request.version)
+                .await;
+            match res {
+                Ok(version) => render_json(Body::from(
+                    serde_json::to_string(&ExtensionInstallResult {
+                        extension: request.extension,
+                        version,
+                    })
+                    .unwrap(),
+                )),
+                Err(e) => {
+                    error!("install_extension failed: {}", e);
+                    render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
+
+        (&Method::GET, "/info") => {
+            let num_cpus = num_cpus::get_physical();
+            info!("serving /info GET request. num_cpus: {}", num_cpus);
+            Response::new(Body::from(
+                serde_json::json!({
+                    "num_cpus": num_cpus,
+                })
+                .to_string(),
+            ))
+        }
+
+        // Accept spec in JSON format and request compute configuration. If
+        // anything goes wrong after we set the compute status to `ConfigurationPending`
+        // and update compute state with new spec, we basically leave compute
+        // in the potentially wrong state. That said, it's control-plane's
+        // responsibility to watch compute state after reconfiguration request
+        // and to clean restart in case of errors.
+        (&Method::POST, "/configure") => {
+            info!("serving /configure POST request");
+            match handle_configure_request(req, compute).await {
+                Ok(msg) => Response::new(Body::from(msg)),
+                Err((msg, code)) => {
+                    error!("error handling /configure request: {msg}");
+                    render_json_error(&msg, code)
+                }
+            }
+        }
+
+        (&Method::POST, "/terminate") => {
+            info!("serving /terminate POST request");
+            match handle_terminate_request(compute).await {
+                Ok(()) => Response::new(Body::empty()),
+                Err((msg, code)) => {
+                    error!("error handling /terminate request: {msg}");
+                    render_json_error(&msg, code)
+                }
+            }
+        }
+
+        (&Method::GET, "/dbs_and_roles") => {
+            info!("serving /dbs_and_roles GET request",);
+            match get_dbs_and_roles(compute).await {
+                Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
+                Err(_) => {
+                    render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
+
+        (&Method::GET, "/database_schema") => {
+            let database = match must_get_query_param(&req, "database") {
+                Err(e) => return e.into_response(),
+                Ok(database) => database,
+            };
+            info!("serving /database_schema GET request with database: {database}",);
+            match get_database_schema(compute, &database).await {
+                Ok(res) => render_plain(Body::wrap_stream(res)),
+                Err(SchemaDumpError::DatabaseDoesNotExist) => {
+                    render_json_error("database does not exist", StatusCode::NOT_FOUND)
+                }
+                Err(e) => {
+                    error!("can't get schema dump: {}", e);
+                    render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
+                }
+            }
+        }
+
+        (&Method::POST, "/grants") => {
+            info!("serving /grants POST request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!(
+                    "invalid compute status for set_role_grants request: {:?}",
+                    status
+                );
+                error!(msg);
+                return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
+            }
+
+            let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
+            let request = serde_json::from_slice::<SetRoleGrantsRequest>(&request).unwrap();
+
+            let res = compute
+                .set_role_grants(
+                    &request.database,
+                    &request.schema,
+                    &request.privileges,
+                    &request.role,
+                )
+                .await;
+            match res {
+                Ok(()) => render_json(Body::from(
+                    serde_json::to_string(&SetRoleGrantsResponse {
+                        database: request.database,
+                        schema: request.schema,
+                        role: request.role,
+                        privileges: request.privileges,
+                    })
+                    .unwrap(),
+                )),
+                Err(e) => render_json_error(
+                    &format!("could not grant role privileges to the schema: {e}"),
+                    // TODO: can we filter on role/schema not found errors
+                    // and return appropriate error code?
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                ),
+            }
+        }
+
+        // get the list of installed extensions
+        // currently only used in python tests
+        // TODO: call it from cplane
+        (&Method::GET, "/installed_extensions") => {
+            info!("serving /installed_extensions GET request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!(
+                    "invalid compute status for extensions request: {:?}",
+                    status
+                );
+                error!(msg);
+                return Response::new(Body::from(msg));
+            }
+
+            let conf = compute.get_conn_conf(None);
+            let res =
+                task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
+                    .await
+                    .unwrap();
+
+            match res {
+                Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
+                Err(e) => render_json_error(
+                    &format!("could not get list of installed extensions: {}", e),
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                ),
+            }
+        }
+
+        // download extension files from remote extension storage on demand
+        (&Method::POST, route) if route.starts_with("/extension_server/") => {
+            info!("serving {:?} POST request", route);
+            info!("req.uri {:?}", req.uri());
+
+            // don't even try to download extensions
+            // if no remote storage is configured
+            if compute.ext_remote_storage.is_none() {
+                info!("no extensions remote storage configured");
+                let mut resp = Response::new(Body::from("no remote storage configured"));
+                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                return resp;
+            }
+
+            let mut is_library = false;
+            if let Some(params) = req.uri().query() {
+                info!("serving {:?} POST request with params: {}", route, params);
+                if params == "is_library=true" {
+                    is_library = true;
+                } else {
+                    let mut resp = Response::new(Body::from("Wrong request parameters"));
+                    *resp.status_mut() = StatusCode::BAD_REQUEST;
+                    return resp;
+                }
+            }
+            let filename = route.split('/').last().unwrap().to_string();
+            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
+
+            // get ext_name and path from spec
+            // don't lock compute_state for too long
+            let ext = {
+                let compute_state = compute.state.lock().unwrap();
+                let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+                let spec = &pspec.spec;
+
+                // debug only
+                info!("spec: {:?}", spec);
+
+                let remote_extensions = match spec.remote_extensions.as_ref() {
+                    Some(r) => r,
+                    None => {
+                        info!("no remote extensions spec was provided");
+                        let mut resp = Response::new(Body::from("no remote storage configured"));
+                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                        return resp;
+                    }
+                };
+
+                remote_extensions.get_ext(
+                    &filename,
+                    is_library,
+                    &compute.build_tag,
+                    &compute.pgversion,
+                )
+            };
+
+            match ext {
+                Ok((ext_name, ext_path)) => {
+                    match compute.download_extension(ext_name, ext_path).await {
+                        Ok(_) => Response::new(Body::from("OK")),
+                        Err(e) => {
+                            error!("extension download failed: {}", e);
+                            let mut resp = Response::new(Body::from(e.to_string()));
+                            *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                            resp
+                        }
+                    }
+                }
+                Err(e) => {
+                    warn!("extension download failed to find extension: {}", e);
+                    let mut resp = Response::new(Body::from("failed to find file"));
+                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                    resp
+                }
+            }
+        }
+
+        // Return the `404 Not Found` for any other routes.
+        _ => {
+            let mut not_found = Response::new(Body::from("404 Not Found"));
+            *not_found.status_mut() = StatusCode::NOT_FOUND;
+            not_found
+        }
+    }
+}
+
+async fn handle_configure_request(
+    req: Request<Body>,
+    compute: &Arc<ComputeNode>,
+) -> Result<String, (String, StatusCode)> {
+    if !compute.live_config_allowed {
+        return Err((
+            "live configuration is not allowed for this compute node".to_string(),
+            StatusCode::PRECONDITION_FAILED,
+        ));
+    }
+
+    let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
+    let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
+    if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
+        let spec = request.spec;
+
+        let parsed_spec = match ParsedSpec::try_from(spec) {
+            Ok(ps) => ps,
+            Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
+        };
+
+        // XXX: wrap the state update under the lock in a code block. Otherwise,
+        // we will try to `Send` `mut state` into the spawned thread
+        // below, which will cause the error:
+        // ```
+        // error: future cannot be sent between threads safely
+        // ```
+        {
+            let mut state = compute.state.lock().unwrap();
+            if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
+                let msg = format!(
+                    "invalid compute status for configuration request: {:?}",
+                    state.status.clone()
+                );
+                return Err((msg, StatusCode::PRECONDITION_FAILED));
+            }
+            state.pspec = Some(parsed_spec);
+            state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
+            drop(state);
+            info!("set new spec and notified waiters");
+        }
+
+        // Spawn a blocking thread to wait for compute to become Running.
+        // This is needed so that we do not block the main pool of workers
+        // and can keep serving other requests while this particular request
+        // is waiting for compute to finish configuration.
+        let c = compute.clone();
+        task::spawn_blocking(move || {
+            let mut state = c.state.lock().unwrap();
+            while state.status != ComputeStatus::Running {
+                state = c.state_changed.wait(state).unwrap();
+                info!(
+                    "waiting for compute to become Running, current status: {:?}",
+                    state.status
+                );
+
+                if state.status == ComputeStatus::Failed {
+                    let err = state.error.as_ref().map_or("unknown error", |x| x);
+                    let msg = format!("compute configuration failed: {:?}", err);
+                    return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
+                }
+            }
+
+            Ok(())
+        })
+        .await
+        .unwrap()?;
+
+        // Return current compute state if everything went well.
+        let state = compute.state.lock().unwrap().clone();
+        let status_response = status_response_from_state(&state);
+        Ok(serde_json::to_string(&status_response).unwrap())
+    } else {
+        Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST))
+    }
+}
+
+fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
+    let error = GenericAPIError {
+        error: e.to_string(),
+    };
+    Response::builder()
+        .status(status)
+        .header(CONTENT_TYPE, "application/json")
+        .body(Body::from(serde_json::to_string(&error).unwrap()))
+        .unwrap()
+}
+
+fn render_json(body: Body) -> Response<Body> {
+    Response::builder()
+        .header(CONTENT_TYPE, "application/json")
+        .body(body)
+        .unwrap()
+}
+
+fn render_plain(body: Body) -> Response<Body> {
+    Response::builder()
+        .header(CONTENT_TYPE, "text/plain")
+        .body(body)
+        .unwrap()
+}
+
+async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
+    {
+        let mut state = compute.state.lock().unwrap();
+        if state.status == ComputeStatus::Terminated {
+            return Ok(());
+        }
+        if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
+            let msg = format!(
+                "invalid compute status for termination request: {}",
+                state.status
+            );
+            return Err((msg, StatusCode::PRECONDITION_FAILED));
+        }
+        state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
+        drop(state);
+    }
+
+    forward_termination_signal();
+    info!("sent signal and notified waiters");
+
+    // Spawn a blocking thread to wait for compute to become Terminated.
+    // This is needed so that we do not block the main pool of workers
+    // and can keep serving other requests while this particular request
+    // is waiting for compute to terminate.
+    let c = compute.clone();
+    task::spawn_blocking(move || {
+        let mut state = c.state.lock().unwrap();
+        while state.status != ComputeStatus::Terminated {
+            state = c.state_changed.wait(state).unwrap();
+            info!(
+                "waiting for compute to become {}, current status: {:?}",
+                ComputeStatus::Terminated,
+                state.status
+            );
+        }
+
+        Ok(())
+    })
+    .await
+    .unwrap()?;
+    info!("terminated Postgres");
+    Ok(())
+}
+
+// Main Hyper HTTP server function: starts the server and blocks on it until it exits.
+#[tokio::main]
+async fn serve(port: u16, state: Arc<ComputeNode>) {
+    // This usually binds to both IPv4 and IPv6 on Linux;
+    // see e.g. https://github.com/rust-lang/rust/pull/34440
+    let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
+
+    let make_service = make_service_fn(move |_conn| {
+        let state = state.clone();
+        async move {
+            Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
+                let state = state.clone();
+                async move {
+                    Ok::<_, Infallible>(
+                        // NOTE: We include the URI path in the string. It
+                        // doesn't contain any variable parts or sensitive
+                        // information in this API.
+                        tracing_utils::http::tracing_handler(
+                            req,
+                            |req| routes(req, &state),
+                            OtelName::UriPath,
+                        )
+                        .await,
+                    )
+                }
+            }))
+        }
+    });
+
+    info!("starting HTTP server on {}", addr);
+
+    let server = Server::bind(&addr).serve(make_service);
+
+    // Run the server; if it stops with an error, log it before returning.
+    if let Err(e) = server.await {
+        error!("server error: {}", e);
+    }
+}
+
+/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
+pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+    let state = Arc::clone(state);
+
+    Ok(thread::Builder::new()
+        .name("http-endpoint".into())
+        .spawn(move || serve(port, state))?)
+}
--- a/compute_tools/src/http/extract/json.rs
+++ b/compute_tools/src/http/extract/json.rs
@@ -1,48 +0,0 @@
-use std::ops::{Deref, DerefMut};
-
-use axum::{
-    async_trait,
-    extract::{rejection::JsonRejection, FromRequest, Request},
-};
-use compute_api::responses::GenericAPIError;
-use http::StatusCode;
-
-/// Custom `Json` extractor, so that we can format errors into
-/// `JsonResponse<GenericAPIError>`.
-#[derive(Debug, Clone, Copy, Default)]
-pub(crate) struct Json<T>(pub T);
-
-#[async_trait]
-impl<S, T> FromRequest<S> for Json<T>
-where
-    axum::Json<T>: FromRequest<S, Rejection = JsonRejection>,
-    S: Send + Sync,
-{
-    type Rejection = (StatusCode, axum::Json<GenericAPIError>);
-
-    async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
-        match axum::Json::<T>::from_request(req, state).await {
-            Ok(value) => Ok(Self(value.0)),
-            Err(rejection) => Err((
-                rejection.status(),
-                axum::Json(GenericAPIError {
-                    error: rejection.body_text().to_lowercase(),
-                }),
-            )),
-        }
-    }
-}
-
-impl<T> Deref for Json<T> {
-    type Target = T;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl<T> DerefMut for Json<T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.0
-    }
-}
--- a/compute_tools/src/http/extract/mod.rs
+++ b/compute_tools/src/http/extract/mod.rs
@@ -1,7 +0,0 @@
-pub(crate) mod json;
-pub(crate) mod path;
-pub(crate) mod query;
-
-pub(crate) use json::Json;
-pub(crate) use path::Path;
-pub(crate) use query::Query;
--- a/compute_tools/src/http/extract/path.rs
+++ b/compute_tools/src/http/extract/path.rs
@@ -1,48 +0,0 @@
-use std::ops::{Deref, DerefMut};
-
-use axum::{
-    async_trait,
-    extract::{rejection::PathRejection, FromRequestParts},
-};
-use compute_api::responses::GenericAPIError;
-use http::{request::Parts, StatusCode};
-
-/// Custom `Path` extractor, so that we can format errors into
-/// `JsonResponse<GenericAPIError>`.
-#[derive(Debug, Clone, Copy, Default)]
-pub(crate) struct Path<T>(pub T);
-
-#[async_trait]
-impl<S, T> FromRequestParts<S> for Path<T>
-where
-    axum::extract::Path<T>: FromRequestParts<S, Rejection = PathRejection>,
-    S: Send + Sync,
-{
-    type Rejection = (StatusCode, axum::Json<GenericAPIError>);
-
-    async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
-        match axum::extract::Path::<T>::from_request_parts(parts, state).await {
-            Ok(value) => Ok(Self(value.0)),
-            Err(rejection) => Err((
-                rejection.status(),
-                axum::Json(GenericAPIError {
-                    error: rejection.body_text().to_ascii_lowercase(),
-                }),
-            )),
-        }
-    }
-}
-
-impl<T> Deref for Path<T> {
-    type Target = T;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl<T> DerefMut for Path<T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.0
-    }
-}
--- a/compute_tools/src/http/extract/query.rs
+++ b/compute_tools/src/http/extract/query.rs
@@ -1,48 +0,0 @@
-use std::ops::{Deref, DerefMut};
-
-use axum::{
-    async_trait,
-    extract::{rejection::QueryRejection, FromRequestParts},
-};
-use compute_api::responses::GenericAPIError;
-use http::{request::Parts, StatusCode};
-
-/// Custom `Query` extractor, so that we can format errors into
-/// `JsonResponse<GenericAPIError>`.
-#[derive(Debug, Clone, Copy, Default)]
-pub(crate) struct Query<T>(pub T);
-
-#[async_trait]
-impl<S, T> FromRequestParts<S> for Query<T>
-where
-    axum::extract::Query<T>: FromRequestParts<S, Rejection = QueryRejection>,
-    S: Send + Sync,
-{
-    type Rejection = (StatusCode, axum::Json<GenericAPIError>);
-
-    async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
-        match axum::extract::Query::<T>::from_request_parts(parts, state).await {
-            Ok(value) => Ok(Self(value.0)),
-            Err(rejection) => Err((
-                rejection.status(),
-                axum::Json(GenericAPIError {
-                    error: rejection.body_text().to_ascii_lowercase(),
-                }),
-            )),
-        }
-    }
-}
-
-impl<T> Deref for Query<T> {
-    type Target = T;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl<T> DerefMut for Query<T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.0
-    }
-}
--- a/compute_tools/src/http/mod.rs
+++ b/compute_tools/src/http/mod.rs
@@ -1,56 +1 @@
-use axum::{body::Body, response::Response};
-use compute_api::responses::{ComputeStatus, GenericAPIError};
-use http::{header::CONTENT_TYPE, StatusCode};
-use serde::Serialize;
-use tracing::error;
-
-pub use server::launch_http_server;
-
-mod extract;
-mod routes;
-mod server;
-
-/// Convenience response builder for JSON responses
-struct JsonResponse;
-
-impl JsonResponse {
-    /// Helper for actually creating a response
-    fn create_response(code: StatusCode, body: impl Serialize) -> Response {
-        Response::builder()
-            .status(code)
-            .header(CONTENT_TYPE.as_str(), "application/json")
-            .body(Body::from(serde_json::to_string(&body).unwrap()))
-            .unwrap()
-    }
-
-    /// Create a successful error response
-    pub(self) fn success(code: StatusCode, body: impl Serialize) -> Response {
-        assert!({
-            let code = code.as_u16();
-
-            (200..300).contains(&code)
-        });
-
-        Self::create_response(code, body)
-    }
-
-    /// Create an error response
-    pub(self) fn error(code: StatusCode, error: impl ToString) -> Response {
-        assert!(code.as_u16() >= 400);
-
-        let message = error.to_string();
-        error!(message);
-
-        Self::create_response(code, &GenericAPIError { error: message })
-    }
-
-    /// Create an error response related to the compute being in an invalid state
-    pub(self) fn invalid_status(status: ComputeStatus) -> Response {
-        Self::create_response(
-            StatusCode::PRECONDITION_FAILED,
-            &GenericAPIError {
-                error: format!("invalid compute status: {status}"),
-            },
-        )
-    }
-}
+pub mod api;
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -37,7 +37,7 @@ paths:
              schema:
                $ref: "#/components/schemas/ComputeMetrics"

-  /metrics:
+  /metrics:
    get:
      tags:
      - Info
--- a/compute_tools/src/http/routes/check_writability.rs
+++ b/compute_tools/src/http/routes/check_writability.rs
@@ -1,20 +0,0 @@
-use std::sync::Arc;
-
-use axum::{extract::State, response::Response};
-use compute_api::responses::ComputeStatus;
-use http::StatusCode;
-
-use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse};
-
-/// Check that the compute is currently running.
-pub(in crate::http) async fn is_writable(State(compute): State<Arc<ComputeNode>>) -> Response {
-    let status = compute.get_status();
-    if status != ComputeStatus::Running {
-        return JsonResponse::invalid_status(status);
-    }
-
-    match check_writability(&compute).await {
-        Ok(_) => JsonResponse::success(StatusCode::OK, true),
-        Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
-    }
-}
--- a/compute_tools/src/http/routes/configure.rs
+++ b/compute_tools/src/http/routes/configure.rs
@@ -1,91 +0,0 @@
-use std::sync::Arc;
-
-use axum::{extract::State, response::Response};
-use compute_api::{
-    requests::ConfigurationRequest,
-    responses::{ComputeStatus, ComputeStatusResponse},
-};
-use http::StatusCode;
-use tokio::task;
-use tracing::info;
-
-use crate::{
-    compute::{ComputeNode, ParsedSpec},
-    http::{extract::Json, JsonResponse},
-};
-
-// Accept spec in JSON format and request compute configuration. If anything
-// goes wrong after we set the compute status to `ConfigurationPending` and
-// update compute state with new spec, we basically leave compute in the
-// potentially wrong state. That said, it's control-plane's responsibility to
-// watch compute state after reconfiguration request and to clean restart in
-// case of errors.
-pub(in crate::http) async fn configure(
-    State(compute): State<Arc<ComputeNode>>,
-    request: Json<ConfigurationRequest>,
-) -> Response {
-    if !compute.live_config_allowed {
-        return JsonResponse::error(
-            StatusCode::PRECONDITION_FAILED,
-            "live configuration is not allowed for this compute node".to_string(),
-        );
-    }
-
-    let pspec = match ParsedSpec::try_from(request.spec.clone()) {
-        Ok(p) => p,
-        Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
-    };
-
-    // XXX: wrap state update under lock in a code block. Otherwise, we will try
-    // to `Send` `mut state` into the spawned thread bellow, which will cause
-    // the following rustc error:
-    //
-    // error: future cannot be sent between threads safely
-    {
-        let mut state = compute.state.lock().unwrap();
-        if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
-            return JsonResponse::invalid_status(state.status);
-        }
-
-        state.pspec = Some(pspec);
-        state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
-        drop(state);
-    }
-
-    // Spawn a blocking thread to wait for compute to become Running. This is
-    // needed to do not block the main pool of workers and be able to serve
-    // other requests while some particular request is waiting for compute to
-    // finish configuration.
-    let c = compute.clone();
-    let completed = task::spawn_blocking(move || {
-        let mut state = c.state.lock().unwrap();
-        while state.status != ComputeStatus::Running {
-            state = c.state_changed.wait(state).unwrap();
-            info!(
-                "waiting for compute to become {}, current status: {}",
-                ComputeStatus::Running,
-                state.status
-            );
-
-            if state.status == ComputeStatus::Failed {
-                let err = state.error.as_ref().map_or("unknown error", |x| x);
-                let msg = format!("compute configuration failed: {:?}", err);
-                return Err(msg);
-            }
-        }
-
-        Ok(())
-    })
-    .await
-    .unwrap();
-
-    if let Err(e) = completed {
-        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
-    }
-
-    // Return current compute state if everything went well.
-    let state = compute.state.lock().unwrap().clone();
-    let body = ComputeStatusResponse::from(&state);
-
-    JsonResponse::success(StatusCode::OK, body)
-}
--- a/compute_tools/src/http/routes/database_schema.rs
+++ b/compute_tools/src/http/routes/database_schema.rs
@@ -1,34 +0,0 @@
-use std::sync::Arc;
-
-use axum::{body::Body, extract::State, response::Response};
-use http::{header::CONTENT_TYPE, StatusCode};
-use serde::Deserialize;
-
-use crate::{
-    catalog::{get_database_schema, SchemaDumpError},
-    compute::ComputeNode,
-    http::{extract::Query, JsonResponse},
-};
-
-#[derive(Debug, Clone, Deserialize)]
-pub(in crate::http) struct DatabaseSchemaParams {
-    database: String,
-}
-
-/// Get a schema dump of the requested database.
-pub(in crate::http) async fn get_schema_dump(
-    params: Query<DatabaseSchemaParams>,
-    State(compute): State<Arc<ComputeNode>>,
-) -> Response {
-    match get_database_schema(&compute, &params.database).await {
-        Ok(schema) => Response::builder()
-            .status(StatusCode::OK)
-            .header(CONTENT_TYPE.as_str(), "application/json")
-            .body(Body::from_stream(schema))
-            .unwrap(),
-        Err(SchemaDumpError::DatabaseDoesNotExist) => {
-            JsonResponse::error(StatusCode::NOT_FOUND, SchemaDumpError::DatabaseDoesNotExist)
-        }
-        Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
-    }
-}
--- a/compute_tools/src/http/routes/dbs_and_roles.rs
+++ b/compute_tools/src/http/routes/dbs_and_roles.rs
@@ -1,16 +0,0 @@
-use std::sync::Arc;
-
-use axum::{extract::State, response::Response};
-use http::StatusCode;
-
-use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse};
-
-/// Get the databases and roles from the compute.
-pub(in crate::http) async fn get_catalog_objects(
-    State(compute): State<Arc<ComputeNode>>,
-) -> Response {
-    match get_dbs_and_roles(&compute).await {
-        Ok(catalog_objects) => JsonResponse::success(StatusCode::OK, catalog_objects),
-        Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
-    }
-}
--- a/compute_tools/src/http/routes/extension_server.rs
+++ b/compute_tools/src/http/routes/extension_server.rs
@@ -1,67 +0,0 @@
-use std::sync::Arc;
-
-use axum::{
-    extract::State,
-    response::{IntoResponse, Response},
-};
-use http::StatusCode;
-use serde::Deserialize;
-
-use crate::{
-    compute::ComputeNode,
-    http::{
-        extract::{Path, Query},
-        JsonResponse,
-    },
-};
-
-#[derive(Debug, Clone, Deserialize)]
-pub(in crate::http) struct ExtensionServerParams {
-    is_library: Option<bool>,
-}
-
-/// Download a remote extension.
-pub(in crate::http) async fn download_extension(
-    Path(filename): Path<String>,
-    params: Query<ExtensionServerParams>,
-    State(compute): State<Arc<ComputeNode>>,
-) -> Response {
-    // Don't even try to download extensions if no remote storage is configured
-    if compute.ext_remote_storage.is_none() {
-        return JsonResponse::error(
-            StatusCode::PRECONDITION_FAILED,
-            "remote storage is not configured",
-        );
-    }
-
-    let ext = {
-        let state = compute.state.lock().unwrap();
-        let pspec = state.pspec.as_ref().unwrap();
-        let spec = &pspec.spec;
-
-        let remote_extensions = match spec.remote_extensions.as_ref() {
-            Some(r) => r,
-            None => {
-                return JsonResponse::error(
-                    StatusCode::CONFLICT,
-                    "information about remote extensions is unavailable",
-                );
-            }
-        };
-
-        remote_extensions.get_ext(
-            &filename,
-            params.is_library.unwrap_or(false),
-            &compute.build_tag,
-            &compute.pgversion,
-        )
-    };
-
-    match ext {
-        Ok((ext_name, ext_path)) => match compute.download_extension(ext_name, ext_path).await {
-            Ok(_) => StatusCode::OK.into_response(),
-            Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
-        },
-        Err(e) => JsonResponse::error(StatusCode::NOT_FOUND, e),
-    }
-}
--- a/compute_tools/src/http/routes/extensions.rs
+++ b/compute_tools/src/http/routes/extensions.rs
@@ -1,45 +0,0 @@
-use std::sync::Arc;
-
-use axum::{extract::State, response::Response};
-use compute_api::{
-    requests::ExtensionInstallRequest,
-    responses::{ComputeStatus, ExtensionInstallResponse},
-};
-use http::StatusCode;
-
-use crate::{
-    compute::ComputeNode,
-    http::{extract::Json, JsonResponse},
-};
-
-/// Install a extension.
-pub(in crate::http) async fn install_extension(
-    State(compute): State<Arc<ComputeNode>>,
-    request: Json<ExtensionInstallRequest>,
-) -> Response {
-    let status = compute.get_status();
-    if status != ComputeStatus::Running {
-        return JsonResponse::invalid_status(status);
-    }
-
-    match compute
-        .install_extension(
-            &request.extension,
-            &request.database,
-            request.version.to_string(),
-        )
-        .await
-    {
-        Ok(version) => JsonResponse::success(
-            StatusCode::CREATED,
-            Some(ExtensionInstallResponse {
-                extension: request.extension.clone(),
-                version,
-            }),
-        ),
-        Err(e) => JsonResponse::error(
-            StatusCode::INTERNAL_SERVER_ERROR,
-            format!("failed to install extension: {e}"),
-        ),
-    }
-}
--- a/compute_tools/src/http/routes/failpoints.rs
+++ b/compute_tools/src/http/routes/failpoints.rs
@@ -1,35 +0,0 @@
-use axum::response::{IntoResponse, Response};
-use http::StatusCode;
-use tracing::info;
-use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest};
-
-use crate::http::{extract::Json, JsonResponse};
-
-/// Configure failpoints for testing purposes.
-pub(in crate::http) async fn configure_failpoints(
-    failpoints: Json<ConfigureFailpointsRequest>,
-) -> Response {
-    if !fail::has_failpoints() {
-        return JsonResponse::error(
-            StatusCode::PRECONDITION_FAILED,
-            "Cannot manage failpoints because neon was compiled without failpoints support",
-        );
-    }
-
-    for fp in &*failpoints {
-        info!("cfg failpoint: {} {}", fp.name, fp.actions);
-
-        // We recognize one extra "action" that's not natively recognized
-        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = apply_failpoint(&fp.name, &fp.actions);
-
-        if let Err(e) = cfg_result {
-            return JsonResponse::error(
-                StatusCode::BAD_REQUEST,
-                format!("failed to configure failpoints: {e}"),
-            );
-        }
-    }
-
-    StatusCode::OK.into_response()
-}
--- a/compute_tools/src/http/routes/grants.rs
+++ b/compute_tools/src/http/routes/grants.rs
@@ -1,48 +0,0 @@
-use std::sync::Arc;
-
-use axum::{extract::State, response::Response};
-use compute_api::{
-    requests::SetRoleGrantsRequest,
-    responses::{ComputeStatus, SetRoleGrantsResponse},
-};
-use http::StatusCode;
-
-use crate::{
-    compute::ComputeNode,
-    http::{extract::Json, JsonResponse},
-};
-
-/// Add grants for a role.
-pub(in crate::http) async fn add_grant(
-    State(compute): State<Arc<ComputeNode>>,
-    request: Json<SetRoleGrantsRequest>,
-) -> Response {
-    let status = compute.get_status();
-    if status != ComputeStatus::Running {
-        return JsonResponse::invalid_status(status);
-    }
-
-    match compute
-        .set_role_grants(
-            &request.database,
-            &request.schema,
-            &request.privileges,
-            &request.role,
-        )
-        .await
-    {
-        Ok(()) => JsonResponse::success(
-            StatusCode::CREATED,
-            Some(SetRoleGrantsResponse {
-                database: request.database.clone(),
-                schema: request.schema.clone(),
-                role: request.role.clone(),
-                privileges: request.privileges.clone(),
-            }),
-        ),
-        Err(e) => JsonResponse::error(
-            StatusCode::INTERNAL_SERVER_ERROR,
-            format!("failed to grant role privileges to the schema: {e}"),
-        ),
-    }
-}
--- a/compute_tools/src/http/routes/info.rs
+++ b/compute_tools/src/http/routes/info.rs
@@ -1,11 +0,0 @@
-use axum::response::Response;
-use compute_api::responses::InfoResponse;
-use http::StatusCode;
-
-use crate::http::JsonResponse;
-
-/// Get information about the physical characteristics about the compute.
-pub(in crate::http) async fn get_info() -> Response {
-    let num_cpus = num_cpus::get_physical();
-    JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus })
-}
--- a/compute_tools/src/http/routes/insights.rs
+++ b/compute_tools/src/http/routes/insights.rs
@@ -1,18 +0,0 @@
-use std::sync::Arc;
-
-use axum::{extract::State, response::Response};
-use compute_api::responses::ComputeStatus;
-use http::StatusCode;
-
-use crate::{compute::ComputeNode, http::JsonResponse};
-
-/// Collect current Postgres usage insights.
-pub(in crate::http) async fn get_insights(State(compute): State<Arc<ComputeNode>>) -> Response {
-    let status = compute.get_status();
-    if status != ComputeStatus::Running {
-        return JsonResponse::invalid_status(status);
-    }
-
-    let insights = compute.collect_insights().await;
-    JsonResponse::success(StatusCode::OK, insights)
-}
--- a/compute_tools/src/http/routes/installed_extensions.rs
+++ b/compute_tools/src/http/routes/installed_extensions.rs
@@ -1,33 +0,0 @@
-use std::sync::Arc;
-
-use axum::{extract::State, response::Response};
-use compute_api::responses::ComputeStatus;
-use http::StatusCode;
-use tokio::task;
-
-use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions};
-
-/// Get a list of installed extensions.
-pub(in crate::http) async fn get_installed_extensions(
-    State(compute): State<Arc<ComputeNode>>,
-) -> Response {
-    let status = compute.get_status();
-    if status != ComputeStatus::Running {
-        return JsonResponse::invalid_status(status);
-    }
-
-    let conf = compute.get_conn_conf(None);
-    let res = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
-        .await
-        .unwrap();
-
-    match res {
-        Ok(installed_extensions) => {
-            JsonResponse::success(StatusCode::OK, Some(installed_extensions))
-        }
-        Err(e) => JsonResponse::error(
-            StatusCode::INTERNAL_SERVER_ERROR,
-            format!("failed to get list of installed extensions: {e}"),
-        ),
-    }
-}
--- a/compute_tools/src/http/routes/metrics.rs
+++ b/compute_tools/src/http/routes/metrics.rs
@@ -1,32 +0,0 @@
-use axum::{body::Body, response::Response};
-use http::header::CONTENT_TYPE;
-use http::StatusCode;
-use metrics::proto::MetricFamily;
-use metrics::Encoder;
-use metrics::TextEncoder;
-
-use crate::{http::JsonResponse, installed_extensions};
-
-/// Expose Prometheus metrics.
-pub(in crate::http) async fn get_metrics() -> Response {
-    // When we call TextEncoder::encode() below, it will immediately return an
-    // error if a metric family has no metrics, so we need to preemptively
-    // filter out metric families with no metrics.
-    let metrics = installed_extensions::collect()
-        .into_iter()
-        .filter(|m| !m.get_metric().is_empty())
-        .collect::<Vec<MetricFamily>>();
-
-    let encoder = TextEncoder::new();
-    let mut buffer = vec![];
-
-    if let Err(e) = encoder.encode(&metrics, &mut buffer) {
-        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
-    }
-
-    Response::builder()
-        .status(StatusCode::OK)
-        .header(CONTENT_TYPE, encoder.format_type())
-        .body(Body::from(buffer))
-        .unwrap()
-}
--- a/compute_tools/src/http/routes/metrics_json.rs
+++ b/compute_tools/src/http/routes/metrics_json.rs
@@ -1,12 +0,0 @@
-use std::sync::Arc;
-
-use axum::{extract::State, response::Response};
-use http::StatusCode;
-
-use crate::{compute::ComputeNode, http::JsonResponse};
-
-/// Get startup metrics.
-pub(in crate::http) async fn get_metrics(State(compute): State<Arc<ComputeNode>>) -> Response {
-    let metrics = compute.state.lock().unwrap().metrics.clone();
-    JsonResponse::success(StatusCode::OK, metrics)
-}
--- a/compute_tools/src/http/routes/mod.rs
+++ b/compute_tools/src/http/routes/mod.rs
@@ -1,38 +0,0 @@
-use compute_api::responses::ComputeStatusResponse;
-
-use crate::compute::ComputeState;
-
-pub(in crate::http) mod check_writability;
-pub(in crate::http) mod configure;
-pub(in crate::http) mod database_schema;
-pub(in crate::http) mod dbs_and_roles;
-pub(in crate::http) mod extension_server;
-pub(in crate::http) mod extensions;
-pub(in crate::http) mod failpoints;
-pub(in crate::http) mod grants;
-pub(in crate::http) mod info;
-pub(in crate::http) mod insights;
-pub(in crate::http) mod installed_extensions;
-pub(in crate::http) mod metrics;
-pub(in crate::http) mod metrics_json;
-pub(in crate::http) mod status;
-pub(in crate::http) mod terminate;
-
-impl From<&ComputeState> for ComputeStatusResponse {
-    fn from(state: &ComputeState) -> Self {
-        ComputeStatusResponse {
-            start_time: state.start_time,
-            tenant: state
-                .pspec
-                .as_ref()
-                .map(|pspec| pspec.tenant_id.to_string()),
-            timeline: state
-                .pspec
-                .as_ref()
-                .map(|pspec| pspec.timeline_id.to_string()),
-            status: state.status,
-            last_active: state.last_active,
-            error: state.error.clone(),
-        }
-    }
-}
--- a/compute_tools/src/http/routes/status.rs
+++ b/compute_tools/src/http/routes/status.rs
@@ -1,14 +0,0 @@
-use std::{ops::Deref, sync::Arc};
-
-use axum::{extract::State, http::StatusCode, response::Response};
-use compute_api::responses::ComputeStatusResponse;
-
-use crate::{compute::ComputeNode, http::JsonResponse};
-
-/// Retrieve the state of the comute.
-pub(in crate::http) async fn get_status(State(compute): State<Arc<ComputeNode>>) -> Response {
-    let state = compute.state.lock().unwrap();
-    let body = ComputeStatusResponse::from(state.deref());
-
-    JsonResponse::success(StatusCode::OK, body)
-}
--- a/compute_tools/src/http/routes/terminate.rs
+++ b/compute_tools/src/http/routes/terminate.rs
@@ -1,58 +0,0 @@
-use std::sync::Arc;
-
-use axum::{
-    extract::State,
-    response::{IntoResponse, Response},
-};
-use compute_api::responses::ComputeStatus;
-use http::StatusCode;
-use tokio::task;
-use tracing::info;
-
-use crate::{
-    compute::{forward_termination_signal, ComputeNode},
-    http::JsonResponse,
-};
-
-/// Terminate the compute.
-pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {
-    {
-        let mut state = compute.state.lock().unwrap();
-        if state.status == ComputeStatus::Terminated {
-            return StatusCode::CREATED.into_response();
-        }
-
-        if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
-            return JsonResponse::invalid_status(state.status);
-        }
-
-        state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
-        drop(state);
-    }
-
-    forward_termination_signal();
-    info!("sent signal and notified waiters");
-
-    // Spawn a blocking thread to wait for compute to become Terminated.
-    // This is needed to do not block the main pool of workers and
-    // be able to serve other requests while some particular request
-    // is waiting for compute to finish configuration.
-    let c = compute.clone();
-    task::spawn_blocking(move || {
-        let mut state = c.state.lock().unwrap();
-        while state.status != ComputeStatus::Terminated {
-            state = c.state_changed.wait(state).unwrap();
-            info!(
-                "waiting for compute to become {}, current status: {:?}",
-                ComputeStatus::Terminated,
-                state.status
-            );
-        }
-    })
-    .await
-    .unwrap();
-
-    info!("terminated Postgres");
-
-    StatusCode::OK.into_response()
-}
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -1,165 +0,0 @@
-use std::{
-    net::{IpAddr, Ipv6Addr, SocketAddr},
-    sync::{
-        atomic::{AtomicU64, Ordering},
-        Arc,
-    },
-    thread,
-    time::Duration,
-};
-
-use anyhow::Result;
-use axum::{
-    response::{IntoResponse, Response},
-    routing::{get, post},
-    Router,
-};
-use http::StatusCode;
-use tokio::net::TcpListener;
-use tower::ServiceBuilder;
-use tower_http::{
-    request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer},
-    trace::TraceLayer,
-};
-use tracing::{debug, error, info, Span};
-
-use super::routes::{
-    check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
-    grants, info as info_route, insights, installed_extensions, metrics, metrics_json, status,
-    terminate,
-};
-use crate::compute::ComputeNode;
-
-async fn handle_404() -> Response {
-    StatusCode::NOT_FOUND.into_response()
-}
-
-#[derive(Clone, Default)]
-struct ComputeMakeRequestId(Arc<AtomicU64>);
-
-impl MakeRequestId for ComputeMakeRequestId {
-    fn make_request_id<B>(
-        &mut self,
-        _request: &http::Request<B>,
-    ) -> Option<tower_http::request_id::RequestId> {
-        let request_id = self
-            .0
-            .fetch_add(1, Ordering::SeqCst)
-            .to_string()
-            .parse()
-            .unwrap();
-
-        Some(RequestId::new(request_id))
-    }
-}
-
-/// Run the HTTP server and wait on it forever.
-#[tokio::main]
-async fn serve(port: u16, compute: Arc<ComputeNode>) {
-    const X_REQUEST_ID: &str = "x-request-id";
-
-    let mut app = Router::new()
-        .route("/check_writability", post(check_writability::is_writable))
-        .route("/configure", post(configure::configure))
-        .route("/database_schema", get(database_schema::get_schema_dump))
-        .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
-        .route(
-            "/extension_server/*filename",
-            post(extension_server::download_extension),
-        )
-        .route("/extensions", post(extensions::install_extension))
-        .route("/grants", post(grants::add_grant))
-        .route("/info", get(info_route::get_info))
-        .route("/insights", get(insights::get_insights))
-        .route(
-            "/installed_extensions",
-            get(installed_extensions::get_installed_extensions),
-        )
-        .route("/metrics", get(metrics::get_metrics))
-        .route("/metrics.json", get(metrics_json::get_metrics))
-        .route("/status", get(status::get_status))
-        .route("/terminate", post(terminate::terminate))
-        .fallback(handle_404)
-        .layer(
-            ServiceBuilder::new()
-                .layer(SetRequestIdLayer::x_request_id(
-                    ComputeMakeRequestId::default(),
-                ))
-                .layer(
-                    TraceLayer::new_for_http()
-                        .on_request(|request: &http::Request<_>, _span: &Span| {
-                            let request_id = request
-                                .headers()
-                                .get(X_REQUEST_ID)
-                                .unwrap()
-                                .to_str()
-                                .unwrap();
-
-                            match request.uri().path() {
-                                "/metrics" => {
-                                    debug!(%request_id, "{} {}", request.method(), request.uri())
-                                }
-                                _ => info!(%request_id, "{} {}", request.method(), request.uri()),
-                            };
-                        })
-                        .on_response(
-                            |response: &http::Response<_>, latency: Duration, _span: &Span| {
-                                let request_id = response
-                                    .headers()
-                                    .get(X_REQUEST_ID)
-                                    .unwrap()
-                                    .to_str()
-                                    .unwrap();
-
-                                info!(
-                                    %request_id,
-                                    code = response.status().as_u16(),
-                                    latency = latency.as_millis()
-                                )
-                            },
-                        ),
-                )
-                .layer(PropagateRequestIdLayer::x_request_id()),
-        )
-        .with_state(compute);
-
-    // Add in any testing support
-    if cfg!(feature = "testing") {
-        use super::routes::failpoints;
-
-        app = app.route("/failpoints", post(failpoints::configure_failpoints))
-    }
-
-    // This usually binds to both IPv4 and IPv6 on Linux, see
-    // https://github.com/rust-lang/rust/pull/34440 for more information
-    let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
-    let listener = match TcpListener::bind(&addr).await {
-        Ok(listener) => listener,
-        Err(e) => {
-            error!(
-                "failed to bind the compute_ctl HTTP server to port {}: {}",
-                port, e
-            );
-            return;
-        }
-    };
-
-    if let Ok(local_addr) = listener.local_addr() {
-        info!("compute_ctl HTTP server listening on {}", local_addr);
-    } else {
-        info!("compute_ctl HTTP server listening on port {}", port);
-    }
-
-    if let Err(e) = axum::serve(listener, app).await {
-        error!("compute_ctl HTTP server error: {}", e);
-    }
-}
-
-/// Launch a separate HTTP server thread and return its `JoinHandle`.
-pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
-    let state = Arc::clone(state);
-
-    Ok(thread::Builder::new()
-        .name("http-server".into())
-        .spawn(move || serve(port, state))?)
-}
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -3,6 +3,8 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]

+extern crate hyper0 as hyper;
+
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/compute_tools/src/migration.rs
+++ b/compute_tools/src/migration.rs
@@ -1,16 +1,13 @@
 use anyhow::{Context, Result};
-use fail::fail_point;
-use postgres::{Client, Transaction};
+use postgres::Client;
 use tracing::info;

-/// Runs a series of migrations on a target database
 pub(crate) struct MigrationRunner<'m> {
    client: &'m mut Client,
    migrations: &'m [&'m str],
 }

 impl<'m> MigrationRunner<'m> {
-    /// Create a new migration runner
    pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
        // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
        assert!(migrations.len() + 1 < i64::MAX as usize);
@@ -18,110 +15,87 @@ impl<'m> MigrationRunner<'m> {
        Self { client, migrations }
    }

-    /// Get the current value neon_migration.migration_id
    fn get_migration_id(&mut self) -> Result<i64> {
+        let query = "SELECT id FROM neon_migration.migration_id";
        let row = self
            .client
-            .query_one("SELECT id FROM neon_migration.migration_id", &[])?;
+            .query_one(query, &[])
+            .context("run_migrations get migration_id")?;

        Ok(row.get::<&str, i64>("id"))
    }

-    /// Update the neon_migration.migration_id value
-    ///
-    /// This function has a fail point called compute-migration, which can be
-    /// used if you would like to fail the application of a series of migrations
-    /// at some point.
-    fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> {
-        // We use this fail point in order to check that failing in the
-        // middle of applying a series of migrations fails in an expected
-        // manner
-        if cfg!(feature = "testing") {
-            let fail = (|| {
-                fail_point!("compute-migration", |fail_migration_id| {
-                    migration_id == fail_migration_id.unwrap().parse::<i64>().unwrap()
-                });
+    fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
+        let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);

-                false
-            })();
-
-            if fail {
-                return Err(anyhow::anyhow!(format!(
-                    "migration {} was configured to fail because of a failpoint",
-                    migration_id
-                )));
-            }
-        }
-
-        txn.query(
-            "UPDATE neon_migration.migration_id SET id = $1",
-            &[&migration_id],
-        )
-        .with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?;
+        self.client
+            .simple_query(&setval)
+            .context("run_migrations update id")?;

        Ok(())
    }

-    /// Prepare the migrations the target database for handling migrations
-    fn prepare_database(&mut self) -> Result<()> {
-        self.client
-            .simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")?;
-        self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)")?;
-        self.client.simple_query(
-            "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",
-        )?;
-        self.client
-            .simple_query("ALTER SCHEMA neon_migration OWNER TO cloud_admin")?;
-        self.client
-            .simple_query("REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC")?;
+    fn prepare_migrations(&mut self) -> Result<()> {
+        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+        self.client.simple_query(query)?;
+
+        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+        self.client.simple_query(query)?;
+
+        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+        self.client.simple_query(query)?;
+
+        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+        self.client.simple_query(query)?;
+
+        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+        self.client.simple_query(query)?;

        Ok(())
    }

-    /// Run an individual migration
-    fn run_migration(txn: &mut Transaction, migration_id: i64, migration: &str) -> Result<()> {
-        if migration.starts_with("-- SKIP") {
-            info!("Skipping migration id={}", migration_id);
-
-            // Even though we are skipping the migration, updating the
-            // migration ID should help keep logic easy to understand when
-            // trying to understand the state of a cluster.
-            Self::update_migration_id(txn, migration_id)?;
-        } else {
-            info!("Running migration id={}:\n{}\n", migration_id, migration);
-
-            txn.simple_query(migration)
-                .with_context(|| format!("apply migration {migration_id}"))?;
-
-            Self::update_migration_id(txn, migration_id)?;
-        }
-
-        Ok(())
-    }
-
-    /// Run the configured set of migrations
    pub fn run_migrations(mut self) -> Result<()> {
-        self.prepare_database()
-            .context("prepare database to handle migrations")?;
+        self.prepare_migrations()?;

        let mut current_migration = self.get_migration_id()? as usize;
        while current_migration < self.migrations.len() {
-            // The index lags the migration ID by 1, so the current migration
-            // ID is also the next index
-            let migration_id = (current_migration + 1) as i64;
+            macro_rules! migration_id {
+                ($cm:expr) => {
+                    ($cm + 1) as i64
+                };
+            }

-            let mut txn = self
-                .client
-                .transaction()
-                .with_context(|| format!("begin transaction for migration {migration_id}"))?;
+            let migration = self.migrations[current_migration];

-            Self::run_migration(&mut txn, migration_id, self.migrations[current_migration])
-                .with_context(|| format!("running migration {migration_id}"))?;
+            if migration.starts_with("-- SKIP") {
+                info!("Skipping migration id={}", migration_id!(current_migration));
+            } else {
+                info!(
+                    "Running migration id={}:\n{}\n",
+                    migration_id!(current_migration),
+                    migration
+                );

-            txn.commit()
-                .with_context(|| format!("commit transaction for migration {migration_id}"))?;
+                self.client
+                    .simple_query("BEGIN")
+                    .context("begin migration")?;

-            info!("Finished migration id={}", migration_id);
+                self.client.simple_query(migration).with_context(|| {
+                    format!(
+                        "run_migrations migration id={}",
+                        migration_id!(current_migration)
+                    )
+                })?;
+
+                // Migration IDs start at 1
+                self.update_migration_id(migration_id!(current_migration))?;
+
+                self.client
+                    .simple_query("COMMIT")
+                    .context("commit migration")?;
+
+                info!("Finished migration id={}", migration_id!(current_migration));
+            }

            current_migration += 1;
        }
--- a/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql
+++ b/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql
@@ -1,9 +0,0 @@
-DO $$
-DECLARE
-    bypassrls boolean;
-BEGIN
-    SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser';
-    IF NOT bypassrls THEN
-        RAISE EXCEPTION 'neon_superuser cannot bypass RLS';
-    END IF;
-END $$;
--- a/compute_tools/src/migrations/tests/0002-alter_roles.sql
+++ b/compute_tools/src/migrations/tests/0002-alter_roles.sql
@@ -1,25 +0,0 @@
-DO $$
-DECLARE
-    role record;
-BEGIN
-    FOR role IN
-        SELECT rolname AS name, rolinherit AS inherit
-        FROM pg_roles
-        WHERE pg_has_role(rolname, 'neon_superuser', 'member')
-    LOOP
-        IF NOT role.inherit THEN
-            RAISE EXCEPTION '% cannot inherit', quote_ident(role.name);
-        END IF;
-    END LOOP;
-
-    FOR role IN
-        SELECT rolname AS name, rolbypassrls AS bypassrls
-        FROM pg_roles
-        WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member')
-            AND NOT starts_with(rolname, 'pg_')
-    LOOP
-        IF role.bypassrls THEN
-            RAISE EXCEPTION  '% can bypass RLS', quote_ident(role.name);
-        END IF;
-    END LOOP;
-END $$;
--- a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql
@@ -1,10 +0,0 @@
-DO $$
-BEGIN
-    IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN
-        RETURN;
-    END IF;
-
-    IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
-        RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription';
-    END IF;
-END $$;
--- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
@@ -1,19 +0,0 @@
-DO $$
-DECLARE
-    monitor record;
-BEGIN
-    SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
-            admin_option AS admin
-        INTO monitor
-        FROM pg_auth_members
-        WHERE roleid = 'pg_monitor'::regrole
-            AND member = 'pg_monitor'::regrole;
-
-    IF NOT monitor.member THEN
-        RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor';
-    END IF;
-
-    IF NOT monitor.admin THEN
-        RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor';
-    END IF;
-END $$;
--- a/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql
@@ -1,2 +0,0 @@
-- This test was never written becuase at the time migration tests were added
-- the accompanying migration was already skipped.
--- a/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql
@@ -1,2 +0,0 @@
-- This test was never written becuase at the time migration tests were added
-- the accompanying migration was already skipped.
--- a/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
@@ -1,2 +0,0 @@
-- This test was never written becuase at the time migration tests were added
-- the accompanying migration was already skipped.
--- a/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
@@ -1,2 +0,0 @@
-- This test was never written becuase at the time migration tests were added
-- the accompanying migration was already skipped.
--- a/compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql
+++ b/compute_tools/src/migrations/tests/0009-revoke_replication_for_previously_allowed_roles.sql
@@ -1,2 +0,0 @@
-- This test was never written becuase at the time migration tests were added
-- the accompanying migration was already skipped.
--- a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql
@@ -1,13 +0,0 @@
-DO $$
-DECLARE
-    can_execute boolean;
-BEGIN
-    SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute'))
-       INTO can_execute
-       FROM pg_proc
-       WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot')
-           AND pronamespace = 'pg_catalog'::regnamespace;
-    IF NOT can_execute THEN
-        RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot';
-    END IF;
-END $$;
--- a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
@@ -1,13 +0,0 @@
-DO $$
-DECLARE
-    can_execute boolean;
-BEGIN
-    SELECT has_function_privilege('neon_superuser', oid, 'execute')
-       INTO can_execute
-       FROM pg_proc
-       WHERE proname = 'pg_show_replication_origin_status'
-           AND pronamespace = 'pg_catalog'::regnamespace;
-    IF NOT can_execute THEN
-        RAISE EXCEPTION 'neon_superuser cannot execute pg_show_replication_origin_status';
-    END IF;
-END $$;
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -47,7 +47,6 @@ pub enum PerDatabasePhase {
    DeleteDBRoleReferences,
    ChangeSchemaPerms,
    HandleAnonExtension,
-    DropSubscriptionsForDeletedDatabases,
 }

 #[derive(Clone, Debug)]
@@ -75,7 +74,7 @@ pub struct MutableApplyContext {
    pub dbs: HashMap<String, Database>,
 }

-/// Apply the operations that belong to the given spec apply phase.
+/// Apply the operations that belong to the given spec apply phase.
 ///
 /// Commands within a single phase are executed in order of Iterator yield.
 /// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database
@@ -327,12 +326,13 @@ async fn get_operations<'a>(

                            // Use FORCE to drop database even if there are active connections.
                            // We run this from `cloud_admin`, so it should have enough privileges.
-                            //
                            // NB: there could be other db states, which prevent us from dropping
                            // the database. For example, if db is used by any active subscription
                            // or replication slot.
-                            // Such cases are handled in the DropSubscriptionsForDeletedDatabases
-                            // phase. We do all the cleanup before actually dropping the database.
+                            // TODO: deal with it once we allow logical replication. A proper fix should
+                            // involve returning an error code to the control plane, so that it can
+                            // figure out that this is a non-retryable error, return it to the user,
+                            // and fail the operation permanently.
                            let drop_db_query: String = format!(
                                "DROP DATABASE IF EXISTS {} WITH (FORCE)",
                                &op.name.pg_quote()
@@ -444,30 +444,6 @@ async fn get_operations<'a>(
        }
        ApplySpecPhase::RunInEachDatabase { db, subphase } => {
            match subphase {
-                PerDatabasePhase::DropSubscriptionsForDeletedDatabases => {
-                    match &db {
-                        DB::UserDB(db) => {
-                            let drop_subscription_query: String = format!(
-                                include_str!("sql/drop_subscription_for_drop_dbs.sql"),
-                                datname_str = escape_literal(&db.name),
-                            );
-
-                            let operations = vec![Operation {
-                                query: drop_subscription_query,
-                                comment: Some(format!(
-                                    "optionally dropping subscriptions for DB {}",
-                                    db.name,
-                                )),
-                            }]
-                            .into_iter();
-
-                            Ok(Box::new(operations))
-                        }
-                        // skip this cleanup for the system databases
-                        // because users can't drop them
-                        DB::SystemDB => Ok(Box::new(empty())),
-                    }
-                }
                PerDatabasePhase::DeleteDBRoleReferences => {
                    let ctx = ctx.read().await;

@@ -498,19 +474,7 @@ async fn get_operations<'a>(
                                        ),
                                        comment: None,
                                    },
-                                    // Revoke some potentially blocking privileges (Neon-specific currently)
-                                    Operation {
-                                        query: format!(
-                                            include_str!("sql/pre_drop_role_revoke_privileges.sql"),
-                                            role_name = quoted,
-                                        ),
-                                        comment: None,
-                                    },
                                    // This now will only drop privileges of the role
-                                    // TODO: this is obviously not 100% true because of the above case,
-                                    // there could be still some privileges that are not revoked. Maybe this
-                                    // only drops privileges that were granted *by this* role, not *to this* role,
-                                    // but this has to be checked.
                                    Operation {
                                        query: format!("DROP OWNED BY {}", quoted),
                                        comment: None,
--- a/compute_tools/src/sql/drop_subscription_for_drop_dbs.sql
+++ b/compute_tools/src/sql/drop_subscription_for_drop_dbs.sql
@@ -1,11 +0,0 @@
-DO $$
-DECLARE
-    subname TEXT;
-BEGIN
-    FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
-        EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
-        EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
-        EXECUTE format('DROP SUBSCRIPTION %I;', subname);
-    END LOOP;
-END;
-$$;
--- a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql
+++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql
@@ -1,28 +0,0 @@
-SET SESSION ROLE neon_superuser;
-
-DO $$
-DECLARE
-    schema TEXT;
-    revoke_query TEXT;
-BEGIN
-    FOR schema IN
-        SELECT schema_name
-        FROM information_schema.schemata
-        -- So far, we only had issues with 'public' schema. Probably, because we do some additional grants,
-        -- e.g., make DB owner the owner of 'public' schema automatically (when created via API).
-        -- See https://github.com/neondatabase/cloud/issues/13582 for the context.
-        -- Still, keep the loop because i) it efficiently handles the case when there is no 'public' schema,
-        -- ii) it's easy to add more schemas to the list if needed.
-        WHERE schema_name IN ('public')
-    LOOP
-        revoke_query := format(
-            'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
-            schema
-        );
-
-        EXECUTE revoke_query;
-    END LOOP;
-END;
-$$;
-
-RESET ROLE;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -62,7 +62,7 @@ use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;
 use crate::storage_controller::StorageController;

-use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
+use compute_api::responses::{ComputeState, ComputeStatus};
 use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
@@ -585,7 +585,6 @@ impl Endpoint {
            features: self.features.clone(),
            swap_size_bytes: None,
            disk_quota_bytes: None,
-            disable_lfc_resizing: None,
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
@@ -739,7 +738,7 @@ impl Endpoint {
    }

    // Call the /status HTTP API
-    pub async fn get_status(&self) -> Result<ComputeStatusResponse> {
+    pub async fn get_status(&self) -> Result<ComputeState> {
        let client = reqwest::Client::new();

        let response = client
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -483,6 +483,7 @@ impl LocalEnv {
            .iter()
            .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
            .map(|&(_, timeline_id)| timeline_id)
+            .map(TimelineId::from)
    }

    pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -822,7 +822,10 @@ impl StorageController {
        self.dispatch(
            Method::PUT,
            format!("control/v1/tenant/{tenant_shard_id}/migrate"),
-            Some(TenantShardMigrateRequest { node_id }),
+            Some(TenantShardMigrateRequest {
+                tenant_shard_id,
+                node_id,
+            }),
        )
        .await
    }
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,16 +1,12 @@
 use futures::StreamExt;
-use std::{
-    collections::{HashMap, HashSet},
-    str::FromStr,
-    time::Duration,
-};
+use std::{str::FromStr, time::Duration};

 use clap::{Parser, Subcommand};
 use pageserver_api::{
    controller_api::{
        AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
-        SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest,
-        TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
+        SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
+        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -116,13 +112,6 @@ enum Command {
        #[arg(long)]
        node: NodeId,
    },
-    /// Migrate the secondary location for a tenant shard to a specific pageserver.
-    TenantShardMigrateSecondary {
-        #[arg(long)]
-        tenant_shard_id: TenantShardId,
-        #[arg(long)]
-        node: NodeId,
-    },
    /// Cancel any ongoing reconciliation for this shard
    TenantShardCancelReconcile {
        #[arg(long)]
@@ -157,12 +146,6 @@ enum Command {
        #[arg(long)]
        tenant_id: TenantId,
    },
-    TenantSetPreferredAz {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        preferred_az: Option<String>,
-    },
    /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
    /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
    TenantDrop {
@@ -412,12 +395,11 @@ async fn main() -> anyhow::Result<()> {
            resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));

            let mut table = comfy_table::Table::new();
-            table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]);
+            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
            for node in resp {
                table.add_row([
                    format!("{}", node.id),
                    node.listen_http_addr,
-                    node.availability_zone_id,
                    format!("{:?}", node.scheduling),
                    format!("{:?}", node.availability),
                ]);
@@ -490,7 +472,6 @@ async fn main() -> anyhow::Result<()> {
            let mut table = comfy_table::Table::new();
            table.set_header([
                "TenantId",
-                "Preferred AZ",
                "ShardCount",
                "StripeSize",
                "Placement",
@@ -500,11 +481,6 @@ async fn main() -> anyhow::Result<()> {
                let shard_zero = tenant.shards.into_iter().next().unwrap();
                table.add_row([
                    format!("{}", tenant.tenant_id),
-                    shard_zero
-                        .preferred_az_id
-                        .as_ref()
-                        .cloned()
-                        .unwrap_or("".to_string()),
                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
                    format!("{:?}", tenant.stripe_size),
                    format!("{:?}", tenant.policy),
@@ -564,7 +540,10 @@ async fn main() -> anyhow::Result<()> {
            tenant_shard_id,
            node,
        } => {
-            let req = TenantShardMigrateRequest { node_id: node };
+            let req = TenantShardMigrateRequest {
+                tenant_shard_id,
+                node_id: node,
+            };

            storcon_client
                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -574,20 +553,6 @@ async fn main() -> anyhow::Result<()> {
                )
                .await?;
        }
-        Command::TenantShardMigrateSecondary {
-            tenant_shard_id,
-            node,
-        } => {
-            let req = TenantShardMigrateRequest { node_id: node };
-
-            storcon_client
-                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
-                    Method::PUT,
-                    format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"),
-                    Some(req),
-                )
-                .await?;
-        }
        Command::TenantShardCancelReconcile { tenant_shard_id } => {
            storcon_client
                .dispatch::<(), ()>(
@@ -631,19 +596,6 @@ async fn main() -> anyhow::Result<()> {
                    None,
                )
                .await?;
-
-            let nodes = storcon_client
-                .dispatch::<(), Vec<NodeDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/node".to_string(),
-                    None,
-                )
-                .await?;
-            let nodes = nodes
-                .into_iter()
-                .map(|n| (n.id, n))
-                .collect::<HashMap<_, _>>();
-
            println!("Tenant {tenant_id}");
            let mut table = comfy_table::Table::new();
            table.add_row(["Policy", &format!("{:?}", policy)]);
@@ -652,14 +604,7 @@ async fn main() -> anyhow::Result<()> {
            println!("{table}");
            println!("Shards:");
            let mut table = comfy_table::Table::new();
-            table.set_header([
-                "Shard",
-                "Attached",
-                "Attached AZ",
-                "Secondary",
-                "Last error",
-                "status",
-            ]);
+            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
            for shard in shards {
                let secondary = shard
                    .node_secondary
@@ -682,18 +627,11 @@ async fn main() -> anyhow::Result<()> {
                }
                let status = status_parts.join(",");

-                let attached_node = shard
-                    .node_attached
-                    .as_ref()
-                    .map(|id| nodes.get(id).expect("Shard references nonexistent node"));
-
                table.add_row([
                    format!("{}", shard.tenant_shard_id),
-                    attached_node
-                        .map(|n| format!("{} ({})", n.listen_http_addr, n.id))
-                        .unwrap_or(String::new()),
-                    attached_node
-                        .map(|n| n.availability_zone_id.clone())
+                    shard
+                        .node_attached
+                        .map(|n| format!("{}", n))
                        .unwrap_or(String::new()),
                    secondary,
                    shard.last_error,
@@ -702,66 +640,6 @@ async fn main() -> anyhow::Result<()> {
            }
            println!("{table}");
        }
-        Command::TenantSetPreferredAz {
-            tenant_id,
-            preferred_az,
-        } => {
-            // First learn about the tenant's shards
-            let describe_response = storcon_client
-                .dispatch::<(), TenantDescribeResponse>(
-                    Method::GET,
-                    format!("control/v1/tenant/{tenant_id}"),
-                    None,
-                )
-                .await?;
-
-            // Learn about nodes to validate the AZ ID
-            let nodes = storcon_client
-                .dispatch::<(), Vec<NodeDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/node".to_string(),
-                    None,
-                )
-                .await?;
-
-            if let Some(preferred_az) = &preferred_az {
-                let azs = nodes
-                    .into_iter()
-                    .map(|n| (n.availability_zone_id))
-                    .collect::<HashSet<_>>();
-                if !azs.contains(preferred_az) {
-                    anyhow::bail!(
-                        "AZ {} not found on any node: known AZs are: {:?}",
-                        preferred_az,
-                        azs
-                    );
-                }
-            } else {
-                // Make it obvious to the user that since they've omitted an AZ, we're clearing it
-                eprintln!("Clearing preferred AZ for tenant {}", tenant_id);
-            }
-
-            // Construct a request that modifies all the tenant's shards
-            let req = ShardsPreferredAzsRequest {
-                preferred_az_ids: describe_response
-                    .shards
-                    .into_iter()
-                    .map(|s| {
-                        (
-                            s.tenant_shard_id,
-                            preferred_az.clone().map(AvailabilityZone),
-                        )
-                    })
-                    .collect(),
-            };
-            storcon_client
-                .dispatch::<ShardsPreferredAzsRequest, ()>(
-                    Method::PUT,
-                    "control/v1/preferred_azs".to_string(),
-                    Some(req),
-                )
-                .await?;
-        }
        Command::TenantWarmup { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
@@ -1037,7 +915,10 @@ async fn main() -> anyhow::Result<()> {
                            .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                                Method::PUT,
                                format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
-                                Some(TenantShardMigrateRequest { node_id: mv.to }),
+                                Some(TenantShardMigrateRequest {
+                                    tenant_shard_id: mv.tenant_shard_id,
+                                    node_id: mv.to,
+                                }),
                            )
                            .await
                            .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
@@ -1154,15 +1035,7 @@ async fn main() -> anyhow::Result<()> {
            resp.sort_by(|a, b| a.id.cmp(&b.id));

            let mut table = comfy_table::Table::new();
-            table.set_header([
-                "Id",
-                "Version",
-                "Host",
-                "Port",
-                "Http Port",
-                "AZ Id",
-                "Scheduling",
-            ]);
+            table.set_header(["Id", "Version", "Host", "Port", "Http Port", "AZ Id"]);
            for sk in resp {
                table.add_row([
                    format!("{}", sk.id),
@@ -1170,8 +1043,7 @@ async fn main() -> anyhow::Result<()> {
                    sk.host,
                    format!("{}", sk.port),
                    format!("{}", sk.http_port),
-                    sk.availability_zone_id.clone(),
-                    String::from(sk.scheduling_policy),
+                    sk.availability_zone_id.to_string(),
                ]);
            }
            println!("{table}");
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -0,0 +1,96 @@
+version: '3.8'
+
+x-build-args-bullseye: &build-args-bullseye
+  DEBIAN_VERSION: bullseye
+  GIT_VERSION: local # does not seem to be used in the compute node, though
+  BUILD_TAG: ${BUILD_TAG:-local}
+
+x-build-args-bookworm: &build-args-bookworm
+  DEBIAN_VERSION: bookworm
+  GIT_VERSION: local # does not seem to be used in the compute node, though
+  BUILD_TAG: ${BUILD_TAG:-local}
+
+services:
+  compute-node-v14: &compute-node-v14-base
+    image: neondatabase/compute-node-v14:${IMAGE_TAG:-local}
+    build:
+      context: .
+      dockerfile: compute/compute-node.Dockerfile
+      args:
+        <<: *build-args-bullseye
+        PG_VERSION: v14
+      cache_from:
+        - neondatabase/compute-node-v14:${CACHE_FROM_TAG:-latest}
+
+  compute-node-v14-amd64:
+    <<: *compute-node-v14-base
+    platform: linux/amd64
+    image: neondatabase/compute-node-v14:${IMAGE_TAG:-local}-amd64
+
+  compute-node-v14-arm64:
+    <<: *compute-node-v14-base
+    platform: linux/arm64
+    image: neondatabase/compute-node-v14:${IMAGE_TAG:-local}-arm64
+
+  compute-node-v15: &compute-node-v15-base
+    image: neondatabase/compute-node-v15:${IMAGE_TAG:-local}
+    build:
+      context: .
+      dockerfile: compute/compute-node.Dockerfile
+      args:
+        <<: *build-args-bullseye
+        PG_VERSION: v15
+      cache_from:
+        - neondatabase/compute-node-v15:${CACHE_FROM_TAG:-latest}
+
+  compute-node-v15-amd64:
+    <<: *compute-node-v15-base
+    platform: linux/amd64
+    image: neondatabase/compute-node-v15:${IMAGE_TAG:-local}-amd64
+
+  compute-node-v15-arm64:
+    <<: *compute-node-v15-base
+    platform: linux/arm64
+    image: neondatabase/compute-node-v15:${IMAGE_TAG:-local}-arm64
+
+  compute-node-v16: &compute-node-v16-base
+    image: neondatabase/compute-node-v16:${IMAGE_TAG:-local}
+    build:
+      context: .
+      dockerfile: compute/compute-node.Dockerfile
+      args:
+        <<: *build-args-bullseye
+        PG_VERSION: v16
+      cache_from:
+        - neondatabase/compute-node-v16:${CACHE_FROM_TAG:-latest}
+
+  compute-node-v16-amd64:
+    <<: *compute-node-v16-base
+    platform: linux/amd64
+    image: neondatabase/compute-node-v16:${IMAGE_TAG:-local}-amd64
+
+  compute-node-v16-arm64:
+    <<: *compute-node-v16-base
+    platform: linux/arm64
+    image: neondatabase/compute-node-v16:${IMAGE_TAG:-local}-arm64
+
+  compute-node-v17: &compute-node-v17-base
+    image: neondatabase/compute-node-v17:${IMAGE_TAG:-local}
+    build:
+      context: .
+      dockerfile: compute/compute-node.Dockerfile
+      args:
+        <<: *build-args-bookworm
+        PG_VERSION: v17
+      cache_from:
+        - neondatabase/compute-node-v17:${CACHE_FROM_TAG:-latest}
+
+  compute-node-v17-amd64:
+    <<: *compute-node-v17-base
+    platform: linux/amd64
+    image: neondatabase/compute-node-v17:${IMAGE_TAG:-local}-amd64
+
+  compute-node-v17-arm64:
+    <<: *compute-node-v17-base
+    platform: linux/arm64
+    image: neondatabase/compute-node-v17:${IMAGE_TAG:-local}-arm64
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -7,11 +7,15 @@ Currently we build two main images:
 - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
 - [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile).

+And an additional intermediate image:
+
+- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
+
 ## Build pipeline

 We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs

-1. `neondatabase/compute-node-v17` (and -16, -v15, -v14)
+1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)

 2. `neondatabase/neon`

--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -15,17 +15,6 @@ pub struct GenericAPIError {
    pub error: String,
 }

-#[derive(Debug, Clone, Serialize)]
-pub struct InfoResponse {
-    pub num_cpus: usize,
-}
-
-#[derive(Debug, Clone, Serialize)]
-pub struct ExtensionInstallResponse {
-    pub extension: PgIdent,
-    pub version: ExtVersion,
-}
-
 /// Response of the /status API
 #[derive(Serialize, Debug, Deserialize)]
 #[serde(rename_all = "snake_case")]
@@ -39,6 +28,16 @@ pub struct ComputeStatusResponse {
    pub error: Option<String>,
 }

+#[derive(Deserialize, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub struct ComputeState {
+    pub status: ComputeStatus,
+    /// Timestamp of the last Postgres activity
+    #[serde(serialize_with = "rfc3339_serialize")]
+    pub last_active: Option<DateTime<Utc>>,
+    pub error: Option<String>,
+}
+
 #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
@@ -79,7 +78,7 @@ impl Display for ComputeStatus {
    }
 }

-pub fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
+fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
 where
    S: Serializer,
 {
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -67,15 +67,6 @@ pub struct ComputeSpec {
    #[serde(default)]
    pub disk_quota_bytes: Option<u64>,

-    /// Disables the vm-monitor behavior that resizes LFC on upscale/downscale, instead relying on
-    /// the initial size of LFC.
-    ///
-    /// This is intended for use when the LFC size is being overridden from the default but
-    /// autoscaling is still enabled, and we don't want the vm-monitor to interfere with the custom
-    /// LFC sizing.
-    #[serde(default)]
-    pub disable_lfc_resizing: Option<bool>,
-
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -87,7 +87,7 @@ impl Display for AvailabilityZone {
 #[derive(Serialize, Deserialize)]
 pub struct ShardsPreferredAzsRequest {
    #[serde(flatten)]
-    pub preferred_az_ids: HashMap<TenantShardId, Option<AvailabilityZone>>,
+    pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
 }

 #[derive(Serialize, Deserialize)]
@@ -144,8 +144,6 @@ pub struct NodeDescribeResponse {
    pub availability: NodeAvailabilityWrapper,
    pub scheduling: NodeSchedulingPolicy,

-    pub availability_zone_id: String,
-
    pub listen_http_addr: String,
    pub listen_http_port: u16,

@@ -181,6 +179,7 @@ pub struct TenantDescribeResponseShard {
 /// specifies some constraints, e.g. asking it to get off particular node(s)
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateRequest {
+    pub tenant_shard_id: TenantShardId,
    pub node_id: NodeId,
 }

@@ -321,38 +320,6 @@ impl From<NodeSchedulingPolicy> for String {
    }
 }

-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
-pub enum SkSchedulingPolicy {
-    Active,
-    Disabled,
-    Decomissioned,
-}
-
-impl FromStr for SkSchedulingPolicy {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        Ok(match s {
-            "active" => Self::Active,
-            "disabled" => Self::Disabled,
-            "decomissioned" => Self::Decomissioned,
-            _ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
-        })
-    }
-}
-
-impl From<SkSchedulingPolicy> for String {
-    fn from(value: SkSchedulingPolicy) -> String {
-        use SkSchedulingPolicy::*;
-        match value {
-            Active => "active",
-            Disabled => "disabled",
-            Decomissioned => "decomissioned",
-        }
-        .to_string()
-    }
-}
-
 /// Controls how tenant shards are mapped to locations on pageservers, e.g. whether
 /// to create secondary locations.
 #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
@@ -369,16 +336,6 @@ pub enum PlacementPolicy {
    Detached,
 }

-impl PlacementPolicy {
-    pub fn want_secondaries(&self) -> usize {
-        match self {
-            PlacementPolicy::Attached(secondary_count) => *secondary_count,
-            PlacementPolicy::Secondary => 1,
-            PlacementPolicy::Detached => 0,
-        }
-    }
-}
-
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateResponse {}

@@ -430,7 +387,6 @@ pub struct SafekeeperDescribeResponse {
    pub port: i32,
    pub http_port: i32,
    pub availability_zone_id: String,
-    pub scheduling_policy: SkSchedulingPolicy,
 }

 #[cfg(test)]
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -706,7 +706,7 @@ pub fn repl_origin_key_range() -> Range<Key> {
 /// Non inherited range for vectored get.
 pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
 /// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
-pub const SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
+pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();

 impl Key {
    // AUX_FILES currently stores only data for logical replication (slots etc), and
@@ -714,42 +714,7 @@ impl Key {
    // switch (and generally it likely should be optional), so ignore these.
    #[inline(always)]
    pub fn is_inherited_key(self) -> bool {
-        if self.is_sparse() {
-            self.is_inherited_sparse_key()
-        } else {
-            !NON_INHERITED_RANGE.contains(&self)
-        }
-    }
-
-    #[inline(always)]
-    pub fn is_sparse(self) -> bool {
-        self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
-    }
-
-    /// Check if the key belongs to the inherited keyspace.
-    fn is_inherited_sparse_key(self) -> bool {
-        debug_assert!(self.is_sparse());
-        self.field1 == RELATION_SIZE_PREFIX
-    }
-
-    pub fn sparse_non_inherited_keyspace() -> Range<Key> {
-        // The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace
-        debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
-        Key {
-            field1: AUX_KEY_PREFIX,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }..Key {
-            field1: REPL_ORIGIN_KEY_PREFIX + 1,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }
+        !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
    }

    #[inline(always)]
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -272,8 +272,6 @@ pub struct CompactInfoResponse {
    pub compact_key_range: Option<CompactKeyRange>,
    pub compact_lsn_range: Option<CompactLsnRange>,
    pub sub_compaction: bool,
-    pub running: bool,
-    pub job_id: usize,
 }

 #[derive(Serialize, Deserialize, Clone)]
@@ -1400,8 +1398,6 @@ pub enum PagestreamFeMessage {
    GetPage(PagestreamGetPageRequest),
    DbSize(PagestreamDbSizeRequest),
    GetSlruSegment(PagestreamGetSlruSegmentRequest),
-    #[cfg(feature = "testing")]
-    Test(PagestreamTestRequest),
 }

 // Wrapped in libpq CopyData
@@ -1413,8 +1409,6 @@ pub enum PagestreamBeMessage {
    Error(PagestreamErrorResponse),
    DbSize(PagestreamDbSizeResponse),
    GetSlruSegment(PagestreamGetSlruSegmentResponse),
-    #[cfg(feature = "testing")]
-    Test(PagestreamTestResponse),
 }

 // Keep in sync with `pagestore_client.h`
@@ -1426,9 +1420,6 @@ enum PagestreamBeMessageTag {
    Error = 103,
    DbSize = 104,
    GetSlruSegment = 105,
-    /// Test message discrimimant is unstable
-    #[cfg(feature = "testing")]
-    Test = 106,
 }
 impl TryFrom<u8> for PagestreamBeMessageTag {
    type Error = u8;
@@ -1440,8 +1431,6 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
            103 => Ok(PagestreamBeMessageTag::Error),
            104 => Ok(PagestreamBeMessageTag::DbSize),
            105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
-            #[cfg(feature = "testing")]
-            106 => Ok(PagestreamBeMessageTag::Test),
            _ => Err(value),
        }
    }
@@ -1471,108 +1460,78 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
 // interface allows sending both LSNs, and let the pageserver do the right thing. There was no
 // difference in the responses between V1 and V2.
 //
-// V3 version of protocol adds request ID to all requests. This request ID is also included in response
-// as well as other fields from requests, which allows to verify that we receive response for our request.
-// We copy fields from request to response to make checking more reliable: request ID is formed from process ID
-// and local counter, so in principle there can be duplicated requests IDs if process PID is reused.
-//
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+#[derive(Clone, Copy)]
 pub enum PagestreamProtocolVersion {
    V2,
-    V3,
 }

-pub type RequestId = u64;
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamRequest {
-    pub reqid: RequestId,
+#[derive(Debug, PartialEq, Eq)]
+pub struct PagestreamExistsRequest {
    pub request_lsn: Lsn,
    pub not_modified_since: Lsn,
-}
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamExistsRequest {
-    pub hdr: PagestreamRequest,
    pub rel: RelTag,
 }

-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamNblocksRequest {
-    pub hdr: PagestreamRequest,
+    pub request_lsn: Lsn,
+    pub not_modified_since: Lsn,
    pub rel: RelTag,
 }

-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetPageRequest {
-    pub hdr: PagestreamRequest,
+    pub request_lsn: Lsn,
+    pub not_modified_since: Lsn,
    pub rel: RelTag,
    pub blkno: u32,
 }

-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamDbSizeRequest {
-    pub hdr: PagestreamRequest,
+    pub request_lsn: Lsn,
+    pub not_modified_since: Lsn,
    pub dbnode: u32,
 }

-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetSlruSegmentRequest {
-    pub hdr: PagestreamRequest,
+    pub request_lsn: Lsn,
+    pub not_modified_since: Lsn,
    pub kind: u8,
    pub segno: u32,
 }

 #[derive(Debug)]
 pub struct PagestreamExistsResponse {
-    pub req: PagestreamExistsRequest,
    pub exists: bool,
 }

 #[derive(Debug)]
 pub struct PagestreamNblocksResponse {
-    pub req: PagestreamNblocksRequest,
    pub n_blocks: u32,
 }

 #[derive(Debug)]
 pub struct PagestreamGetPageResponse {
-    pub req: PagestreamGetPageRequest,
    pub page: Bytes,
 }

 #[derive(Debug)]
 pub struct PagestreamGetSlruSegmentResponse {
-    pub req: PagestreamGetSlruSegmentRequest,
    pub segment: Bytes,
 }

 #[derive(Debug)]
 pub struct PagestreamErrorResponse {
-    pub req: PagestreamRequest,
    pub message: String,
 }

 #[derive(Debug)]
 pub struct PagestreamDbSizeResponse {
-    pub req: PagestreamDbSizeRequest,
    pub db_size: i64,
 }

-#[cfg(feature = "testing")]
-#[derive(Debug, PartialEq, Eq, Clone)]
-pub struct PagestreamTestRequest {
-    pub hdr: PagestreamRequest,
-    pub batch_key: u64,
-    pub message: String,
-}
-
-#[cfg(feature = "testing")]
-#[derive(Debug)]
-pub struct PagestreamTestResponse {
-    pub req: PagestreamTestRequest,
-}
-
 // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
 // that require pageserver-internal types.  It is sufficient to get the total size.
 #[derive(Serialize, Deserialize, Debug)]
@@ -1586,16 +1545,15 @@ pub struct TenantHistorySize {

 impl PagestreamFeMessage {
    /// Serialize a compute -> pageserver message. This is currently only used in testing
-    /// tools. Always uses protocol version 3.
+    /// tools. Always uses protocol version 2.
    pub fn serialize(&self) -> Bytes {
        let mut bytes = BytesMut::new();

        match self {
            Self::Exists(req) => {
                bytes.put_u8(0);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u64(req.request_lsn.0);
+                bytes.put_u64(req.not_modified_since.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -1604,9 +1562,8 @@ impl PagestreamFeMessage {

            Self::Nblocks(req) => {
                bytes.put_u8(1);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u64(req.request_lsn.0);
+                bytes.put_u64(req.not_modified_since.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -1615,9 +1572,8 @@ impl PagestreamFeMessage {

            Self::GetPage(req) => {
                bytes.put_u8(2);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u64(req.request_lsn.0);
+                bytes.put_u64(req.not_modified_since.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -1627,65 +1583,38 @@ impl PagestreamFeMessage {

            Self::DbSize(req) => {
                bytes.put_u8(3);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u64(req.request_lsn.0);
+                bytes.put_u64(req.not_modified_since.0);
                bytes.put_u32(req.dbnode);
            }

            Self::GetSlruSegment(req) => {
                bytes.put_u8(4);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u64(req.request_lsn.0);
+                bytes.put_u64(req.not_modified_since.0);
                bytes.put_u8(req.kind);
                bytes.put_u32(req.segno);
            }
-            #[cfg(feature = "testing")]
-            Self::Test(req) => {
-                bytes.put_u8(5);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u64(req.batch_key);
-                let message = req.message.as_bytes();
-                bytes.put_u64(message.len() as u64);
-                bytes.put_slice(message);
-            }
        }

        bytes.into()
    }

-    pub fn parse<R: std::io::Read>(
-        body: &mut R,
-        protocol_version: PagestreamProtocolVersion,
-    ) -> anyhow::Result<PagestreamFeMessage> {
+    pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
        // these correspond to the NeonMessageTag enum in pagestore_client.h
        //
        // TODO: consider using protobuf or serde bincode for less error prone
        // serialization.
        let msg_tag = body.read_u8()?;
-        let (reqid, request_lsn, not_modified_since) = match protocol_version {
-            PagestreamProtocolVersion::V2 => (
-                0,
-                Lsn::from(body.read_u64::<BigEndian>()?),
-                Lsn::from(body.read_u64::<BigEndian>()?),
-            ),
-            PagestreamProtocolVersion::V3 => (
-                body.read_u64::<BigEndian>()?,
-                Lsn::from(body.read_u64::<BigEndian>()?),
-                Lsn::from(body.read_u64::<BigEndian>()?),
-            ),
-        };
+
+        // these two fields are the same for every request type
+        let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
+        let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);

        match msg_tag {
            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                hdr: PagestreamRequest {
-                    reqid,
-                    request_lsn,
-                    not_modified_since,
-                },
+                request_lsn,
+                not_modified_since,
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1694,11 +1623,8 @@ impl PagestreamFeMessage {
                },
            })),
            1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                hdr: PagestreamRequest {
-                    reqid,
-                    request_lsn,
-                    not_modified_since,
-                },
+                request_lsn,
+                not_modified_since,
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1707,11 +1633,8 @@ impl PagestreamFeMessage {
                },
            })),
            2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                hdr: PagestreamRequest {
-                    reqid,
-                    request_lsn,
-                    not_modified_since,
-                },
+                request_lsn,
+                not_modified_since,
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1721,174 +1644,61 @@ impl PagestreamFeMessage {
                blkno: body.read_u32::<BigEndian>()?,
            })),
            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                hdr: PagestreamRequest {
-                    reqid,
-                    request_lsn,
-                    not_modified_since,
-                },
+                request_lsn,
+                not_modified_since,
                dbnode: body.read_u32::<BigEndian>()?,
            })),
            4 => Ok(PagestreamFeMessage::GetSlruSegment(
                PagestreamGetSlruSegmentRequest {
-                    hdr: PagestreamRequest {
-                        reqid,
-                        request_lsn,
-                        not_modified_since,
-                    },
+                    request_lsn,
+                    not_modified_since,
                    kind: body.read_u8()?,
                    segno: body.read_u32::<BigEndian>()?,
                },
            )),
-            #[cfg(feature = "testing")]
-            5 => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
-                hdr: PagestreamRequest {
-                    reqid,
-                    request_lsn,
-                    not_modified_since,
-                },
-                batch_key: body.read_u64::<BigEndian>()?,
-                message: {
-                    let len = body.read_u64::<BigEndian>()?;
-                    let mut buf = vec![0; len as usize];
-                    body.read_exact(&mut buf)?;
-                    String::from_utf8(buf)?
-                },
-            })),
            _ => bail!("unknown smgr message tag: {:?}", msg_tag),
        }
    }
 }

 impl PagestreamBeMessage {
-    pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes {
+    pub fn serialize(&self) -> Bytes {
        let mut bytes = BytesMut::new();

        use PagestreamBeMessageTag as Tag;
-        match protocol_version {
-            PagestreamProtocolVersion::V2 => {
-                match self {
-                    Self::Exists(resp) => {
-                        bytes.put_u8(Tag::Exists as u8);
-                        bytes.put_u8(resp.exists as u8);
-                    }
-
-                    Self::Nblocks(resp) => {
-                        bytes.put_u8(Tag::Nblocks as u8);
-                        bytes.put_u32(resp.n_blocks);
-                    }
-
-                    Self::GetPage(resp) => {
-                        bytes.put_u8(Tag::GetPage as u8);
-                        bytes.put(&resp.page[..])
-                    }
-
-                    Self::Error(resp) => {
-                        bytes.put_u8(Tag::Error as u8);
-                        bytes.put(resp.message.as_bytes());
-                        bytes.put_u8(0); // null terminator
-                    }
-                    Self::DbSize(resp) => {
-                        bytes.put_u8(Tag::DbSize as u8);
-                        bytes.put_i64(resp.db_size);
-                    }
-
-                    Self::GetSlruSegment(resp) => {
-                        bytes.put_u8(Tag::GetSlruSegment as u8);
-                        bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
-                        bytes.put(&resp.segment[..]);
-                    }
-
-                    #[cfg(feature = "testing")]
-                    Self::Test(resp) => {
-                        bytes.put_u8(Tag::Test as u8);
-                        bytes.put_u64(resp.req.batch_key);
-                        let message = resp.req.message.as_bytes();
-                        bytes.put_u64(message.len() as u64);
-                        bytes.put_slice(message);
-                    }
-                }
+        match self {
+            Self::Exists(resp) => {
+                bytes.put_u8(Tag::Exists as u8);
+                bytes.put_u8(resp.exists as u8);
            }
-            PagestreamProtocolVersion::V3 => {
-                match self {
-                    Self::Exists(resp) => {
-                        bytes.put_u8(Tag::Exists as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.rel.spcnode);
-                        bytes.put_u32(resp.req.rel.dbnode);
-                        bytes.put_u32(resp.req.rel.relnode);
-                        bytes.put_u8(resp.req.rel.forknum);
-                        bytes.put_u8(resp.exists as u8);
-                    }

-                    Self::Nblocks(resp) => {
-                        bytes.put_u8(Tag::Nblocks as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.rel.spcnode);
-                        bytes.put_u32(resp.req.rel.dbnode);
-                        bytes.put_u32(resp.req.rel.relnode);
-                        bytes.put_u8(resp.req.rel.forknum);
-                        bytes.put_u32(resp.n_blocks);
-                    }
+            Self::Nblocks(resp) => {
+                bytes.put_u8(Tag::Nblocks as u8);
+                bytes.put_u32(resp.n_blocks);
+            }

-                    Self::GetPage(resp) => {
-                        bytes.put_u8(Tag::GetPage as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.rel.spcnode);
-                        bytes.put_u32(resp.req.rel.dbnode);
-                        bytes.put_u32(resp.req.rel.relnode);
-                        bytes.put_u8(resp.req.rel.forknum);
-                        bytes.put_u32(resp.req.blkno);
-                        bytes.put(&resp.page[..])
-                    }
+            Self::GetPage(resp) => {
+                bytes.put_u8(Tag::GetPage as u8);
+                bytes.put(&resp.page[..]);
+            }

-                    Self::Error(resp) => {
-                        bytes.put_u8(Tag::Error as u8);
-                        bytes.put_u64(resp.req.reqid);
-                        bytes.put_u64(resp.req.request_lsn.0);
-                        bytes.put_u64(resp.req.not_modified_since.0);
-                        bytes.put(resp.message.as_bytes());
-                        bytes.put_u8(0); // null terminator
-                    }
-                    Self::DbSize(resp) => {
-                        bytes.put_u8(Tag::DbSize as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.dbnode);
-                        bytes.put_i64(resp.db_size);
-                    }
+            Self::Error(resp) => {
+                bytes.put_u8(Tag::Error as u8);
+                bytes.put(resp.message.as_bytes());
+                bytes.put_u8(0); // null terminator
+            }
+            Self::DbSize(resp) => {
+                bytes.put_u8(Tag::DbSize as u8);
+                bytes.put_i64(resp.db_size);
+            }

-                    Self::GetSlruSegment(resp) => {
-                        bytes.put_u8(Tag::GetSlruSegment as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u8(resp.req.kind);
-                        bytes.put_u32(resp.req.segno);
-                        bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
-                        bytes.put(&resp.segment[..]);
-                    }
-
-                    #[cfg(feature = "testing")]
-                    Self::Test(resp) => {
-                        bytes.put_u8(Tag::Test as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u64(resp.req.batch_key);
-                        let message = resp.req.message.as_bytes();
-                        bytes.put_u64(message.len() as u64);
-                        bytes.put_slice(message);
-                    }
-                }
+            Self::GetSlruSegment(resp) => {
+                bytes.put_u8(Tag::GetSlruSegment as u8);
+                bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
+                bytes.put(&resp.segment[..]);
            }
        }
+
        bytes.into()
    }

@@ -1900,156 +1710,41 @@ impl PagestreamBeMessage {
        let ok =
            match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
                Tag::Exists => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let rel = RelTag {
-                        spcnode: buf.read_u32::<BigEndian>()?,
-                        dbnode: buf.read_u32::<BigEndian>()?,
-                        relnode: buf.read_u32::<BigEndian>()?,
-                        forknum: buf.read_u8()?,
-                    };
-                    let exists = buf.read_u8()? != 0;
+                    let exists = buf.read_u8()?;
                    Self::Exists(PagestreamExistsResponse {
-                        req: PagestreamExistsRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            rel,
-                        },
-                        exists,
+                        exists: exists != 0,
                    })
                }
                Tag::Nblocks => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let rel = RelTag {
-                        spcnode: buf.read_u32::<BigEndian>()?,
-                        dbnode: buf.read_u32::<BigEndian>()?,
-                        relnode: buf.read_u32::<BigEndian>()?,
-                        forknum: buf.read_u8()?,
-                    };
                    let n_blocks = buf.read_u32::<BigEndian>()?;
-                    Self::Nblocks(PagestreamNblocksResponse {
-                        req: PagestreamNblocksRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            rel,
-                        },
-                        n_blocks,
-                    })
+                    Self::Nblocks(PagestreamNblocksResponse { n_blocks })
                }
                Tag::GetPage => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let rel = RelTag {
-                        spcnode: buf.read_u32::<BigEndian>()?,
-                        dbnode: buf.read_u32::<BigEndian>()?,
-                        relnode: buf.read_u32::<BigEndian>()?,
-                        forknum: buf.read_u8()?,
-                    };
-                    let blkno = buf.read_u32::<BigEndian>()?;
                    let mut page = vec![0; 8192]; // TODO: use MaybeUninit
                    buf.read_exact(&mut page)?;
-                    Self::GetPage(PagestreamGetPageResponse {
-                        req: PagestreamGetPageRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            rel,
-                            blkno,
-                        },
-                        page: page.into(),
-                    })
+                    PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
                }
                Tag::Error => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
                    let mut msg = Vec::new();
                    buf.read_until(0, &mut msg)?;
                    let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
                    let rust_str = cstring.to_str()?;
-                    Self::Error(PagestreamErrorResponse {
-                        req: PagestreamRequest {
-                            reqid,
-                            request_lsn,
-                            not_modified_since,
-                        },
+                    PagestreamBeMessage::Error(PagestreamErrorResponse {
                        message: rust_str.to_owned(),
                    })
                }
                Tag::DbSize => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let dbnode = buf.read_u32::<BigEndian>()?;
                    let db_size = buf.read_i64::<BigEndian>()?;
-                    Self::DbSize(PagestreamDbSizeResponse {
-                        req: PagestreamDbSizeRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            dbnode,
-                        },
-                        db_size,
-                    })
+                    Self::DbSize(PagestreamDbSizeResponse { db_size })
                }
                Tag::GetSlruSegment => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let kind = buf.read_u8()?;
-                    let segno = buf.read_u32::<BigEndian>()?;
                    let n_blocks = buf.read_u32::<BigEndian>()?;
                    let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
                    buf.read_exact(&mut segment)?;
                    Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
-                        req: PagestreamGetSlruSegmentRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            kind,
-                            segno,
-                        },
                        segment: segment.into(),
                    })
                }
-                #[cfg(feature = "testing")]
-                Tag::Test => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let batch_key = buf.read_u64::<BigEndian>()?;
-                    let len = buf.read_u64::<BigEndian>()?;
-                    let mut msg = vec![0; len as usize];
-                    buf.read_exact(&mut msg)?;
-                    let message = String::from_utf8(msg)?;
-                    Self::Test(PagestreamTestResponse {
-                        req: PagestreamTestRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            batch_key,
-                            message,
-                        },
-                    })
-                }
            };
        let remaining = buf.into_inner();
        if !remaining.is_empty() {
@@ -2069,8 +1764,6 @@ impl PagestreamBeMessage {
            Self::Error(_) => "Error",
            Self::DbSize(_) => "DbSize",
            Self::GetSlruSegment(_) => "GetSlruSegment",
-            #[cfg(feature = "testing")]
-            Self::Test(_) => "Test",
        }
    }
 }
@@ -2087,11 +1780,8 @@ mod tests {
        // Test serialization/deserialization of PagestreamFeMessage
        let messages = vec![
            PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(3),
-                },
+                request_lsn: Lsn(4),
+                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -2100,11 +1790,8 @@ mod tests {
                },
            }),
            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(4),
-                },
+                request_lsn: Lsn(4),
+                not_modified_since: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -2113,11 +1800,8 @@ mod tests {
                },
            }),
            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(3),
-                },
+                request_lsn: Lsn(4),
+                not_modified_since: Lsn(3),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -2127,19 +1811,14 @@ mod tests {
                blkno: 7,
            }),
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(3),
-                },
+                request_lsn: Lsn(4),
+                not_modified_since: Lsn(3),
                dbnode: 7,
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
-            let reconstructed =
-                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3)
-                    .unwrap();
+            let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
            assert!(msg == reconstructed);
        }
    }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32);

 impl ProtocolVersion {
    pub const fn new(major: u16, minor: u16) -> Self {
-        Self(((major as u32) << 16) | minor as u32)
+        Self(((major as u32) << 16) | minor as u32)
    }
    pub const fn minor(self) -> u16 {
        self.0 as u16
--- a/libs/proxy/tokio-postgres2/src/connection.rs
+++ b/libs/proxy/tokio-postgres2/src/connection.rs
@@ -33,12 +33,8 @@ pub struct Response {
 #[derive(PartialEq, Debug)]
 enum State {
    Active,
-    Closing,
-}
-
-enum WriteReady {
    Terminating,
-    WaitingOnRead,
+    Closing,
 }

 /// A connection to a PostgreSQL database.
@@ -55,6 +51,7 @@ pub struct Connection<S, T> {
    /// HACK: we need this in the Neon Proxy to forward params.
    pub parameters: HashMap<String, String>,
    receiver: mpsc::UnboundedReceiver<Request>,
+    pending_request: Option<RequestMessages>,
    pending_responses: VecDeque<BackendMessage>,
    responses: VecDeque<Response>,
    state: State,
@@ -75,6 +72,7 @@ where
            stream,
            parameters,
            receiver,
+            pending_request: None,
            pending_responses,
            responses: VecDeque::new(),
            state: State::Active,
@@ -95,23 +93,26 @@ where
            .map(|o| o.map(|r| r.map_err(Error::io)))
    }

-    /// Read and process messages from the connection to postgres.
-    /// client <- postgres
-    fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll<Result<AsyncMessage, Error>> {
+    fn poll_read(&mut self, cx: &mut Context<'_>) -> Result<Option<AsyncMessage>, Error> {
+        if self.state != State::Active {
+            trace!("poll_read: done");
+            return Ok(None);
+        }
+
        loop {
            let message = match self.poll_response(cx)? {
                Poll::Ready(Some(message)) => message,
-                Poll::Ready(None) => return Poll::Ready(Err(Error::closed())),
+                Poll::Ready(None) => return Err(Error::closed()),
                Poll::Pending => {
                    trace!("poll_read: waiting on response");
-                    return Poll::Pending;
+                    return Ok(None);
                }
            };

            let (mut messages, request_complete) = match message {
                BackendMessage::Async(Message::NoticeResponse(body)) => {
                    let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?;
-                    return Poll::Ready(Ok(AsyncMessage::Notice(error)));
+                    return Ok(Some(AsyncMessage::Notice(error)));
                }
                BackendMessage::Async(Message::NotificationResponse(body)) => {
                    let notification = Notification {
@@ -119,7 +120,7 @@ where
                        channel: body.channel().map_err(Error::parse)?.to_string(),
                        payload: body.message().map_err(Error::parse)?.to_string(),
                    };
-                    return Poll::Ready(Ok(AsyncMessage::Notification(notification)));
+                    return Ok(Some(AsyncMessage::Notification(notification)));
                }
                BackendMessage::Async(Message::ParameterStatus(body)) => {
                    self.parameters.insert(
@@ -138,10 +139,8 @@ where
            let mut response = match self.responses.pop_front() {
                Some(response) => response,
                None => match messages.next().map_err(Error::parse)? {
-                    Some(Message::ErrorResponse(error)) => {
-                        return Poll::Ready(Err(Error::db(error)))
-                    }
-                    _ => return Poll::Ready(Err(Error::unexpected_message())),
+                    Some(Message::ErrorResponse(error)) => return Err(Error::db(error)),
+                    _ => return Err(Error::unexpected_message()),
                },
            };

@@ -165,14 +164,18 @@ where
                        request_complete,
                    });
                    trace!("poll_read: waiting on sender");
-                    return Poll::Pending;
+                    return Ok(None);
                }
            }
        }
    }

-    /// Fetch the next client request and enqueue the response sender.
    fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll<Option<RequestMessages>> {
+        if let Some(messages) = self.pending_request.take() {
+            trace!("retrying pending request");
+            return Poll::Ready(Some(messages));
+        }
+
        if self.receiver.is_closed() {
            return Poll::Ready(None);
        }
@@ -190,80 +193,74 @@ where
        }
    }

-    /// Process client requests and write them to the postgres connection, flushing if necessary.
-    /// client -> postgres
-    fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll<Result<WriteReady, Error>> {
+    fn poll_write(&mut self, cx: &mut Context<'_>) -> Result<bool, Error> {
        loop {
+            if self.state == State::Closing {
+                trace!("poll_write: done");
+                return Ok(false);
+            }
+
            if Pin::new(&mut self.stream)
                .poll_ready(cx)
                .map_err(Error::io)?
                .is_pending()
            {
                trace!("poll_write: waiting on socket");
-
-                // poll_ready is self-flushing.
-                return Poll::Pending;
+                return Ok(false);
            }

-            match self.poll_request(cx) {
-                // send the message to postgres
-                Poll::Ready(Some(RequestMessages::Single(request))) => {
-                    Pin::new(&mut self.stream)
-                        .start_send(request)
-                        .map_err(Error::io)?;
-                }
-                // No more messages from the client, and no more responses to wait for.
-                // Send a terminate message to postgres
-                Poll::Ready(None) if self.responses.is_empty() => {
+            let request = match self.poll_request(cx) {
+                Poll::Ready(Some(request)) => request,
+                Poll::Ready(None) if self.responses.is_empty() && self.state == State::Active => {
                    trace!("poll_write: at eof, terminating");
+                    self.state = State::Terminating;
                    let mut request = BytesMut::new();
                    frontend::terminate(&mut request);
-                    let request = FrontendMessage::Raw(request.freeze());
-
-                    Pin::new(&mut self.stream)
-                        .start_send(request)
-                        .map_err(Error::io)?;
-
-                    trace!("poll_write: sent eof, closing");
-                    trace!("poll_write: done");
-                    return Poll::Ready(Ok(WriteReady::Terminating));
+                    RequestMessages::Single(FrontendMessage::Raw(request.freeze()))
                }
-                // No more messages from the client, but there are still some responses to wait for.
                Poll::Ready(None) => {
                    trace!(
                        "poll_write: at eof, pending responses {}",
                        self.responses.len()
                    );
-                    ready!(self.poll_flush(cx))?;
-                    return Poll::Ready(Ok(WriteReady::WaitingOnRead));
+                    return Ok(true);
                }
-                // Still waiting for a message from the client.
                Poll::Pending => {
                    trace!("poll_write: waiting on request");
-                    ready!(self.poll_flush(cx))?;
-                    return Poll::Pending;
+                    return Ok(true);
+                }
+            };
+
+            match request {
+                RequestMessages::Single(request) => {
+                    Pin::new(&mut self.stream)
+                        .start_send(request)
+                        .map_err(Error::io)?;
+                    if self.state == State::Terminating {
+                        trace!("poll_write: sent eof, closing");
+                        self.state = State::Closing;
+                    }
                }
            }
        }
    }

-    fn poll_flush(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
+    fn poll_flush(&mut self, cx: &mut Context<'_>) -> Result<(), Error> {
        match Pin::new(&mut self.stream)
            .poll_flush(cx)
            .map_err(Error::io)?
        {
-            Poll::Ready(()) => {
-                trace!("poll_flush: flushed");
-                Poll::Ready(Ok(()))
-            }
-            Poll::Pending => {
-                trace!("poll_flush: waiting on socket");
-                Poll::Pending
-            }
+            Poll::Ready(()) => trace!("poll_flush: flushed"),
+            Poll::Pending => trace!("poll_flush: waiting on socket"),
        }
+        Ok(())
    }

    fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
+        if self.state != State::Closing {
+            return Poll::Pending;
+        }
+
        match Pin::new(&mut self.stream)
            .poll_close(cx)
            .map_err(Error::io)?
@@ -292,30 +289,18 @@ where
        &mut self,
        cx: &mut Context<'_>,
    ) -> Poll<Option<Result<AsyncMessage, Error>>> {
-        if self.state != State::Closing {
-            // if the state is still active, try read from and write to postgres.
-            let message = self.poll_read(cx)?;
-            let closing = self.poll_write(cx)?;
-            if let Poll::Ready(WriteReady::Terminating) = closing {
-                self.state = State::Closing;
-            }
-
-            if let Poll::Ready(message) = message {
-                return Poll::Ready(Some(Ok(message)));
-            }
-
-            // poll_read returned Pending.
-            // poll_write returned Pending or Ready(WriteReady::WaitingOnRead).
-            // if poll_write returned Ready(WriteReady::WaitingOnRead), then we are waiting to read more data from postgres.
-            if self.state != State::Closing {
-                return Poll::Pending;
-            }
+        let message = self.poll_read(cx)?;
+        let want_flush = self.poll_write(cx)?;
+        if want_flush {
+            self.poll_flush(cx)?;
        }
-
-        match self.poll_shutdown(cx) {
-            Poll::Ready(Ok(())) => Poll::Ready(None),
-            Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))),
-            Poll::Pending => Poll::Pending,
+        match message {
+            Some(message) => Poll::Ready(Some(Ok(message))),
+            None => match self.poll_shutdown(cx) {
+                Poll::Ready(Ok(())) => Poll::Ready(None),
+                Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))),
+                Poll::Pending => Poll::Pending,
+            },
        }
    }
 }
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -43,17 +43,6 @@ impl RemoteStorageKind {
    }
 }

-impl RemoteStorageConfig {
-    /// Helper to fetch the configured concurrency limit.
-    pub fn concurrency_limit(&self) -> Option<usize> {
-        match &self.storage {
-            RemoteStorageKind::LocalFs { .. } => None,
-            RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
-            RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
-        }
-    }
-}
-
 fn default_timeout() -> Duration {
    RemoteStorageConfig::DEFAULT_TIMEOUT
 }
@@ -126,15 +115,13 @@ fn default_max_keys_per_list_response() -> Option<i32> {
 }

 fn default_azure_conn_pool_size() -> usize {
-    // By default, the Azure SDK does no connection pooling, due to historic reports of hard-to-reproduce issues
+    // Conservative default: a pool size of 0, i.e. no connection pooling. At the time of writing
+    // this is the Azure SDK's default as well, due to historic reports of hard-to-reproduce issues
    // (https://github.com/hyperium/hyper/issues/2312)
    //
    // However, using connection pooling is important to avoid exhausting client ports when
    // doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971)
-    //
-    // We therefore enable a modest pool size by default: this may be configured to zero if
-    // issues like the alleged upstream hyper issue appear.
-    8
+    0
 }

 impl Debug for S3Config {
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -38,6 +38,7 @@ pub mod http;

 use opentelemetry::trace::TracerProvider;
 use opentelemetry::KeyValue;
+use opentelemetry_sdk::Resource;
 use tracing::Subscriber;
 use tracing_subscriber::registry::LookupSpan;
 use tracing_subscriber::Layer;
@@ -120,10 +121,7 @@ where
    S: Subscriber + for<'span> LookupSpan<'span>,
 {
    // Sets up exporter from the OTEL_EXPORTER_* environment variables.
-    let exporter = opentelemetry_otlp::SpanExporter::builder()
-        .with_http()
-        .build()
-        .expect("could not initialize opentelemetry exporter");
+    let exporter = opentelemetry_otlp::new_exporter().http();

    // TODO: opentelemetry::global::set_error_handler() with custom handler that
    //       bypasses default tracing layers, but logs regular looking log
@@ -134,13 +132,17 @@ where
        opentelemetry_sdk::propagation::TraceContextPropagator::new(),
    );

-    let tracer = opentelemetry_sdk::trace::TracerProvider::builder()
-        .with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio)
-        .with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new(
-            opentelemetry_semantic_conventions::resource::SERVICE_NAME,
-            service_name,
-        )]))
-        .build()
+    let tracer = opentelemetry_otlp::new_pipeline()
+        .tracing()
+        .with_exporter(exporter)
+        .with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource(
+            Resource::new(vec![KeyValue::new(
+                opentelemetry_semantic_conventions::resource::SERVICE_NAME,
+                service_name,
+            )]),
+        ))
+        .install_batch(opentelemetry_sdk::runtime::Tokio)
+        .expect("could not initialize opentelemetry exporter")
        .tracer("global");

    tracing_opentelemetry::layer().with_tracer(tracer)
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -26,7 +26,6 @@ git-version.workspace = true
 hex = { workspace = true, features = ["serde"] }
 humantime.workspace = true
 hyper0 = { workspace = true, features = ["full"] }
-inferno.workspace = true
 itertools.workspace = true
 fail.workspace = true
 futures = { workspace = true }
--- a/libs/utils/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -9,7 +9,7 @@ use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
 use tracing::*;

-/// Declare a failpoint that can use to `pause` failpoint action.
+/// Declare a failpoint that can use the `pause` failpoint action.
 /// We don't want to block the executor thread, hence, spawn_blocking + await.
 #[macro_export]
 macro_rules! pausable_failpoint {
@@ -181,7 +181,7 @@ pub async fn failpoints_handler(
 ) -> Result<Response<Body>, ApiError> {
    if !fail::has_failpoints() {
        return Err(ApiError::BadRequest(anyhow::anyhow!(
-            "Cannot manage failpoints because neon was compiled without failpoints support"
+            "Cannot manage failpoints because storage was compiled without failpoints support"
        )));
    }

--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -112,9 +112,9 @@ impl Serialize for Generation {
            // We should never be asked to serialize a None. Structures
            // that include an optional generation should convert None to an
            // Option<Generation>::None
-            Err(serde::ser::Error::custom(format!(
-                "Tried to serialize invalid generation ({self:?})"
-            )))
+            Err(serde::ser::Error::custom(
+                "Tried to serialize invalid generation ({self})",
+            ))
        }
    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -15,7 +15,7 @@ use once_cell::sync::Lazy;
 use regex::Regex;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
-use tokio::sync::{mpsc, Mutex, Notify};
+use tokio::sync::{mpsc, Mutex};
 use tokio_stream::wrappers::ReceiverStream;
 use tokio_util::io::ReaderStream;
 use tracing::{debug, info, info_span, warn, Instrument};
@@ -350,53 +350,33 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
    };
    let seconds = match parse_query_param(&req, "seconds")? {
        None => 5,
-        Some(seconds @ 1..=60) => seconds,
-        Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))),
+        Some(seconds @ 1..=30) => seconds,
+        Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))),
    };
    let frequency_hz = match parse_query_param(&req, "frequency")? {
        None => 99,
        Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))),
        Some(frequency) => frequency,
    };
-    let force: bool = parse_query_param(&req, "force")?.unwrap_or_default();
+
+    // Only allow one profiler at a time.
+    static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
+    let _lock = PROFILE_LOCK
+        .try_lock()
+        .map_err(|_| ApiError::Conflict("profiler already running".into()))?;

    // Take the profile.
-    static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
-    static PROFILE_CANCEL: Lazy<Notify> = Lazy::new(Notify::new);
-
-    let report = {
-        // Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a
-        // Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting
-        // for a lock(), to avoid races where the notify isn't currently awaited.
-        let _lock = loop {
-            match PROFILE_LOCK.try_lock() {
-                Ok(lock) => break lock,
-                Err(_) if force => PROFILE_CANCEL.notify_waiters(),
-                Err(_) => {
-                    return Err(ApiError::Conflict(
-                        "profiler already running (use ?force=true to cancel it)".into(),
-                    ))
-                }
-            }
-            tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait
-        };
-
+    let report = tokio::task::spawn_blocking(move || {
        let guard = ProfilerGuardBuilder::default()
            .frequency(frequency_hz)
            .blocklist(&["libc", "libgcc", "pthread", "vdso"])
-            .build()
-            .map_err(|err| ApiError::InternalServerError(err.into()))?;
-
-        tokio::select! {
-            _ = tokio::time::sleep(Duration::from_secs(seconds)) => {},
-            _ = PROFILE_CANCEL.notified() => {},
-        };
-
-        guard
-            .report()
-            .build()
-            .map_err(|err| ApiError::InternalServerError(err.into()))?
-    };
+            .build()?;
+        std::thread::sleep(Duration::from_secs(seconds));
+        guard.report().build()
+    })
+    .await
+    .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
+    .map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?;

    // Return the report in the requested format.
    match format {
@@ -437,7 +417,6 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
    enum Format {
        Jemalloc,
        Pprof,
-        Svg,
    }

    // Parameters.
@@ -445,24 +424,9 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
        None => Format::Pprof,
        Some("jemalloc") => Format::Jemalloc,
        Some("pprof") => Format::Pprof,
-        Some("svg") => Format::Svg,
        Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
    };

-    // Functions and mappings to strip when symbolizing pprof profiles. If true,
-    // also remove child frames.
-    static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
-        vec![
-            (Regex::new("^__rust").unwrap(), false),
-            (Regex::new("^_start$").unwrap(), false),
-            (Regex::new("^irallocx_prof").unwrap(), true),
-            (Regex::new("^prof_alloc_prep").unwrap(), true),
-            (Regex::new("^std::rt::lang_start").unwrap(), false),
-            (Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
-        ]
-    });
-    const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
-
    // Obtain profiler handle.
    let mut prof_ctl = jemalloc_pprof::PROF_CTL
        .as_ref()
@@ -500,9 +464,24 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
                // Symbolize the profile.
                // TODO: consider moving this upstream to jemalloc_pprof and avoiding the
                // serialization roundtrip.
+                static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
+                    // Functions to strip from profiles. If true, also remove child frames.
+                    vec![
+                        (Regex::new("^__rust").unwrap(), false),
+                        (Regex::new("^_start$").unwrap(), false),
+                        (Regex::new("^irallocx_prof").unwrap(), true),
+                        (Regex::new("^prof_alloc_prep").unwrap(), true),
+                        (Regex::new("^std::rt::lang_start").unwrap(), false),
+                        (Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
+                    ]
+                });
                let profile = pprof::decode(&bytes)?;
                let profile = pprof::symbolize(profile)?;
-                let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
+                let profile = pprof::strip_locations(
+                    profile,
+                    &["libc", "libgcc", "pthread", "vdso"],
+                    &STRIP_FUNCTIONS,
+                );
                pprof::encode(&profile)
            })
            .await
@@ -515,27 +494,6 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
                .body(Body::from(data))
                .map_err(|err| ApiError::InternalServerError(err.into()))
        }
-
-        Format::Svg => {
-            let body = tokio::task::spawn_blocking(move || {
-                let bytes = prof_ctl.dump_pprof()?;
-                let profile = pprof::decode(&bytes)?;
-                let profile = pprof::symbolize(profile)?;
-                let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
-                let mut opts = inferno::flamegraph::Options::default();
-                opts.title = "Heap inuse".to_string();
-                opts.count_name = "bytes".to_string();
-                pprof::flamegraph(profile, &mut opts)
-            })
-            .await
-            .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
-            .map_err(ApiError::InternalServerError)?;
-            Response::builder()
-                .status(200)
-                .header(CONTENT_TYPE, "image/svg+xml")
-                .body(Body::from(body))
-                .map_err(|err| ApiError::InternalServerError(err.into()))
-        }
    }
 }

--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -260,7 +260,7 @@ impl FromStr for Lsn {
        {
            let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
            let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?;
-            Ok(Lsn(((left_num as u64) << 32) | right_num as u64))
+            Ok(Lsn((left_num as u64) << 32 | right_num as u64))
        } else {
            Err(LsnParseError)
        }
--- a/libs/utils/src/pprof.rs
+++ b/libs/utils/src/pprof.rs
@@ -1,9 +1,8 @@
-use anyhow::bail;
 use flate2::write::{GzDecoder, GzEncoder};
 use flate2::Compression;
 use itertools::Itertools as _;
 use once_cell::sync::Lazy;
-use pprof::protos::{Function, Line, Location, Message as _, Profile};
+use pprof::protos::{Function, Line, Message as _, Profile};
 use regex::Regex;

 use std::borrow::Cow;
@@ -189,59 +188,3 @@ pub fn strip_locations(

    profile
 }
-
-/// Generates an SVG flamegraph from a symbolized pprof profile.
-pub fn flamegraph(
-    profile: Profile,
-    opts: &mut inferno::flamegraph::Options,
-) -> anyhow::Result<Vec<u8>> {
-    if profile.mapping.iter().any(|m| !m.has_functions) {
-        bail!("profile not symbolized");
-    }
-
-    // Index locations, functions, and strings.
-    let locations: HashMap<u64, Location> =
-        profile.location.into_iter().map(|l| (l.id, l)).collect();
-    let functions: HashMap<u64, Function> =
-        profile.function.into_iter().map(|f| (f.id, f)).collect();
-    let strings = profile.string_table;
-
-    // Resolve stacks as function names, and sum sample values per stack. Also reverse the stack,
-    // since inferno expects it bottom-up.
-    let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
-    for sample in profile.sample {
-        let mut stack = Vec::with_capacity(sample.location_id.len());
-        for location in sample.location_id.into_iter().rev() {
-            let Some(location) = locations.get(&location) else {
-                bail!("missing location {location}");
-            };
-            for line in location.line.iter().rev() {
-                let Some(function) = functions.get(&line.function_id) else {
-                    bail!("missing function {}", line.function_id);
-                };
-                let Some(name) = strings.get(function.name as usize) else {
-                    bail!("missing string {}", function.name);
-                };
-                stack.push(name.as_str());
-            }
-        }
-        let Some(&value) = sample.value.first() else {
-            bail!("missing value");
-        };
-        *stacks.entry(stack).or_default() += value;
-    }
-
-    // Construct stack lines for inferno.
-    let lines = stacks
-        .into_iter()
-        .map(|(stack, value)| (stack.into_iter().join(";"), value))
-        .map(|(stack, value)| format!("{stack} {value}"))
-        .sorted()
-        .collect_vec();
-
-    // Construct the flamegraph.
-    let mut bytes = Vec::new();
-    let lines = lines.iter().map(|line| line.as_str());
-    inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
-    Ok(bytes)
-}
--- a/libs/utils/src/sync/spsc_fold.rs
+++ b/libs/utils/src/sync/spsc_fold.rs
@@ -96,11 +96,7 @@ impl<T: Send> Sender<T> {
                    }
                }
                State::SenderWaitsForReceiverToConsume(_data) => {
-                    // SAFETY: send is single threaded due to `&mut self` requirement,
-                    // therefore register is not concurrent.
-                    unsafe {
-                        self.state.wake_sender.register(cx.waker());
-                    }
+                    // Really, we shouldn't be polled until receiver has consumed and wakes us.
                    Poll::Pending
                }
                State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)),
@@ -453,38 +449,4 @@ mod tests {
        let err = recv_task.await.unwrap().expect_err("should error");
        assert!(matches!(err, RecvError::SenderGone));
    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_receiver_drop_while_waiting_for_receiver_to_consume_unblocks_sender() {
-        let (mut sender, receiver) = channel();
-
-        let state = receiver.state.clone();
-
-        sender.send((), |_, _| unreachable!()).await.unwrap();
-
-        assert!(matches!(&*state.value.lock().unwrap(), &State::HasData(_)));
-
-        let unmergeable = sender.send((), |_, _| Err(()));
-        let mut unmergeable = std::pin::pin!(unmergeable);
-        tokio::select! {
-            _ = tokio::time::sleep(FOREVER) => {},
-            _ = &mut unmergeable => {
-                panic!("unmergeable should not complete");
-            },
-        }
-
-        assert!(matches!(
-            &*state.value.lock().unwrap(),
-            &State::SenderWaitsForReceiverToConsume(_)
-        ));
-
-        drop(receiver);
-
-        assert!(matches!(
-            &*state.value.lock().unwrap(),
-            &State::ReceiverGone
-        ));
-
-        unmergeable.await.unwrap_err();
-    }
 }
--- a/libs/wal_decoder/src/models.rs
+++ b/libs/wal_decoder/src/models.rs
@@ -95,14 +95,6 @@ impl InterpretedWalRecord {
            && self.metadata_record.is_none()
            && matches!(self.flush_uncommitted, FlushUncommittedRecords::No)
    }
-
-    /// Checks if the WAL record is observed (i.e. contains only metadata
-    /// for observed values)
-    pub fn is_observed(&self) -> bool {
-        self.batch.is_observed()
-            && self.metadata_record.is_none()
-            && matches!(self.flush_uncommitted, FlushUncommittedRecords::No)
-    }
 }

 /// The interpreted part of the Postgres WAL record which requires metadata
--- a/libs/wal_decoder/src/serialized_batch.rs
+++ b/libs/wal_decoder/src/serialized_batch.rs
@@ -501,11 +501,6 @@ impl SerializedValueBatch {
        !self.has_data() && self.metadata.is_empty()
    }

-    /// Checks if the batch contains only observed values
-    pub fn is_observed(&self) -> bool {
-        !self.has_data() && !self.metadata.is_empty()
-    }
-
    /// Checks if the batch contains data
    ///
    /// Note that if this returns false, it may still contain observed values or
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -8,7 +8,7 @@ license.workspace = true
 default = []
 # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
 # which adds some runtime cost to run tests on outage conditions
-testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
+testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]

 [dependencies]
 anyhow.workspace = true
@@ -44,7 +44,6 @@ postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 postgres_initdb.workspace = true
-pprof.workspace = true
 rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
@@ -109,11 +108,3 @@ harness = false
 [[bench]]
 name = "bench_ingest"
 harness = false
-
-[[bench]]
-name = "upload_queue"
-harness = false
-
-[[bin]]
-name = "test_helper_slow_client_reads"
-required-features = [ "testing" ]
--- a/pageserver/benches/upload_queue.rs
+++ b/pageserver/benches/upload_queue.rs
@@ -1,87 +0,0 @@
-//! Upload queue benchmarks.
-
-use std::str::FromStr as _;
-use std::sync::atomic::AtomicU32;
-use std::sync::Arc;
-
-use criterion::{criterion_group, criterion_main, Bencher, Criterion};
-use pageserver::tenant::metadata::TimelineMetadata;
-use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
-use pageserver::tenant::storage_layer::LayerName;
-use pageserver::tenant::upload_queue::{Delete, UploadOp, UploadQueue, UploadTask};
-use pageserver::tenant::IndexPart;
-use pprof::criterion::{Output, PProfProfiler};
-use utils::generation::Generation;
-use utils::shard::{ShardCount, ShardIndex, ShardNumber};
-
-// Register benchmarks with Criterion.
-criterion_group!(
-    name = benches;
-    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
-    targets = bench_upload_queue_next_ready,
-);
-criterion_main!(benches);
-
-/// Benchmarks the cost of UploadQueue::next_ready() with the given number of in-progress tasks
-/// (which is equivalent to tasks ahead of it in the queue). This has linear cost, and the upload
-/// queue as a whole is thus quadratic.
-///
-/// UploadOp::UploadLayer requires an entire tenant and timeline to construct, so we just test
-/// Delete and UploadMetadata instead. This is incidentally the most expensive case.
-fn bench_upload_queue_next_ready(c: &mut Criterion) {
-    let mut g = c.benchmark_group("upload_queue_next_ready");
-    for inprogress in [0, 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000] {
-        g.bench_function(format!("inprogress={inprogress}"), |b| {
-            run_bench(b, inprogress).unwrap()
-        });
-    }
-
-    fn run_bench(b: &mut Bencher, inprogress: usize) -> anyhow::Result<()> {
-        // Construct two layers. layer0 is in the indexes, layer1 will be deleted.
-        let layer0 = LayerName::from_str("000000000000000000000000000000000000-100000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
-        let layer1 = LayerName::from_str("100000000000000000000000000000000001-200000000000000000000000000000000000__00000000016B59D8-00000000016B5A51").expect("invalid name");
-
-        let metadata = LayerFileMetadata {
-            shard: ShardIndex::new(ShardNumber(1), ShardCount(2)),
-            generation: Generation::Valid(1),
-            file_size: 0,
-        };
-
-        // Construct the (initial and uploaded) index with layer0.
-        let mut index = IndexPart::empty(TimelineMetadata::example());
-        index.layer_metadata.insert(layer0, metadata.clone());
-
-        // Construct the queue.
-        let mut queue = UploadQueue::Uninitialized;
-        let queue = queue.initialize_with_current_remote_index_part(&index, 0)?;
-
-        // Populate inprogress_tasks with a bunch of layer1 deletions.
-        let delete = UploadOp::Delete(Delete {
-            layers: vec![(layer1, metadata)],
-        });
-
-        for task_id in 0..(inprogress as u64) {
-            queue.inprogress_tasks.insert(
-                task_id,
-                Arc::new(UploadTask {
-                    task_id,
-                    retries: AtomicU32::new(0),
-                    op: delete.clone(),
-                    coalesced_ops: Vec::new(),
-                }),
-            );
-        }
-
-        // Benchmark index upload scheduling.
-        let index_upload = UploadOp::UploadMetadata {
-            uploaded: Box::new(index),
-        };
-
-        b.iter(|| {
-            queue.queued_operations.push_front(index_upload.clone());
-            assert!(queue.next_ready().is_some());
-        });
-
-        Ok(())
-    }
-}
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -4,9 +4,6 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

-[features]
-testing = [ "pageserver_api/testing" ]
-
 [dependencies]
 pageserver_api.workspace = true
 thiserror.workspace = true
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -1,9 +1,6 @@
-use std::sync::{Arc, Mutex};
+use std::pin::Pin;

-use futures::{
-    stream::{SplitSink, SplitStream},
-    SinkExt, StreamExt,
-};
+use futures::SinkExt;
 use pageserver_api::{
    models::{
        PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
@@ -13,6 +10,7 @@ use pageserver_api::{
 };
 use tokio::task::JoinHandle;
 use tokio_postgres::CopyOutStream;
+use tokio_stream::StreamExt;
 use tokio_util::sync::CancellationToken;
 use utils::{
    id::{TenantId, TimelineId},
@@ -62,30 +60,17 @@ impl Client {
    ) -> anyhow::Result<PagestreamClient> {
        let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
            .client
-            .copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}"))
+            .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}"))
            .await?;
-        let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away.
        let Client {
            cancel_on_client_drop,
            conn_task,
            client: _,
        } = self;
-        let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning(
-            ConnTaskRunning {
-                cancel_on_client_drop,
-                conn_task,
-            },
-        )));
        Ok(PagestreamClient {
-            sink: PagestreamSender {
-                shared: shared.clone(),
-                sink,
-            },
-            stream: PagestreamReceiver {
-                shared: shared.clone(),
-                stream,
-            },
-            shared,
+            copy_both: Box::pin(copy_both),
+            conn_task,
+            cancel_on_client_drop,
        })
    }

@@ -112,28 +97,7 @@ impl Client {

 /// Create using [`Client::pagestream`].
 pub struct PagestreamClient {
-    shared: Arc<Mutex<PagestreamShared>>,
-    sink: PagestreamSender,
-    stream: PagestreamReceiver,
-}
-
-pub struct PagestreamSender {
-    #[allow(dead_code)]
-    shared: Arc<Mutex<PagestreamShared>>,
-    sink: SplitSink<tokio_postgres::CopyBothDuplex<bytes::Bytes>, bytes::Bytes>,
-}
-
-pub struct PagestreamReceiver {
-    #[allow(dead_code)]
-    shared: Arc<Mutex<PagestreamShared>>,
-    stream: SplitStream<tokio_postgres::CopyBothDuplex<bytes::Bytes>>,
-}
-
-enum PagestreamShared {
-    ConnTaskRunning(ConnTaskRunning),
-    ConnTaskCancelledJoinHandleReturnedOrDropped,
-}
-struct ConnTaskRunning {
+    copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
    cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
    conn_task: JoinHandle<()>,
 }
@@ -146,11 +110,11 @@ pub struct RelTagBlockNo {
 impl PagestreamClient {
    pub async fn shutdown(self) {
        let Self {
-            shared,
-            sink,
-            stream,
-        } = { self };
-        // The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`.
+            copy_both,
+            cancel_on_client_drop: cancel_conn_task,
+            conn_task,
+        } = self;
+        // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
        // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
        // (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
        //
@@ -167,77 +131,27 @@ impl PagestreamClient {
        //
        // NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
        // => https://github.com/neondatabase/neon/issues/6390
-        let ConnTaskRunning {
-            cancel_on_client_drop,
-            conn_task,
-        } = {
-            let mut guard = shared.lock().unwrap();
-            match std::mem::replace(
-                &mut *guard,
-                PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped,
-            ) {
-                PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running,
-                PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(),
-            }
-        };
-        let _ = cancel_on_client_drop.unwrap();
+        let _ = cancel_conn_task.unwrap();
        conn_task.await.unwrap();
-
-        // Now drop the split copy_both.
-        drop(sink);
-        drop(stream);
-    }
-
-    pub fn split(self) -> (PagestreamSender, PagestreamReceiver) {
-        let Self {
-            shared: _,
-            sink,
-            stream,
-        } = self;
-        (sink, stream)
+        drop(copy_both);
    }

    pub async fn getpage(
        &mut self,
        req: PagestreamGetPageRequest,
    ) -> anyhow::Result<PagestreamGetPageResponse> {
-        self.getpage_send(req).await?;
-        self.getpage_recv().await
-    }
+        let req = PagestreamFeMessage::GetPage(req);
+        let req: bytes::Bytes = req.serialize();
+        // let mut req = tokio_util::io::ReaderStream::new(&req);
+        let mut req = tokio_stream::once(Ok(req));

-    pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
-        self.sink.getpage_send(req).await
-    }
+        self.copy_both.send_all(&mut req).await?;

-    pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
-        self.stream.getpage_recv().await
-    }
-}
-
-impl PagestreamSender {
-    // TODO: maybe make this impl Sink instead for better composability?
-    pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> {
-        let msg = msg.serialize();
-        self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?;
-        Ok(())
-    }
-
-    pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
-        self.send(PagestreamFeMessage::GetPage(req)).await
-    }
-}
-
-impl PagestreamReceiver {
-    // TODO: maybe make this impl Stream instead for better composability?
-    pub async fn recv(&mut self) -> anyhow::Result<PagestreamBeMessage> {
-        let next: Option<Result<bytes::Bytes, _>> = self.stream.next().await;
+        let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
        let next: bytes::Bytes = next.unwrap()?;
-        PagestreamBeMessage::deserialize(next)
-    }

-    pub async fn getpage_recv(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
-        let next: PagestreamBeMessage = self.recv().await?;
-        match next {
+        let msg = PagestreamBeMessage::deserialize(next)?;
+        match msg {
            PagestreamBeMessage::GetPage(p) => Ok(p),
            PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
            PagestreamBeMessage::Exists(_)
@@ -246,14 +160,7 @@ impl PagestreamReceiver {
            | PagestreamBeMessage::GetSlruSegment(_) => {
                anyhow::bail!(
                    "unexpected be message kind in response to getpage request: {}",
-                    next.kind()
-                )
-            }
-            #[cfg(feature = "testing")]
-            PagestreamBeMessage::Test(_) => {
-                anyhow::bail!(
-                    "unexpected be message kind in response to getpage request: {}",
-                    next.kind()
+                    msg.kind()
                )
            }
        }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -2,7 +2,7 @@ use anyhow::Context;
 use camino::Utf8PathBuf;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpaceAccum;
-use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
+use pageserver_api::models::PagestreamGetPageRequest;

 use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
@@ -322,15 +322,12 @@ async fn main_impl(
                        .to_rel_block()
                        .expect("we filter non-rel-block keys out above");
                    PagestreamGetPageRequest {
-                        hdr: PagestreamRequest {
-                            reqid: 0,
-                            request_lsn: if rng.gen_bool(args.req_latest_probability) {
-                                Lsn::MAX
-                            } else {
-                                r.timeline_lsn
-                            },
-                            not_modified_since: r.timeline_lsn,
+                        request_lsn: if rng.gen_bool(args.req_latest_probability) {
+                            Lsn::MAX
+                        } else {
+                            r.timeline_lsn
                        },
+                        not_modified_since: r.timeline_lsn,
                        rel: rel_tag,
                        blkno: block_no,
                    }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -53,12 +53,10 @@ project_build_tag!(BUILD_TAG);
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

-/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21).
-/// This adds roughly 3% overhead for allocations on average, which is acceptable considering
-/// performance-sensitive code will avoid allocations as far as possible anyway.
+/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
 #[allow(non_upper_case_globals)]
 #[export_name = "malloc_conf"]
-pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";

 const PID_FILE_NAME: &str = "pageserver.pid";

--- a/pageserver/src/bin/test_helper_slow_client_reads.rs
+++ b/pageserver/src/bin/test_helper_slow_client_reads.rs
@@ -1,65 +0,0 @@
-use std::{
-    io::{stdin, stdout, Read, Write},
-    time::Duration,
-};
-
-use clap::Parser;
-use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
-
-#[derive(clap::Parser)]
-struct Args {
-    connstr: String,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-}
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    let Args {
-        connstr,
-        tenant_id,
-        timeline_id,
-    } = Args::parse();
-    let client = pageserver_client::page_service::Client::new(connstr).await?;
-    let client = client.pagestream(tenant_id, timeline_id).await?;
-    let (mut sender, _receiver) = client.split();
-
-    eprintln!("filling the pipe");
-    let mut msg = 0;
-    loop {
-        msg += 1;
-        let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
-            PagestreamTestRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(23),
-                    not_modified_since: Lsn(23),
-                },
-                batch_key: 42,
-                message: format!("message {}", msg),
-            },
-        ));
-        let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
-            eprintln!("pipe seems full");
-            break;
-        };
-        let _: () = res?;
-    }
-
-    let n = stdout().write(b"R")?;
-    assert_eq!(n, 1);
-    stdout().flush()?;
-
-    eprintln!("waiting for signal to tell us to exit");
-
-    let mut buf = [0u8; 1];
-    stdin().read_exact(&mut buf)?;
-
-    eprintln!("termination signal received, exiting");
-
-    anyhow::Ok(())
-}
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::DEFAULT_PG_VERSION;
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
-    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
-    TimelineInfo,
+    CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest,
+    TimelineGcRequest, TimelineInfo,
 };
 use utils::{
    auth::SwappableJwtAuth,
@@ -2052,7 +2052,15 @@ async fn timeline_compact_info_handler(
        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;
-        let resp = tenant.get_scheduled_compaction_tasks(timeline_id);
+        let res = tenant.get_scheduled_compaction_tasks(timeline_id);
+        let mut resp = Vec::new();
+        for item in res {
+            resp.push(CompactInfoResponse {
+                compact_key_range: item.compact_key_range,
+                compact_lsn_range: item.compact_lsn_range,
+                sub_compaction: item.sub_compaction,
+            });
+        }
        json_response(StatusCode::OK, resp)
    }
    .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -3,7 +3,7 @@ use metrics::{
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
    register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
-    Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
+    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
    IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
@@ -91,6 +91,15 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_layers_visited_per_read_global",
+        "Number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_layers_visited_per_vectored_read_global",
@@ -436,15 +445,6 @@ pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-static FLUSH_WAIT_UPLOAD_TIME: Lazy<GaugeVec> = Lazy::new(|| {
-    register_gauge_vec!(
-        "pageserver_flush_wait_upload_seconds",
-        "Time spent waiting for preceding uploads during layer flush",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_last_record_lsn",
@@ -1224,189 +1224,117 @@ pub(crate) struct SmgrOpTimerInner {
    global_flush_in_progress_micros: IntCounter,
    per_timeline_flush_in_progress_micros: IntCounter,

-    throttling: Arc<tenant_throttling::Pagestream>,
-
    timings: SmgrOpTimerState,
 }

-/// The stages of request processing are represented by the enum variants.
-/// Used as part of [`SmgrOpTimerInner::timings`].
-///
-/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the
-/// transition points.
-/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`]
-/// to the next state.
-///
-/// Each request goes through every stage, in all configurations.
-///
 #[derive(Debug)]
 enum SmgrOpTimerState {
    Received {
-        // In the future, we may want to track the full time the request spent
-        // inside pageserver process (time spent in kernel buffers can't be tracked).
-        // `received_at` would be used for that.
-        #[allow(dead_code)]
        received_at: Instant,
    },
-    Throttling {
+    ThrottleDoneExecutionStarting {
+        received_at: Instant,
        throttle_started_at: Instant,
+        started_execution_at: Instant,
    },
-    Batching {
-        throttle_done_at: Instant,
-    },
-    Executing {
-        execution_started_at: Instant,
-    },
-    Flushing,
-    // NB: when adding observation points, remember to update the Drop impl.
 }

-// NB: when adding observation points, remember to update the Drop impl.
-impl SmgrOpTimer {
-    /// See [`SmgrOpTimerState`] for more context.
-    pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
-        let Some(inner) = self.0.as_mut() else {
-            return;
-        };
-        let SmgrOpTimerState::Received { received_at: _ } = &mut inner.timings else {
-            return;
-        };
-        inner.throttling.count_accounted_start.inc();
-        inner.timings = SmgrOpTimerState::Throttling {
-            throttle_started_at: at,
-        };
-    }
-
-    /// See [`SmgrOpTimerState`] for more context.
-    pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
-        let Some(inner) = self.0.as_mut() else {
-            return;
-        };
-        let SmgrOpTimerState::Throttling {
-            throttle_started_at,
-        } = &inner.timings
-        else {
-            return;
-        };
-        inner.throttling.count_accounted_finish.inc();
-        match throttle {
-            ThrottleResult::NotThrottled { end } => {
-                inner.timings = SmgrOpTimerState::Batching {
-                    throttle_done_at: end,
-                };
-            }
-            ThrottleResult::Throttled { end } => {
-                // update metrics
-                inner.throttling.count_throttled.inc();
-                inner
-                    .throttling
-                    .wait_time
-                    .inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
-                // state transition
-                inner.timings = SmgrOpTimerState::Batching {
-                    throttle_done_at: end,
-                };
-            }
-        }
-    }
-
-    /// See [`SmgrOpTimerState`] for more context.
-    pub(crate) fn observe_execution_start(&mut self, at: Instant) {
-        let Some(inner) = self.0.as_mut() else {
-            return;
-        };
-        let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
-            return;
-        };
-        // update metrics
-        let batch = at - *throttle_done_at;
-        inner.global_batch_wait_time.observe(batch.as_secs_f64());
-        inner
-            .per_timeline_batch_wait_time
-            .observe(batch.as_secs_f64());
-        // state transition
-        inner.timings = SmgrOpTimerState::Executing {
-            execution_started_at: at,
-        }
-    }
-
-    /// For all but the first caller, this is a no-op.
-    /// The first callers receives Some, subsequent ones None.
-    ///
-    /// See [`SmgrOpTimerState`] for more context.
-    pub(crate) fn observe_execution_end_flush_start(
-        &mut self,
-        at: Instant,
-    ) -> Option<SmgrOpFlushInProgress> {
-        // NB: unlike the other observe_* methods, this one take()s.
-        #[allow(clippy::question_mark)] // maintain similar code pattern.
-        let Some(mut inner) = self.0.take() else {
-            return None;
-        };
-        let SmgrOpTimerState::Executing {
-            execution_started_at,
-        } = &inner.timings
-        else {
-            return None;
-        };
-        // update metrics
-        let execution = at - *execution_started_at;
-        inner
-            .global_execution_latency_histo
-            .observe(execution.as_secs_f64());
-        if let Some(per_timeline_execution_latency_histo) =
-            &inner.per_timeline_execution_latency_histo
-        {
-            per_timeline_execution_latency_histo.observe(execution.as_secs_f64());
-        }
-
-        // state transition
-        inner.timings = SmgrOpTimerState::Flushing;
-
-        // return the flush in progress object which
-        // will do the remaining metrics updates
-        let SmgrOpTimerInner {
-            global_flush_in_progress_micros,
-            per_timeline_flush_in_progress_micros,
-            ..
-        } = inner;
-        Some(SmgrOpFlushInProgress {
-            flush_started_at: at,
-            global_micros: global_flush_in_progress_micros,
-            per_timeline_micros: per_timeline_flush_in_progress_micros,
-        })
-    }
-}
-
-/// The last stage of request processing is serializing and flushing the request
-/// into the TCP connection. We want to make slow flushes observable
-/// _while they are occuring_, so this struct provides a wrapper method [`Self::measure`]
-/// to periodically bump the metric.
-///
-/// If in the future we decide that we're not interested in live updates, we can
-/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
-/// and remove this struct from the code base.
 pub(crate) struct SmgrOpFlushInProgress {
    flush_started_at: Instant,
    global_micros: IntCounter,
    per_timeline_micros: IntCounter,
 }

+impl SmgrOpTimer {
+    /// Records the throttling outcome and moves the timer from `Received` to
+    /// `ThrottleDoneExecutionStarting`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the timer was already consumed or is not in the `Received` state.
+    pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
+        let inner = self.0.as_mut().expect("other public methods consume self");
+        let SmgrOpTimerState::Received { received_at } = inner.timings else {
+            panic!("called in unexpected state: {:?}", inner.timings);
+        };
+        let (throttle_started_at, started_execution_at) = match throttle {
+            ThrottleResult::NotThrottled { start } => (*start, *start),
+            ThrottleResult::Throttled { start, end } => (*start, *end),
+        };
+        // Keep the original `received_at` whether or not the request was throttled:
+        // `smgr_op_end` derives the batch wait time from `throttle_started_at - received_at`,
+        // which would always be zero for throttled requests if `start` were stored here.
+        inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
+            received_at,
+            throttle_started_at,
+            started_execution_at,
+        };
+    }
+
+    /// Ends the smgr op via `smgr_op_end` and hands its flush counters, together with
+    /// the flush start time, over to a new [`SmgrOpFlushInProgress`].
+    pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
+        let (flush_start, inner) = self
+            .smgr_op_end()
+            .expect("this method consume self, and the only other caller is drop handler");
+        let SmgrOpTimerInner {
+            global_flush_in_progress_micros,
+            per_timeline_flush_in_progress_micros,
+            ..
+        } = inner;
+        SmgrOpFlushInProgress {
+            flush_started_at: flush_start,
+            global_micros: global_flush_in_progress_micros,
+            per_timeline_micros: per_timeline_flush_in_progress_micros,
+        }
+    }
+
+    /// Observes the batch wait and execution latency histograms for this op.
+    /// Called by `observe_smgr_op_completion_and_start_flushing` and by the `Drop` impl.
+    ///
+    /// Returns `None` if this method has already been called, `Some` otherwise.
+    fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
+        let inner = self.0.take()?;
+
+        let now = Instant::now();
+
+        let (batch, execution) = match inner.timings {
+            SmgrOpTimerState::Received { received_at } => {
+                // TODO: use label for dropped requests.
+                // This is quite rare in practice, only during tenant/pageservers shutdown.
+                ((now - received_at).as_secs_f64(), Duration::ZERO.as_secs_f64())
+            }
+            // The throttle wait (`started_execution_at - throttle_started_at`) is not
+            // observed here: time spent in throttle metric is updated by throttle impl.
+            SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                received_at,
+                throttle_started_at,
+                started_execution_at,
+            } => (
+                (throttle_started_at - received_at).as_secs_f64(),
+                (now - started_execution_at).as_secs_f64(),
+            ),
+        };
+
+        // update time spent in batching
+        inner.global_batch_wait_time.observe(batch);
+        inner.per_timeline_batch_wait_time.observe(batch);
+
+        // update metrics for execution latency
+        inner.global_execution_latency_histo.observe(execution);
+        if let Some(per_timeline_execution_latency_histo) =
+            &inner.per_timeline_execution_latency_histo
+        {
+            per_timeline_execution_latency_histo.observe(execution);
+        }
+
+        Some((now, inner))
+    }
+}
+
 impl Drop for SmgrOpTimer {
    fn drop(&mut self) {
-        // In case of early drop, update any of the remaining metrics with
-        // observations so that (started,finished) counter pairs balance out
-        // and all counters on the latency path have the the same number of
-        // observations.
-        // It's technically lying and it would be better if each metric had
-        // a separate label or similar for cancelled requests.
-        // But we don't have that right now and counter pairs balancing
-        // out is useful when using the metrics in panels and whatnot.
-        let now = Instant::now();
-        self.observe_throttle_start(now);
-        self.observe_throttle_done(ThrottleResult::NotThrottled { end: now });
-        self.observe_execution_start(now);
-        self.observe_execution_end_flush_start(now);
+        self.smgr_op_end();
    }
 }

@@ -1417,12 +1345,12 @@ impl SmgrOpFlushInProgress {
    {
        let mut fut = std::pin::pin!(fut);

+        let now = Instant::now();
        // Whenever observe_guard gets called, or dropped,
        // it adds the time elapsed since its last call to metrics.
        // Last call is tracked in `now`.
        let mut observe_guard = scopeguard::guard(
            || {
-                let now = Instant::now();
                let elapsed = now - self.flush_started_at;
                self.global_micros
                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
@@ -1463,10 +1391,9 @@ pub enum SmgrQueryType {
    GetPageAtLsn,
    GetDbSize,
    GetSlruSegment,
-    #[cfg(feature = "testing")]
-    Test,
 }

+#[derive(Debug)]
 pub(crate) struct SmgrQueryTimePerTimeline {
    global_started: [IntCounter; SmgrQueryType::COUNT],
    global_latency: [Histogram; SmgrQueryType::COUNT],
@@ -1478,7 +1405,6 @@ pub(crate) struct SmgrQueryTimePerTimeline {
    per_timeline_flush_in_progress_micros: IntCounter,
    global_batch_wait_time: Histogram,
    per_timeline_batch_wait_time: Histogram,
-    throttling: Arc<tenant_throttling::Pagestream>,
 }

 static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1684,11 +1610,7 @@ static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(||
 });

 impl SmgrQueryTimePerTimeline {
-    pub(crate) fn new(
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-        pagestream_throttle_metrics: Arc<tenant_throttling::Pagestream>,
-    ) -> Self {
+    pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
        let tenant_id = tenant_shard_id.tenant_id.to_string();
        let shard_slug = format!("{}", tenant_shard_id.shard_slug());
        let timeline_id = timeline_id.to_string();
@@ -1749,7 +1671,6 @@ impl SmgrQueryTimePerTimeline {
            per_timeline_flush_in_progress_micros,
            global_batch_wait_time,
            per_timeline_batch_wait_time,
-            throttling: pagestream_throttle_metrics,
        }
    }
    pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
@@ -1765,24 +1686,88 @@ impl SmgrQueryTimePerTimeline {
        SmgrOpTimer(Some(SmgrOpTimerInner {
            global_execution_latency_histo: self.global_latency[op as usize].clone(),
            per_timeline_execution_latency_histo: per_timeline_latency_histo,
+            timings: SmgrOpTimerState::Received { received_at },
            global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
            per_timeline_flush_in_progress_micros: self
                .per_timeline_flush_in_progress_micros
                .clone(),
            global_batch_wait_time: self.global_batch_wait_time.clone(),
            per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
-            throttling: self.throttling.clone(),
-            timings: SmgrOpTimerState::Received { received_at },
        }))
    }

-    /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
    pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
        self.global_batch_size.observe(batch_size as f64);
        self.per_timeline_batch_size.observe(batch_size as f64);
    }
 }

+#[cfg(test)]
+mod smgr_query_time_tests {
+    use std::time::Instant;
+
+    use pageserver_api::shard::TenantShardId;
+    use strum::IntoEnumIterator;
+    use utils::id::{TenantId, TimelineId};
+
+    // Regression test, we used hard-coded string constants before using an enum.
+    #[test]
+    fn op_label_name() {
+        use super::SmgrQueryType::*;
+        let expect: [(super::SmgrQueryType, &'static str); 5] = [
+            (GetRelExists, "get_rel_exists"),
+            (GetRelSize, "get_rel_size"),
+            (GetPageAtLsn, "get_page_at_lsn"),
+            (GetDbSize, "get_db_size"),
+            (GetSlruSegment, "get_slru_segment"),
+        ];
+        for (op, expect) in expect {
+            let actual: &'static str = op.into();
+            assert_eq!(actual, expect);
+        }
+    }
+
+    #[test]
+    fn basic() {
+        let ops: Vec<_> = super::SmgrQueryType::iter().collect();
+
+        for op in &ops {
+            let tenant_id = TenantId::generate();
+            let timeline_id = TimelineId::generate();
+            let metrics = super::SmgrQueryTimePerTimeline::new(
+                &TenantShardId::unsharded(tenant_id),
+                &timeline_id,
+            );
+
+            let get_counts = || {
+                let global: u64 = ops
+                    .iter()
+                    .map(|op| metrics.global_latency[*op as usize].get_sample_count())
+                    .sum();
+                (
+                    global,
+                    metrics.per_timeline_getpage_latency.get_sample_count(),
+                )
+            };
+
+            let (pre_global, pre_per_tenant_timeline) = get_counts();
+            assert_eq!(pre_per_tenant_timeline, 0);
+
+            let timer = metrics.start_smgr_op(*op, Instant::now());
+            drop(timer);
+
+            let (post_global, post_per_tenant_timeline) = get_counts();
+            if matches!(op, super::SmgrQueryType::GetPageAtLsn) {
+                // getpage ops are tracked per-timeline, others aren't
+                assert_eq!(post_per_tenant_timeline, 1);
+            } else {
+                assert_eq!(post_per_tenant_timeline, 0);
+            }
+            assert!(post_global > pre_global);
+        }
+    }
+}
+
 // keep in sync with control plane Go code so that we can validate
 // compute's basebackup_ms metric with our perspective in the context of SLI/SLO.
 static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
@@ -1860,7 +1845,6 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {

 #[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
 pub(crate) enum ComputeCommandKind {
-    PageStreamV3,
    PageStreamV2,
    Basebackup,
    Fullbackup,
@@ -2344,15 +2328,13 @@ macro_rules! redo_bytes_histogram_count_buckets {
 pub(crate) struct WalIngestMetrics {
    pub(crate) bytes_received: IntCounter,
    pub(crate) records_received: IntCounter,
-    pub(crate) records_observed: IntCounter,
    pub(crate) records_committed: IntCounter,
    pub(crate) records_filtered: IntCounter,
    pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
    pub(crate) clear_vm_bits_unknown: IntCounterVec,
 }

-pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
-    WalIngestMetrics {
+pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
    bytes_received: register_int_counter!(
        "pageserver_wal_ingest_bytes_received",
        "Bytes of WAL ingested from safekeepers",
@@ -2363,11 +2345,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
        "Number of WAL records received from safekeepers"
    )
    .expect("failed to define a metric"),
-    records_observed: register_int_counter!(
-        "pageserver_wal_ingest_records_observed",
-        "Number of WAL records observed from safekeepers. These are metadata only records for shard 0."
-    )
-    .expect("failed to define a metric"),
    records_committed: register_int_counter!(
        "pageserver_wal_ingest_records_committed",
        "Number of WAL records which resulted in writes to pageserver storage"
@@ -2389,7 +2366,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
        &["entity"],
    )
    .expect("failed to define a metric"),
-}
 });

 pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -2601,7 +2577,6 @@ pub(crate) struct TimelineMetrics {
    shard_id: String,
    timeline_id: String,
    pub flush_time_histo: StorageTimeMetrics,
-    pub flush_wait_upload_time_gauge: Gauge,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
    pub logical_size_histo: StorageTimeMetrics,
@@ -2647,9 +2622,6 @@ impl TimelineMetrics {
            &shard_id,
            &timeline_id,
        );
-        let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        let compact_time_histo = StorageTimeMetrics::new(
            StorageTimeOperation::Compact,
            &tenant_id,
@@ -2795,7 +2767,6 @@ impl TimelineMetrics {
            shard_id,
            timeline_id,
            flush_time_histo,
-            flush_wait_upload_time_gauge,
            compact_time_histo,
            create_images_time_histo,
            logical_size_histo,
@@ -2845,14 +2816,6 @@ impl TimelineMetrics {
        self.resident_physical_size_gauge.get()
    }

-    pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) {
-        self.flush_wait_upload_time_gauge.add(duration);
-        crate::metrics::FLUSH_WAIT_UPLOAD_TIME
-            .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id])
-            .unwrap()
-            .add(duration);
-    }
-
    pub(crate) fn shutdown(&self) {
        let was_shutdown = self
            .shutdown
@@ -2870,7 +2833,6 @@ impl TimelineMetrics {
        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
-        let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
@@ -3578,7 +3540,9 @@ pub(crate) mod tenant_throttling {
    use once_cell::sync::Lazy;
    use utils::shard::TenantShardId;

-    pub(crate) struct GlobalAndPerTenantIntCounter {
+    use crate::tenant::{self};
+
+    struct GlobalAndPerTenantIntCounter {
        global: IntCounter,
        per_tenant: IntCounter,
    }
@@ -3596,10 +3560,10 @@ pub(crate) mod tenant_throttling {
    }

    pub(crate) struct Metrics<const KIND: usize> {
-        pub(super) count_accounted_start: GlobalAndPerTenantIntCounter,
-        pub(super) count_accounted_finish: GlobalAndPerTenantIntCounter,
-        pub(super) wait_time: GlobalAndPerTenantIntCounter,
-        pub(super) count_throttled: GlobalAndPerTenantIntCounter,
+        count_accounted_start: GlobalAndPerTenantIntCounter,
+        count_accounted_finish: GlobalAndPerTenantIntCounter,
+        wait_time: GlobalAndPerTenantIntCounter,
+        count_throttled: GlobalAndPerTenantIntCounter,
    }

    static COUNT_ACCOUNTED_START: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
@@ -3734,6 +3698,26 @@ pub(crate) mod tenant_throttling {
            }
        }
    }
+
+    impl<const KIND: usize> tenant::throttle::Metric for Metrics<KIND> {
+        #[inline(always)]
+        fn accounting_start(&self) {
+            self.count_accounted_start.inc();
+        }
+        #[inline(always)]
+        fn accounting_finish(&self) {
+            self.count_accounted_finish.inc();
+        }
+        #[inline(always)]
+        fn observe_throttling(
+            &self,
+            tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
+        ) {
+            let val = u64::try_from(wait_time.as_micros()).unwrap();
+            self.wait_time.inc_by(val);
+            self.count_throttled.inc();
+        }
+    }
 }

 pub(crate) mod disk_usage_based_eviction {
@@ -3878,6 +3862,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {

    // histograms
    [
+        &READ_NUM_LAYERS_VISITED,
        &VEC_READ_NUM_LAYERS_VISITED,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -17,7 +17,7 @@ use pageserver_api::models::{
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
    PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
-    PagestreamProtocolVersion, PagestreamRequest,
+    PagestreamProtocolVersion,
 };
 use pageserver_api::shard::TenantShardId;
 use postgres_backend::{
@@ -67,7 +67,7 @@ use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::{basebackup, timed_after_cancellation};
 use pageserver_api::key::rel_block_to_key;
-use pageserver_api::reltag::SlruKind;
+use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

@@ -537,29 +537,6 @@ impl From<WaitLsnError> for QueryError {
    }
 }

-#[derive(thiserror::Error, Debug)]
-struct BatchedPageStreamError {
-    req: PagestreamRequest,
-    err: PageStreamError,
-}
-
-impl std::fmt::Display for BatchedPageStreamError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        self.err.fmt(f)
-    }
-}
-
-struct BatchedGetPageRequest {
-    req: PagestreamGetPageRequest,
-    timer: SmgrOpTimer,
-}
-
-#[cfg(feature = "testing")]
-struct BatchedTestRequest {
-    req: models::PagestreamTestRequest,
-    timer: SmgrOpTimer,
-}
-
 enum BatchedFeMessage {
    Exists {
        span: Span,
@@ -577,7 +554,7 @@ enum BatchedFeMessage {
        span: Span,
        shard: timeline::handle::Handle<TenantManagerTypes>,
        effective_request_lsn: Lsn,
-        pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
+        pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
    },
    DbSize {
        span: Span,
@@ -591,40 +568,47 @@ enum BatchedFeMessage {
        shard: timeline::handle::Handle<TenantManagerTypes>,
        req: models::PagestreamGetSlruSegmentRequest,
    },
-    #[cfg(feature = "testing")]
-    Test {
-        span: Span,
-        shard: timeline::handle::Handle<TenantManagerTypes>,
-        requests: Vec<BatchedTestRequest>,
-    },
    RespondError {
        span: Span,
-        error: BatchedPageStreamError,
+        error: PageStreamError,
    },
 }

 impl BatchedFeMessage {
-    fn observe_execution_start(&mut self, at: Instant) {
-        match self {
-            BatchedFeMessage::Exists { timer, .. }
-            | BatchedFeMessage::Nblocks { timer, .. }
-            | BatchedFeMessage::DbSize { timer, .. }
-            | BatchedFeMessage::GetSlruSegment { timer, .. } => {
-                timer.observe_execution_start(at);
+    async fn throttle_and_record_start_processing(
+        &mut self,
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError> {
+        let (shard, tokens, timers) = match self {
+            BatchedFeMessage::Exists { shard, timer, .. }
+            | BatchedFeMessage::Nblocks { shard, timer, .. }
+            | BatchedFeMessage::DbSize { shard, timer, .. }
+            | BatchedFeMessage::GetSlruSegment { shard, timer, .. } => {
+                (
+                    shard,
+                    // 1 token is probably under-estimating because these
+                    // request handlers typically do several Timeline::get calls.
+                    1,
+                    itertools::Either::Left(std::iter::once(timer)),
+                )
            }
-            BatchedFeMessage::GetPage { pages, .. } => {
-                for page in pages {
-                    page.timer.observe_execution_start(at);
-                }
+            BatchedFeMessage::GetPage { shard, pages, .. } => (
+                shard,
+                pages.len(),
+                itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)),
+            ),
+            BatchedFeMessage::RespondError { .. } => return Ok(()),
+        };
+        let throttled = tokio::select! {
+            throttled = shard.pagestream_throttle.throttle(tokens) => { throttled }
+            _ = cancel.cancelled() => {
+                return Err(QueryError::Shutdown);
            }
-            #[cfg(feature = "testing")]
-            BatchedFeMessage::Test { requests, .. } => {
-                for req in requests {
-                    req.timer.observe_execution_start(at);
-                }
-            }
-            BatchedFeMessage::RespondError { .. } => {}
+        };
+        for timer in timers {
+            timer.observe_throttle_done_execution_starting(&throttled);
        }
+        Ok(())
    }
 }

@@ -670,7 +654,6 @@ impl PageServerHandler {
        )
    }

-    #[allow(clippy::too_many_arguments)]
    async fn pagestream_read_message<IO>(
        pgb: &mut PostgresBackendReader<IO>,
        tenant_id: TenantId,
@@ -678,7 +661,6 @@ impl PageServerHandler {
        timeline_handles: &mut TimelineHandles,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-        protocol_version: PagestreamProtocolVersion,
        parent_span: Span,
    ) -> Result<Option<BatchedFeMessage>, QueryError>
    where
@@ -713,42 +695,18 @@ impl PageServerHandler {
        fail::fail_point!("ps::handle-pagerequest-message");

        // parse request
-        let neon_fe_msg =
-            PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
-
-        // TODO: turn in to async closure once available to avoid repeating received_at
-        async fn record_op_start_and_throttle(
-            shard: &timeline::handle::Handle<TenantManagerTypes>,
-            op: metrics::SmgrQueryType,
-            received_at: Instant,
-        ) -> Result<SmgrOpTimer, QueryError> {
-            // It's important to start the smgr op metric recorder as early as possible
-            // so that the _started counters are incremented before we do
-            // any serious waiting, e.g., for throttle, batching, or actual request handling.
-            let mut timer = shard.query_metrics.start_smgr_op(op, received_at);
-            let now = Instant::now();
-            timer.observe_throttle_start(now);
-            let throttled = tokio::select! {
-                res = shard.pagestream_throttle.throttle(1, now) => res,
-                _ = shard.cancel.cancelled() => return Err(QueryError::Shutdown),
-            };
-            timer.observe_throttle_done(throttled);
-            Ok(timer)
-        }
+        let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;

        let batched_msg = match neon_fe_msg {
            PagestreamFeMessage::Exists(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
+                let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
-                let timer = record_op_start_and_throttle(
-                    &shard,
-                    metrics::SmgrQueryType::GetRelExists,
-                    received_at,
-                )
-                .await?;
+                let timer = shard
+                    .query_metrics
+                    .start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at);
                BatchedFeMessage::Exists {
                    span,
                    timer,
@@ -757,17 +715,14 @@ impl PageServerHandler {
                }
            }
            PagestreamFeMessage::Nblocks(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn);
+                let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
-                let timer = record_op_start_and_throttle(
-                    &shard,
-                    metrics::SmgrQueryType::GetRelSize,
-                    received_at,
-                )
-                .await?;
+                let timer = shard
+                    .query_metrics
+                    .start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at);
                BatchedFeMessage::Nblocks {
                    span,
                    timer,
@@ -776,17 +731,14 @@ impl PageServerHandler {
                }
            }
            PagestreamFeMessage::DbSize(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn);
+                let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
-                let timer = record_op_start_and_throttle(
-                    &shard,
-                    metrics::SmgrQueryType::GetDbSize,
-                    received_at,
-                )
-                .await?;
+                let timer = shard
+                    .query_metrics
+                    .start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at);
                BatchedFeMessage::DbSize {
                    span,
                    timer,
@@ -795,17 +747,14 @@ impl PageServerHandler {
                }
            }
            PagestreamFeMessage::GetSlruSegment(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn);
+                let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
                let shard = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .instrument(span.clone()) // sets `shard_id` field
                    .await?;
-                let timer = record_op_start_and_throttle(
-                    &shard,
-                    metrics::SmgrQueryType::GetSlruSegment,
-                    received_at,
-                )
-                .await?;
+                let timer = shard
+                    .query_metrics
+                    .start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at);
                BatchedFeMessage::GetSlruSegment {
                    span,
                    timer,
@@ -813,23 +762,25 @@ impl PageServerHandler {
                    req,
                }
            }
-            PagestreamFeMessage::GetPage(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %req.hdr.request_lsn);
+            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
+                request_lsn,
+                not_modified_since,
+                rel,
+                blkno,
+            }) => {
+                let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %request_lsn);

                macro_rules! respond_error {
                    ($error:expr) => {{
                        let error = BatchedFeMessage::RespondError {
                            span,
-                            error: BatchedPageStreamError {
-                                req: req.hdr,
-                                err: $error,
-                            },
+                            error: $error,
                        };
                        Ok(Some(error))
                    }};
                }

-                let key = rel_block_to_key(req.rel, req.blkno);
+                let key = rel_block_to_key(rel, blkno);
                let shard = match timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Page(key))
                    .instrument(span.clone()) // sets `shard_id` field
@@ -854,17 +805,17 @@ impl PageServerHandler {
                    }
                };

-                let timer = record_op_start_and_throttle(
-                    &shard,
-                    metrics::SmgrQueryType::GetPageAtLsn,
-                    received_at,
-                )
-                .await?;
+                // It's important to start the timer before waiting for the LSN
+                // so that the _started counters are incremented before we do
+                // any serious waiting, e.g., for LSNs.
+                let timer = shard
+                    .query_metrics
+                    .start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at);

                let effective_request_lsn = match Self::wait_or_get_last_lsn(
                    &shard,
-                    req.hdr.request_lsn,
-                    req.hdr.not_modified_since,
+                    request_lsn,
+                    not_modified_since,
                    &shard.get_latest_gc_cutoff_lsn(),
                    ctx,
                )
@@ -880,23 +831,7 @@ impl PageServerHandler {
                    span,
                    shard,
                    effective_request_lsn,
-                    pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
-                }
-            }
-            #[cfg(feature = "testing")]
-            PagestreamFeMessage::Test(req) => {
-                let span = tracing::info_span!(parent: parent_span, "handle_test_request");
-                let shard = timeline_handles
-                    .get(tenant_id, timeline_id, ShardSelector::Zero)
-                    .instrument(span.clone()) // sets `shard_id` field
-                    .await?;
-                let timer =
-                    record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
-                        .await?;
-                BatchedFeMessage::Test {
-                    span,
-                    shard,
-                    requests: vec![BatchedTestRequest { req, timer }],
+                    pages: smallvec::smallvec![(rel, blkno, timer)],
                }
            }
        };
@@ -961,46 +896,6 @@ impl PageServerHandler {
                accum_pages.extend(this_pages);
                Ok(())
            }
-            #[cfg(feature = "testing")]
-            (
-                Ok(BatchedFeMessage::Test {
-                    shard: accum_shard,
-                    requests: accum_requests,
-                    ..
-                }),
-                BatchedFeMessage::Test {
-                    shard: this_shard,
-                    requests: this_requests,
-                    ..
-                },
-            ) if (|| {
-                assert!(this_requests.len() == 1);
-                if accum_requests.len() >= max_batch_size.get() {
-                    trace!(%max_batch_size, "stopping batching because of batch size");
-                    assert_eq!(accum_requests.len(), max_batch_size.get());
-                    return false;
-                }
-                if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
-                    != (this_shard.tenant_shard_id, this_shard.timeline_id)
-                {
-                    trace!("stopping batching because timeline object mismatch");
-                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                    // But the current logic for keeping responses in order does not support that.
-                    return false;
-                }
-                let this_batch_key = this_requests[0].req.batch_key;
-                let accum_batch_key = accum_requests[0].req.batch_key;
-                if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
-                    trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
-                    return false;
-                }
-                true
-            })() =>
-            {
-                // ok to batch
-                accum_requests.extend(this_requests);
-                Ok(())
-            }
            // something batched already but this message is unbatchable
            (_, this_msg) => {
                // by default, don't continue batching
@@ -1015,22 +910,14 @@ impl PageServerHandler {
        pgb_writer: &mut PostgresBackend<IO>,
        batch: BatchedFeMessage,
        cancel: &CancellationToken,
-        protocol_version: PagestreamProtocolVersion,
        ctx: &RequestContext,
    ) -> Result<(), QueryError>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
-        let started_at = Instant::now();
-        let batch = {
-            let mut batch = batch;
-            batch.observe_execution_start(started_at);
-            batch
-        };
-
        // invoke handler function
        let (handler_results, span): (
-            Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
+            Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>>,
            _,
        ) = match batch {
            BatchedFeMessage::Exists {
@@ -1045,8 +932,7 @@ impl PageServerHandler {
                        .handle_get_rel_exists_request(&shard, &req, ctx)
                        .instrument(span.clone())
                        .await
-                        .map(|msg| (msg, timer))
-                        .map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
+                        .map(|msg| (msg, timer))],
                    span,
                )
            }
@@ -1062,8 +948,7 @@ impl PageServerHandler {
                        .handle_get_nblocks_request(&shard, &req, ctx)
                        .instrument(span.clone())
                        .await
-                        .map(|msg| (msg, timer))
-                        .map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
+                        .map(|msg| (msg, timer))],
                    span,
                )
            }
@@ -1105,8 +990,7 @@ impl PageServerHandler {
                        .handle_db_size_request(&shard, &req, ctx)
                        .instrument(span.clone())
                        .await
-                        .map(|msg| (msg, timer))
-                        .map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
+                        .map(|msg| (msg, timer))],
                    span,
                )
            }
@@ -1122,29 +1006,7 @@ impl PageServerHandler {
                        .handle_get_slru_segment_request(&shard, &req, ctx)
                        .instrument(span.clone())
                        .await
-                        .map(|msg| (msg, timer))
-                        .map_err(|err| BatchedPageStreamError { err, req: req.hdr })],
-                    span,
-                )
-            }
-            #[cfg(feature = "testing")]
-            BatchedFeMessage::Test {
-                span,
-                shard,
-                requests,
-            } => {
-                fail::fail_point!("ps::handle-pagerequest-message::test");
-                (
-                    {
-                        let npages = requests.len();
-                        trace!(npages, "handling getpage request");
-                        let res = self
-                            .handle_test_request_batch(&shard, requests, ctx)
-                            .instrument(span.clone())
-                            .await;
-                        assert_eq!(res.len(), npages);
-                        res
-                    },
+                        .map(|msg| (msg, timer))],
                    span,
                )
            }
@@ -1160,7 +1022,7 @@ impl PageServerHandler {
        // Other handler errors are sent back as an error message and we stay in pagestream protocol.
        for handler_result in handler_results {
            let (response_msg, timer) = match handler_result {
-                Err(e) => match &e.err {
+                Err(e) => match &e {
                    PageStreamError::Shutdown => {
                        // If we fail to fulfil a request during shutdown, which may be _because_ of
                        // shutdown, then do not send the error to the client.  Instead just drop the
@@ -1179,14 +1041,13 @@ impl PageServerHandler {
                        // print the all details to the log with {:#}, but for the client the
                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
                        // here includes cancellation which is not an error.
-                        let full = utils::error::report_compact_sources(&e.err);
+                        let full = utils::error::report_compact_sources(&e);
                        span.in_scope(|| {
                            error!("error reading relation or page version: {full:#}")
                        });
                        (
                            PagestreamBeMessage::Error(PagestreamErrorResponse {
-                                req: e.req,
-                                message: e.err.to_string(),
+                                message: e.to_string(),
                            }),
                            None, // TODO: measure errors
                        )
@@ -1199,9 +1060,7 @@ impl PageServerHandler {
            // marshal & transmit response message
            //

-            pgb_writer.write_message_noflush(&BeMessage::CopyData(
-                &response_msg.serialize(protocol_version),
-            ))?;
+            pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;

            // We purposefully don't count flush time into the timer.
            //
@@ -1215,11 +1074,8 @@ impl PageServerHandler {
            // The timer's underlying metric is used for a storage-internal latency SLO and
            // we don't want to include latency in it that we can't control.
            // And as pointed out above, in this case, we don't control the time that flush will take.
-            let flushing_timer = timer.map(|mut timer| {
-                timer
-                    .observe_execution_end_flush_start(Instant::now())
-                    .expect("we are the first caller")
-            });
+            let flushing_timer =
+                timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());

            // what we want to do
            let flush_fut = pgb_writer.flush();
@@ -1267,7 +1123,7 @@ impl PageServerHandler {
        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        protocol_version: PagestreamProtocolVersion,
+        _protocol_version: PagestreamProtocolVersion,
        ctx: RequestContext,
    ) -> Result<(), QueryError>
    where
@@ -1307,7 +1163,6 @@ impl PageServerHandler {
                    timeline_handles,
                    request_span,
                    pipelining_config,
-                    protocol_version,
                    &ctx,
                )
                .await
@@ -1320,7 +1175,6 @@ impl PageServerHandler {
                    timeline_id,
                    timeline_handles,
                    request_span,
-                    protocol_version,
                    &ctx,
                )
                .await
@@ -1347,7 +1201,6 @@ impl PageServerHandler {
        timeline_id: TimelineId,
        mut timeline_handles: TimelineHandles,
        request_span: Span,
-        protocol_version: PagestreamProtocolVersion,
        ctx: &RequestContext,
    ) -> (
        (PostgresBackendReader<IO>, TimelineHandles),
@@ -1365,7 +1218,6 @@ impl PageServerHandler {
                &mut timeline_handles,
                &cancel,
                ctx,
-                protocol_version,
                request_span.clone(),
            )
            .await;
@@ -1373,7 +1225,7 @@ impl PageServerHandler {
                Ok(msg) => msg,
                Err(e) => break e,
            };
-            let msg = match msg {
+            let mut msg = match msg {
                Some(msg) => msg,
                None => {
                    debug!("pagestream subprotocol end observed");
@@ -1381,8 +1233,12 @@ impl PageServerHandler {
                }
            };

+            if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
+                break cancelled;
+            }
+
            let err = self
-                .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, protocol_version, ctx)
+                .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx)
                .await;
            match err {
                Ok(()) => {}
@@ -1405,7 +1261,6 @@ impl PageServerHandler {
        mut timeline_handles: TimelineHandles,
        request_span: Span,
        pipelining_config: PageServicePipeliningConfigPipelined,
-        protocol_version: PagestreamProtocolVersion,
        ctx: &RequestContext,
    ) -> (
        (PostgresBackendReader<IO>, TimelineHandles),
@@ -1503,7 +1358,6 @@ impl PageServerHandler {
                        &mut timeline_handles,
                        &cancel_batcher,
                        &ctx,
-                        protocol_version,
                        request_span.clone(),
                    )
                    .await;
@@ -1540,20 +1394,17 @@ impl PageServerHandler {
                            return Ok(());
                        }
                    };
-                    let batch = match batch {
+                    let mut batch = match batch {
                        Ok(batch) => batch,
                        Err(e) => {
                            return Err(e);
                        }
                    };
-                    self.pagesteam_handle_batched_message(
-                        pgb_writer,
-                        batch,
-                        &cancel,
-                        protocol_version,
-                        &ctx,
-                    )
-                    .await?;
+                    batch
+                        .throttle_and_record_start_processing(&self.cancel)
+                        .await?;
+                    self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
+                        .await?;
                }
            }
        });
@@ -1727,8 +1578,8 @@ impl PageServerHandler {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn = Self::wait_or_get_last_lsn(
            timeline,
-            req.hdr.request_lsn,
-            req.hdr.not_modified_since,
+            req.request_lsn,
+            req.not_modified_since,
            &latest_gc_cutoff_lsn,
            ctx,
        )
@@ -1739,7 +1590,6 @@ impl PageServerHandler {
            .await?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
-            req: *req,
            exists,
        }))
    }
@@ -1754,8 +1604,8 @@ impl PageServerHandler {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn = Self::wait_or_get_last_lsn(
            timeline,
-            req.hdr.request_lsn,
-            req.hdr.not_modified_since,
+            req.request_lsn,
+            req.not_modified_since,
            &latest_gc_cutoff_lsn,
            ctx,
        )
@@ -1766,7 +1616,6 @@ impl PageServerHandler {
            .await?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
-            req: *req,
            n_blocks,
        }))
    }
@@ -1781,8 +1630,8 @@ impl PageServerHandler {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn = Self::wait_or_get_last_lsn(
            timeline,
-            req.hdr.request_lsn,
-            req.hdr.not_modified_since,
+            req.request_lsn,
+            req.not_modified_since,
            &latest_gc_cutoff_lsn,
            ctx,
        )
@@ -1794,7 +1643,6 @@ impl PageServerHandler {
        let db_size = total_blocks as i64 * BLCKSZ as i64;

        Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
-            req: *req,
            db_size,
        }))
    }
@@ -1804,9 +1652,9 @@ impl PageServerHandler {
        &mut self,
        timeline: &Timeline,
        effective_lsn: Lsn,
-        requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
+        requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
        ctx: &RequestContext,
-    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
+    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>> {
        debug_assert_current_span_has_tenant_and_timeline_id();

        timeline
@@ -1815,7 +1663,7 @@ impl PageServerHandler {

        let results = timeline
            .get_rel_page_at_lsn_batched(
-                requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
+                requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)),
                effective_lsn,
                ctx,
            )
@@ -1827,20 +1675,16 @@ impl PageServerHandler {
            requests
                .into_iter()
                .zip(results.into_iter())
-                .map(|(req, res)| {
+                .map(|((_, _, timer), res)| {
                    res.map(|page| {
                        (
                            PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse {
-                                req: req.req,
                                page,
                            }),
-                            req.timer,
+                            timer,
                        )
                    })
-                    .map_err(|e| BatchedPageStreamError {
-                        err: PageStreamError::from(e),
-                        req: req.req.hdr,
-                    })
+                    .map_err(PageStreamError::from)
                }),
        )
    }
@@ -1855,8 +1699,8 @@ impl PageServerHandler {
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn = Self::wait_or_get_last_lsn(
            timeline,
-            req.hdr.request_lsn,
-            req.hdr.not_modified_since,
+            req.request_lsn,
+            req.not_modified_since,
            &latest_gc_cutoff_lsn,
            ctx,
        )
@@ -1867,55 +1711,10 @@ impl PageServerHandler {
        let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?;

        Ok(PagestreamBeMessage::GetSlruSegment(
-            PagestreamGetSlruSegmentResponse { req: *req, segment },
+            PagestreamGetSlruSegmentResponse { segment },
        ))
    }

-    // NB: this impl mimics what we do for batched getpage requests.
-    #[cfg(feature = "testing")]
-    #[instrument(skip_all, fields(shard_id))]
-    async fn handle_test_request_batch(
-        &mut self,
-        timeline: &Timeline,
-        requests: Vec<BatchedTestRequest>,
-        _ctx: &RequestContext,
-    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
-        // real requests would do something with the timeline
-        let mut results = Vec::with_capacity(requests.len());
-        for _req in requests.iter() {
-            tokio::task::yield_now().await;
-
-            results.push({
-                if timeline.cancel.is_cancelled() {
-                    Err(PageReconstructError::Cancelled)
-                } else {
-                    Ok(())
-                }
-            });
-        }
-
-        // TODO: avoid creating the new Vec here
-        Vec::from_iter(
-            requests
-                .into_iter()
-                .zip(results.into_iter())
-                .map(|(req, res)| {
-                    res.map(|()| {
-                        (
-                            PagestreamBeMessage::Test(models::PagestreamTestResponse {
-                                req: req.req.clone(),
-                            }),
-                            req.timer,
-                        )
-                    })
-                    .map_err(|e| BatchedPageStreamError {
-                        err: PageStreamError::from(e),
-                        req: req.req.hdr,
-                    })
-                }),
-        )
-    }
-
    /// Note on "fullbackup":
    /// Full basebackups should only be used for debugging purposes.
    /// Originally, it was introduced to enable breaking storage format changes,
@@ -2107,7 +1906,6 @@ struct FullBackupCmd {
 struct PageStreamCmd {
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    protocol_version: PagestreamProtocolVersion,
 }

 /// `lease lsn tenant timeline lsn`
@@ -2128,7 +1926,7 @@ enum PageServiceCmd {
 }

 impl PageStreamCmd {
-    fn parse(query: &str, protocol_version: PagestreamProtocolVersion) -> anyhow::Result<Self> {
+    fn parse(query: &str) -> anyhow::Result<Self> {
        let parameters = query.split_whitespace().collect_vec();
        if parameters.len() != 2 {
            bail!(
@@ -2143,7 +1941,6 @@ impl PageStreamCmd {
        Ok(Self {
            tenant_id,
            timeline_id,
-            protocol_version,
        })
    }
 }
@@ -2281,14 +2078,7 @@ impl PageServiceCmd {
            bail!("cannot parse query: {query}")
        };
        match cmd.to_ascii_lowercase().as_str() {
-            "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(
-                other,
-                PagestreamProtocolVersion::V2,
-            )?)),
-            "pagestream_v3" => Ok(Self::PageStream(PageStreamCmd::parse(
-                other,
-                PagestreamProtocolVersion::V3,
-            )?)),
+            "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)),
            "basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)),
            "fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)),
            "lease" => {
@@ -2370,21 +2160,25 @@ where
            PageServiceCmd::PageStream(PageStreamCmd {
                tenant_id,
                timeline_id,
-                protocol_version,
            }) => {
                tracing::Span::current()
                    .record("tenant_id", field::display(tenant_id))
                    .record("timeline_id", field::display(timeline_id));

                self.check_permission(Some(tenant_id))?;
-                let command_kind = match protocol_version {
-                    PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2,
-                    PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3,
-                };
-                COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc();

-                self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx)
-                    .await?;
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::PageStreamV2)
+                    .inc();
+
+                self.handle_pagerequests(
+                    pgb,
+                    tenant_id,
+                    timeline_id,
+                    PagestreamProtocolVersion::V2,
+                    ctx,
+                )
+                .await?;
            }
            PageServiceCmd::BaseBackup(BaseBackupCmd {
                tenant_id,
@@ -2563,8 +2357,7 @@ mod tests {
            cmd,
            PageServiceCmd::PageStream(PageStreamCmd {
                tenant_id,
-                timeline_id,
-                protocol_version: PagestreamProtocolVersion::V2,
+                timeline_id
            })
        );
        let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap();
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -627,7 +627,7 @@ impl Timeline {
            // cannot overflow, high and low are both smaller than u64::MAX / 2
            let mid = (high + low) / 2;

-            let cmp = match self
+            let cmp = self
                .is_latest_commit_timestamp_ge_than(
                    search_timestamp,
                    Lsn(mid * 8),
@@ -635,16 +635,7 @@ impl Timeline {
                    &mut found_larger,
                    ctx,
                )
-                .await
-            {
-                Ok(res) => res,
-                Err(PageReconstructError::MissingKey(e)) => {
-                    warn!("Missing key while find_lsn_for_timestamp. Either we might have already garbage-collected that data or the key is really missing. Last error: {:#}", e);
-                    // Return that we didn't find any requests smaller than the LSN, and logging the error.
-                    return Ok(LsnForTimestamp::Past(min_lsn));
-                }
-                Err(e) => return Err(e),
-            };
+                .await?;

            if cmp {
                high = mid;
@@ -652,7 +643,6 @@ impl Timeline {
                low = mid + 1;
            }
        }
-
        // If `found_smaller == true`, `low = t + 1` where `t` is the target LSN,
        // so the LSN of the last commit record before or at `search_timestamp`.
        // Remove one from `low` to get `t`.
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -21,7 +21,6 @@ use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use pageserver_api::models;
-use pageserver_api::models::CompactInfoResponse;
 use pageserver_api::models::LsnLease;
 use pageserver_api::models::TimelineArchivalState;
 use pageserver_api::models::TimelineState;
@@ -38,17 +37,20 @@ use remote_timeline_client::manifest::{
 };
 use remote_timeline_client::UploadQueueNotReadyError;
 use std::collections::BTreeMap;
+use std::collections::VecDeque;
 use std::fmt;
 use std::future::Future;
 use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
-use timeline::compaction::GcCompactionQueue;
+use timeline::compaction::GcCompactJob;
+use timeline::compaction::ScheduledCompactionTask;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
-use timeline::offload::OffloadError;
+use timeline::CompactFlags;
 use timeline::CompactOptions;
+use timeline::CompactionError;
 use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
@@ -344,8 +346,10 @@ pub struct Tenant {
    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,

-    /// Scheduled gc-compaction tasks.
-    scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,
+    /// Scheduled compaction tasks. Currently, this can only be populated by triggering
+    /// a manual gc-compaction from the manual compaction API.
+    scheduled_compaction_tasks:
+        std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,

    /// If the tenant is in Activating state, notify this to encourage it
    /// to proceed to Active as soon as possible, rather than waiting for lazy
@@ -365,9 +369,8 @@ pub struct Tenant {

    /// Throttle applied at the top of [`Timeline::get`].
    /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
-    pub(crate) pagestream_throttle: Arc<throttle::Throttle>,
-
-    pub(crate) pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
+    pub(crate) pagestream_throttle:
+        Arc<throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,

    /// An ongoing timeline detach concurrency limiter.
    ///
@@ -1688,7 +1691,6 @@ impl Tenant {
                    TimelineResources {
                        remote_client,
                        pagestream_throttle: self.pagestream_throttle.clone(),
-                        pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
                        l0_flush_global_state: self.l0_flush_global_state.clone(),
                    },
                    LoadTimelineCause::Attach,
@@ -2037,7 +2039,7 @@ impl Tenant {
    ) -> Result<Arc<Timeline>, TimelineArchivalError> {
        info!("unoffloading timeline");

-        // We activate the timeline below manually, so this must be called on an active tenant.
+        // We activate the timeline below manually, so this must be called on an active tenant.
        // We expect callers of this function to ensure this.
        match self.current_state() {
            TenantState::Activating { .. }
@@ -2602,15 +2604,9 @@ impl Tenant {
                WaitCompletionError::NotInitialized(
                    e, // If the queue is already stopped, it's a shutdown error.
                ) if e.is_stopping() => CreateTimelineError::ShuttingDown,
-                WaitCompletionError::NotInitialized(_) => {
-                    // This is a bug: we should never try to wait for uploads before initializing the timeline
-                    debug_assert!(false);
-                    CreateTimelineError::Other(anyhow::anyhow!("timeline not initialized"))
-                }
-                WaitCompletionError::UploadQueueShutDownOrStopped => {
-                    CreateTimelineError::ShuttingDown
-                }
-            })?;
+                e => CreateTimelineError::Other(e.into()),
+            })
+            .context("wait for timeline initial uploads to complete")?;

        // The creating task is responsible for activating the timeline.
        // We do this after `wait_completion()` so that we don't spin up tasks that start
@@ -2994,35 +2990,113 @@ impl Tenant {
                if has_pending_l0_compaction_task {
                    Some(true)
                } else {
-                    let queue = {
-                        let guard = self.scheduled_compaction_tasks.lock().unwrap();
-                        guard.get(timeline_id).cloned()
+                    let mut has_pending_scheduled_compaction_task;
+                    let next_scheduled_compaction_task = {
+                        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+                        if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
+                            if !tline_pending_tasks.is_empty() {
+                                info!(
+                                    "{} tasks left in the compaction schedule queue",
+                                    tline_pending_tasks.len()
+                                );
+                            }
+                            let next_task = tline_pending_tasks.pop_front();
+                            has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
+                            next_task
+                        } else {
+                            has_pending_scheduled_compaction_task = false;
+                            None
+                        }
                    };
-                    if let Some(queue) = queue {
-                        let has_pending_tasks = queue
-                            .iteration(cancel, ctx, &self.gc_block, timeline)
-                            .await?;
-                        Some(has_pending_tasks)
-                    } else {
-                        Some(false)
+                    if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
+                    {
+                        if !next_scheduled_compaction_task
+                            .options
+                            .flags
+                            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
+                        {
+                            warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
+                        } else if next_scheduled_compaction_task.options.sub_compaction {
+                            info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
+                            let jobs: Vec<GcCompactJob> = timeline
+                                .gc_compaction_split_jobs(
+                                    GcCompactJob::from_compact_options(
+                                        next_scheduled_compaction_task.options.clone(),
+                                    ),
+                                    next_scheduled_compaction_task
+                                        .options
+                                        .sub_compaction_max_job_size_mb,
+                                )
+                                .await
+                                .map_err(CompactionError::Other)?;
+                            if jobs.is_empty() {
+                                info!("no jobs to run, skipping scheduled compaction task");
+                            } else {
+                                has_pending_scheduled_compaction_task = true;
+                                let jobs_len = jobs.len();
+                                let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+                                let tline_pending_tasks = guard.entry(*timeline_id).or_default();
+                                for (idx, job) in jobs.into_iter().enumerate() {
+                                    // Unfortunately we need to convert the `GcCompactJob` back to `CompactOptions`
+                                    // until further refactoring allows calling `compact_with_gc` directly.
+                                    let mut flags: EnumSet<CompactFlags> = EnumSet::default();
+                                    flags |= CompactFlags::EnhancedGcBottomMostCompaction;
+                                    if job.dry_run {
+                                        flags |= CompactFlags::DryRun;
+                                    }
+                                    let options = CompactOptions {
+                                        flags,
+                                        sub_compaction: false,
+                                        compact_key_range: Some(job.compact_key_range.into()),
+                                        compact_lsn_range: Some(job.compact_lsn_range.into()),
+                                        sub_compaction_max_job_size_mb: None,
+                                    };
+                                    tline_pending_tasks.push_back(if idx == jobs_len - 1 {
+                                        ScheduledCompactionTask {
+                                            options,
+                                            // The last job in the queue sends the signal and releases the gc guard
+                                            result_tx: next_scheduled_compaction_task
+                                                .result_tx
+                                                .take(),
+                                            gc_block: next_scheduled_compaction_task
+                                                .gc_block
+                                                .take(),
+                                        }
+                                    } else {
+                                        ScheduledCompactionTask {
+                                            options,
+                                            result_tx: None,
+                                            gc_block: None,
+                                        }
+                                    });
+                                }
+                                info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
+                            }
+                        } else {
+                            let _ = timeline
+                                .compact_with_options(
+                                    cancel,
+                                    next_scheduled_compaction_task.options,
+                                    ctx,
+                                )
+                                .instrument(info_span!("scheduled_compact_timeline", %timeline_id))
+                                .await?;
+                            if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
+                                // TODO: we can send compaction statistics in the future
+                                tx.send(()).ok();
+                            }
+                        }
                    }
+                    Some(has_pending_scheduled_compaction_task)
                }
            } else {
                None
            };
            has_pending_task |= pending_task_left.unwrap_or(false);
            if pending_task_left == Some(false) && *can_offload {
-                pausable_failpoint!("before-timeline-auto-offload");
-                match offload_timeline(self, timeline)
+                offload_timeline(self, timeline)
                    .instrument(info_span!("offload_timeline", %timeline_id))
-                    .await
-                {
-                    Err(OffloadError::NotArchived) => {
-                        // Ignore this, we likely raced with unarchival
-                        Ok(())
-                    }
-                    other => other,
-                }?;
+                    .await?;
            }
        }

@@ -3035,32 +3109,34 @@ impl Tenant {
    }

    /// Cancel scheduled compaction tasks
-    pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) {
+    pub(crate) fn cancel_scheduled_compaction(
+        &self,
+        timeline_id: TimelineId,
+    ) -> Vec<ScheduledCompactionTask> {
        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-        if let Some(q) = guard.get_mut(&timeline_id) {
-            q.cancel_scheduled();
+        if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
+            let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
+            current_tline_pending_tasks.into_iter().collect()
+        } else {
+            Vec::new()
        }
    }

    pub(crate) fn get_scheduled_compaction_tasks(
        &self,
        timeline_id: TimelineId,
-    ) -> Vec<CompactInfoResponse> {
-        let res = {
-            let guard = self.scheduled_compaction_tasks.lock().unwrap();
-            guard.get(&timeline_id).map(|q| q.remaining_jobs())
-        };
-        let Some((running, remaining)) = res else {
-            return Vec::new();
-        };
-        let mut result = Vec::new();
-        if let Some((id, running)) = running {
-            result.extend(running.into_compact_info_resp(id, true));
-        }
-        for (id, job) in remaining {
-            result.extend(job.into_compact_info_resp(id, false));
-        }
-        result
+    ) -> Vec<CompactOptions> {
+        use itertools::Itertools;
+        let guard = self.scheduled_compaction_tasks.lock().unwrap();
+        guard
+            .get(&timeline_id)
+            .map(|tline_pending_tasks| {
+                tline_pending_tasks
+                    .iter()
+                    .map(|x| x.options.clone())
+                    .collect_vec()
+            })
+            .unwrap_or_default()
    }

    /// Schedule a compaction task for a timeline.
@@ -3069,12 +3145,20 @@ impl Tenant {
        timeline_id: TimelineId,
        options: CompactOptions,
    ) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
+        let gc_guard = match self.gc_block.start().await {
+            Ok(guard) => guard,
+            Err(e) => {
+                bail!("cannot run gc-compaction because gc is blocked: {}", e);
+            }
+        };
        let (tx, rx) = tokio::sync::oneshot::channel();
        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-        let q = guard
-            .entry(timeline_id)
-            .or_insert_with(|| Arc::new(GcCompactionQueue::new()));
-        q.schedule_manual_compaction(options, Some(tx));
+        let tline_pending_tasks = guard.entry(timeline_id).or_default();
+        tline_pending_tasks.push_back(ScheduledCompactionTask {
+            options,
+            result_tx: Some(tx),
+            gc_block: Some(gc_guard),
+        });
        Ok(rx)
    }

@@ -3994,9 +4078,6 @@ impl Tenant {
        Ok(timeline)
    }

-    /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
-    /// to ensure proper cleanup of background tasks and metrics.
-    //
    // Allow too_many_arguments because a constructor's argument list naturally grows with the
    // number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
    #[allow(clippy::too_many_arguments)]
@@ -4105,10 +4186,8 @@ impl Tenant {
            gate: Gate::default(),
            pagestream_throttle: Arc::new(throttle::Throttle::new(
                Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
+                crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id),
            )),
-            pagestream_throttle_metrics: Arc::new(
-                crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id),
-            ),
            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
            ongoing_timeline_detach: std::sync::Mutex::default(),
            gc_block: Default::default(),
@@ -4403,17 +4482,13 @@ impl Tenant {
        let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
            HashMap::with_capacity(timelines.len());

-        // Ensures all timelines use the same start time when computing the time cutoff.
-        let now_ts_for_pitr_calc = SystemTime::now();
        for timeline in timelines.iter() {
            let cutoff = timeline
                .get_last_record_lsn()
                .checked_sub(horizon)
                .unwrap_or(Lsn(0));

-            let cutoffs = timeline
-                .find_gc_cutoffs(now_ts_for_pitr_calc, cutoff, pitr, cancel, ctx)
-                .await?;
+            let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
            let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
            assert!(old.is_none());
        }
@@ -5015,7 +5090,6 @@ impl Tenant {
        TimelineResources {
            remote_client: self.build_timeline_remote_client(timeline_id),
            pagestream_throttle: self.pagestream_throttle.clone(),
-            pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
            l0_flush_global_state: self.l0_flush_global_state.clone(),
        }
    }
@@ -5690,7 +5764,7 @@ mod tests {
    use bytes::{Bytes, BytesMut};
    use hex_literal::hex;
    use itertools::Itertools;
-    use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
+    use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
    use pageserver_api::keyspace::KeySpace;
    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
    use pageserver_api::value::Value;
@@ -7749,18 +7823,7 @@ mod tests {
        let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
        let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
        let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
-        let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap();
-
-        let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap();
-        let base_inherited_key_child =
-            Key::from_hex("610000000033333333444444445500000001").unwrap();
-        let base_inherited_key_nonexist =
-            Key::from_hex("610000000033333333444444445500000002").unwrap();
-        let base_inherited_key_overwrite =
-            Key::from_hex("610000000033333333444444445500000003").unwrap();
-
        assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
-        assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX);

        let tline = tenant
            .create_test_timeline_with_layers(
@@ -7769,18 +7832,7 @@ mod tests {
                DEFAULT_PG_VERSION,
                &ctx,
                Vec::new(), // delta layers
-                vec![(
-                    Lsn(0x20),
-                    vec![
-                        (base_inherited_key, test_img("metadata inherited key 1")),
-                        (
-                            base_inherited_key_overwrite,
-                            test_img("metadata key overwrite 1a"),
-                        ),
-                        (base_key, test_img("metadata key 1")),
-                        (base_key_overwrite, test_img("metadata key overwrite 1b")),
-                    ],
-                )], // image layers
+                vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
            )
            .await?;
@@ -7794,18 +7846,7 @@ mod tests {
                Vec::new(), // delta layers
                vec![(
                    Lsn(0x30),
-                    vec![
-                        (
-                            base_inherited_key_child,
-                            test_img("metadata inherited key 2"),
-                        ),
-                        (
-                            base_inherited_key_overwrite,
-                            test_img("metadata key overwrite 2a"),
-                        ),
-                        (base_key_child, test_img("metadata key 2")),
-                        (base_key_overwrite, test_img("metadata key overwrite 2b")),
-                    ],
+                    vec![(base_key_child, test_img("metadata key 2"))],
                )], // image layers
                Lsn(0x30),
            )
@@ -7827,26 +7868,6 @@ mod tests {
            get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
            None
        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?,
-            Some(test_img("metadata key overwrite 1b"))
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?,
-            Some(test_img("metadata inherited key 1"))
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?,
-            None
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?,
-            None
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?,
-            Some(test_img("metadata key overwrite 1a"))
-        );

        // test vectored get on child timeline
        assert_eq!(
@@ -7861,82 +7882,6 @@ mod tests {
            get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
            None
        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?,
-            Some(test_img("metadata inherited key 1"))
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?,
-            Some(test_img("metadata inherited key 2"))
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?,
-            None
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?,
-            Some(test_img("metadata key overwrite 2b"))
-        );
-        assert_eq!(
-            get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?,
-            Some(test_img("metadata key overwrite 2a"))
-        );
-
-        // test vectored scan on parent timeline
-        let mut reconstruct_state = ValuesReconstructState::new();
-        let res = tline
-            .get_vectored_impl(
-                KeySpace::single(Key::metadata_key_range()),
-                lsn,
-                &mut reconstruct_state,
-                &ctx,
-            )
-            .await?;
-
-        assert_eq!(
-            res.into_iter()
-                .map(|(k, v)| (k, v.unwrap()))
-                .collect::<Vec<_>>(),
-            vec![
-                (base_inherited_key, test_img("metadata inherited key 1")),
-                (
-                    base_inherited_key_overwrite,
-                    test_img("metadata key overwrite 1a")
-                ),
-                (base_key, test_img("metadata key 1")),
-                (base_key_overwrite, test_img("metadata key overwrite 1b")),
-            ]
-        );
-
-        // test vectored scan on child timeline
-        let mut reconstruct_state = ValuesReconstructState::new();
-        let res = child
-            .get_vectored_impl(
-                KeySpace::single(Key::metadata_key_range()),
-                lsn,
-                &mut reconstruct_state,
-                &ctx,
-            )
-            .await?;
-
-        assert_eq!(
-            res.into_iter()
-                .map(|(k, v)| (k, v.unwrap()))
-                .collect::<Vec<_>>(),
-            vec![
-                (base_inherited_key, test_img("metadata inherited key 1")),
-                (
-                    base_inherited_key_child,
-                    test_img("metadata inherited key 2")
-                ),
-                (
-                    base_inherited_key_overwrite,
-                    test_img("metadata key overwrite 2a")
-                ),
-                (base_key_child, test_img("metadata key 2")),
-                (base_key_overwrite, test_img("metadata key overwrite 2b")),
-            ]
-        );

        Ok(())
    }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,7 +11,7 @@
 pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
-use pageserver_api::models::{self, TenantConfigPatch};
+use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
@@ -597,7 +597,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
                .map(humantime),
            heatmap_period: value.heatmap_period.map(humantime),
            lazy_slru_download: value.lazy_slru_download,
-            timeline_get_throttle: value.timeline_get_throttle,
+            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
            lsn_lease_length: value.lsn_lease_length.map(humantime),
            lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -84,17 +84,17 @@ impl Value {

    fn to_u64(self) -> u64 {
        let b = &self.0;
-        ((b[0] as u64) << 32)
-            | ((b[1] as u64) << 24)
-            | ((b[2] as u64) << 16)
-            | ((b[3] as u64) << 8)
+        (b[0] as u64) << 32
+            | (b[1] as u64) << 24
+            | (b[2] as u64) << 16
+            | (b[3] as u64) << 8
            | b[4] as u64
    }

    fn to_blknum(self) -> u32 {
        let b = &self.0;
        assert!(b[0] == 0x80);
-        ((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32
+        (b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32
    }
 }

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -320,6 +320,7 @@ impl TimelineMetadata {

    // Checksums make it awkward to build a valid instance by hand.  This helper
    // provides a TimelineMetadata with a valid checksum in its header.
+    #[cfg(test)]
    pub fn example() -> Self {
        let instance = Self::new(
            "0/16960E8".parse::<Lsn>().unwrap(),
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -63,18 +63,22 @@
 //! The contract between client and its user is that the user is responsible of
 //! scheduling operations in an order that keeps the remote consistent as
 //! described above.
-//!
 //! From the user's perspective, the operations are executed sequentially.
 //! Internally, the client knows which operations can be performed in parallel,
 //! and which operations act like a "barrier" that require preceding operations
 //! to finish. The calling code just needs to call the schedule-functions in the
 //! correct order, and the client will parallelize the operations in a way that
-//! is safe. For more details, see `UploadOp::can_bypass`.
+//! is safe.
+//!
+//! The caller should be careful with deletion, though. They should not delete
+//! local files that have been scheduled for upload but not yet finished uploading.
+//! Otherwise the upload will fail. To wait for an upload to finish, use
+//! the `wait_completion` function (more on that later).
 //!
 //! All of this relies on the following invariants:
 //!
 //! - We rely on read-after write consistency in the remote storage.
-//! - Layer files are immutable.
+//! - Layer files are immutable.
 //!
 //! NB: Pageserver assumes that it has exclusive write access to the tenant in remote
 //! storage. Different tenants can be attached to different pageservers, but if the
@@ -300,15 +304,6 @@ pub enum WaitCompletionError {
 #[derive(Debug, thiserror::Error)]
 #[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")]
 pub struct UploadQueueNotReadyError;
-
-#[derive(Debug, thiserror::Error)]
-pub enum ShutdownIfArchivedError {
-    #[error(transparent)]
-    NotInitialized(NotInitialized),
-    #[error("timeline is not archived")]
-    NotArchived,
-}
-
 /// Behavioral modes that enable seamless live migration.
 ///
 /// See docs/rfcs/028-pageserver-migration.md to understand how these fit in.
@@ -425,16 +420,8 @@ impl RemoteTimelineClient {
    /// an index file upload, i.e., it's not empty.
    /// The given `index_part` must be the one on the remote.
    pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
-        // Set the maximum number of inprogress tasks to the remote storage concurrency. There's
-        // certainly no point in starting more upload tasks than this.
-        let inprogress_limit = self
-            .conf
-            .remote_storage_config
-            .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);
        let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
+        upload_queue.initialize_with_current_remote_index_part(index_part)?;
        self.update_remote_physical_size_gauge(Some(index_part));
        info!(
            "initialized upload queue from remote index with {} layer files",
@@ -449,16 +436,8 @@ impl RemoteTimelineClient {
        &self,
        local_metadata: &TimelineMetadata,
    ) -> anyhow::Result<()> {
-        // Set the maximum number of inprogress tasks to the remote storage concurrency. There's
-        // certainly no point in starting more upload tasks than this.
-        let inprogress_limit = self
-            .conf
-            .remote_storage_config
-            .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);
        let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
+        upload_queue.initialize_empty_remote(local_metadata)?;
        self.update_remote_physical_size_gauge(None);
        info!("initialized upload queue as empty");
        Ok(())
@@ -474,15 +453,9 @@ impl RemoteTimelineClient {
        let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
            "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
        ))?;
-        let inprogress_limit = self
-            .conf
-            .remote_storage_config
-            .as_ref()
-            .and_then(|r| r.concurrency_limit())
-            .unwrap_or(0);

        let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
+        upload_queue.initialize_with_current_remote_index_part(index_part)?;
        self.update_remote_physical_size_gauge(Some(index_part));
        self.stop_impl(&mut upload_queue);

@@ -843,55 +816,6 @@ impl RemoteTimelineClient {
        Ok(need_wait)
    }

-    /// Shuts the timeline client down, but only if the timeline is archived.
-    ///
-    /// This function and [`Self::schedule_index_upload_for_timeline_archival_state`] use the
-    /// same lock to prevent races between unarchival and offloading: unarchival requires the
-    /// upload queue to be initialized, and leaves behind an upload queue where either dirty
-    /// or clean has archived_at of `None`. offloading leaves behind an uninitialized upload
-    /// queue.
-    pub(crate) async fn shutdown_if_archived(
-        self: &Arc<Self>,
-    ) -> Result<(), ShutdownIfArchivedError> {
-        {
-            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard
-                .initialized_mut()
-                .map_err(ShutdownIfArchivedError::NotInitialized)?;
-
-            match (
-                upload_queue.dirty.archived_at.is_none(),
-                upload_queue.clean.0.archived_at.is_none(),
-            ) {
-                // The expected case: the timeline is archived and we don't want to unarchive
-                (false, false) => {}
-                (true, false) => {
-                    tracing::info!("can't shut down timeline: timeline slated for unarchival");
-                    return Err(ShutdownIfArchivedError::NotArchived);
-                }
-                (dirty_archived, true) => {
-                    tracing::info!(%dirty_archived, "can't shut down timeline: timeline not archived in remote storage");
-                    return Err(ShutdownIfArchivedError::NotArchived);
-                }
-            }
-
-            // Set the shutting_down flag while the guard from the archival check is held.
-            // This prevents a race with unarchival, as initialized_mut will not return
-            // an upload queue from this point.
-            // Also launch the queued tasks like shutdown() does.
-            if !upload_queue.shutting_down {
-                upload_queue.shutting_down = true;
-                upload_queue.queued_operations.push_back(UploadOp::Shutdown);
-                // this operation is not counted similar to Barrier
-                self.launch_queued_tasks(upload_queue);
-            }
-        }
-
-        self.shutdown().await;
-
-        Ok(())
-    }
-
    /// Launch an index-file upload operation in the background, setting `import_pgdata` field.
    pub(crate) fn schedule_index_upload_for_import_pgdata_state_update(
        self: &Arc<Self>,
@@ -1873,17 +1797,57 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    ///
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
    ///
-    /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
-    /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
-    /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
+    /// The caller needs to already hold the `upload_queue` lock.
    fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
-        while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
-            debug!("starting op: {next_op}");
+        while let Some(next_op) = upload_queue.queued_operations.front() {
+            // Can we run this task now?
+            let can_run_now = match next_op {
+                UploadOp::UploadLayer(..) => {
+                    // Can always be scheduled.
+                    true
+                }
+                UploadOp::UploadMetadata { .. } => {
+                    // These can only be performed after all the preceding operations
+                    // have finished.
+                    upload_queue.inprogress_tasks.is_empty()
+                }
+                UploadOp::Delete(..) => {
+                    // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
+                    upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
+                }

-            // Prepare upload.
+                UploadOp::Barrier(_) | UploadOp::Shutdown => {
+                    upload_queue.inprogress_tasks.is_empty()
+                }
+            };
+
+            // If we cannot launch this task, don't look any further.
+            //
+            // In some cases, we could let some non-frontmost tasks to "jump the queue" and launch
+            // them now, but we don't try to do that currently.  For example, if the frontmost task
+            // is an index-file upload that cannot proceed until preceding uploads have finished, we
+            // could still start layer uploads that were scheduled later.
+            if !can_run_now {
+                break;
+            }
+
+            if let UploadOp::Shutdown = next_op {
+                // leave the op in the queue but do not start more tasks; it will be dropped when
+                // the stop is called.
+                upload_queue.shutdown_ready.close();
+                break;
+            }
+
+            // We can launch this task. Remove it from the queue first.
+            let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
+
+            debug!("starting op: {}", next_op);
+
+            // Update the counters and prepare
            match &mut next_op {
                UploadOp::UploadLayer(layer, meta, mode) => {
                    if upload_queue
@@ -1894,14 +1858,18 @@ impl RemoteTimelineClient {
                    } else {
                        *mode = Some(OpType::MayReorder)
                    }
+                    upload_queue.num_inprogress_layer_uploads += 1;
+                }
+                UploadOp::UploadMetadata { .. } => {
+                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
-                UploadOp::UploadMetadata { .. } => {}
                UploadOp::Delete(Delete { layers }) => {
                    for (name, meta) in layers {
                        upload_queue
                            .recently_deleted
                            .insert((name.clone(), meta.generation));
                    }
+                    upload_queue.num_inprogress_deletions += 1;
                }
                UploadOp::Barrier(sender) => {
                    sender.send_replace(());
@@ -1918,7 +1886,6 @@ impl RemoteTimelineClient {
            let task = Arc::new(UploadTask {
                task_id: upload_task_id,
                op: next_op,
-                coalesced_ops,
                retries: AtomicU32::new(0),
            });
            upload_queue
@@ -1976,34 +1943,8 @@ impl RemoteTimelineClient {
                return;
            }

-            // Assert that we don't modify a layer that's referenced by the current index.
-            if cfg!(debug_assertions) {
-                let modified = match &task.op {
-                    UploadOp::UploadLayer(layer, layer_metadata, _) => {
-                        vec![(layer.layer_desc().layer_name(), layer_metadata)]
-                    }
-                    UploadOp::Delete(delete) => {
-                        delete.layers.iter().map(|(n, m)| (n.clone(), m)).collect()
-                    }
-                    // These don't modify layers.
-                    UploadOp::UploadMetadata { .. } => Vec::new(),
-                    UploadOp::Barrier(_) => Vec::new(),
-                    UploadOp::Shutdown => Vec::new(),
-                };
-                if let Ok(queue) = self.upload_queue.lock().unwrap().initialized_mut() {
-                    for (ref name, metadata) in modified {
-                        debug_assert!(
-                            !queue.clean.0.references(name, metadata),
-                            "layer {name} modified while referenced by index",
-                        );
-                    }
-                }
-            }
-
            let upload_result: anyhow::Result<()> = match &task.op {
                UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
-                    // TODO: check if this mechanism can be removed now that can_bypass() performs
-                    // conflict checks during scheduling.
                    if let Some(OpType::FlushDeletion) = mode {
                        if self.config.read().unwrap().block_deletions {
                            // Of course, this is not efficient... but usually the queue should be empty.
@@ -2226,8 +2167,13 @@ impl RemoteTimelineClient {
            upload_queue.inprogress_tasks.remove(&task.task_id);

            let lsn_update = match task.op {
-                UploadOp::UploadLayer(_, _, _) => None,
+                UploadOp::UploadLayer(_, _, _) => {
+                    upload_queue.num_inprogress_layer_uploads -= 1;
+                    None
+                }
                UploadOp::UploadMetadata { ref uploaded } => {
+                    upload_queue.num_inprogress_metadata_uploads -= 1;
+
                    // the task id is reused as a monotonicity check for storing the "clean"
                    // IndexPart.
                    let last_updater = upload_queue.clean.1;
@@ -2261,7 +2207,10 @@ impl RemoteTimelineClient {
                        None
                    }
                }
-                UploadOp::Delete(_) => None,
+                UploadOp::Delete(_) => {
+                    upload_queue.num_inprogress_deletions -= 1;
+                    None
+                }
                UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
            };

@@ -2286,9 +2235,6 @@ impl RemoteTimelineClient {
        }

        self.metric_end(&task.op);
-        for coalesced_op in &task.coalesced_ops {
-            self.metric_end(coalesced_op);
-        }
    }

    fn metric_impl(
@@ -2381,7 +2327,6 @@ impl RemoteTimelineClient {
                    // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point.
                    // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
                    let upload_queue_for_deletion = UploadQueueInitialized {
-                        inprogress_limit: initialized.inprogress_limit,
                        task_counter: 0,
                        dirty: initialized.dirty.clone(),
                        clean: initialized.clean.clone(),
@@ -2389,6 +2334,9 @@ impl RemoteTimelineClient {
                        visible_remote_consistent_lsn: initialized
                            .visible_remote_consistent_lsn
                            .clone(),
+                        num_inprogress_layer_uploads: 0,
+                        num_inprogress_metadata_uploads: 0,
+                        num_inprogress_deletions: 0,
                        inprogress_tasks: HashMap::default(),
                        queued_operations: VecDeque::default(),
                        #[cfg(feature = "testing")]
@@ -2415,6 +2363,14 @@ impl RemoteTimelineClient {
                    }
                };

+                // consistency check
+                assert_eq!(
+                    qi.num_inprogress_layer_uploads
+                        + qi.num_inprogress_metadata_uploads
+                        + qi.num_inprogress_deletions,
+                    qi.inprogress_tasks.len()
+                );
+
                // We don't need to do anything here for in-progress tasks. They will finish
                // on their own, decrement the unfinished-task counter themselves, and observe
                // that the queue is Stopped.
@@ -2553,21 +2509,6 @@ pub fn remote_layer_path(
    RemotePath::from_string(&path).expect("Failed to construct path")
 }

-/// Returns true if a and b have the same layer path within a tenant/timeline. This is essentially
-/// remote_layer_path(a) == remote_layer_path(b) without the string allocations.
-///
-/// TODO: there should be a variant of LayerName for the physical path that contains information
-/// about the shard and generation, such that this could be replaced by a simple comparison.
-pub fn is_same_remote_layer_path(
-    aname: &LayerName,
-    ameta: &LayerFileMetadata,
-    bname: &LayerName,
-    bmeta: &LayerFileMetadata,
-) -> bool {
-    // NB: don't assert remote_layer_path(a) == remote_layer_path(b); too expensive even for debug.
-    aname == bname && ameta.shard == bmeta.shard && ameta.generation == bmeta.generation
-}
-
 pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
    RemotePath::from_string(&format!(
        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PATH}"
@@ -2861,8 +2802,8 @@ mod tests {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
            assert!(upload_queue.queued_operations.is_empty());
-            assert_eq!(upload_queue.inprogress_tasks.len(), 2);
-            assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2);
+            assert!(upload_queue.inprogress_tasks.len() == 2);
+            assert!(upload_queue.num_inprogress_layer_uploads == 2);

            // also check that `latest_file_changes` was updated
            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
@@ -2932,8 +2873,8 @@ mod tests {
            // Deletion schedules upload of the index file, and the file deletion itself
            assert_eq!(upload_queue.queued_operations.len(), 2);
            assert_eq!(upload_queue.inprogress_tasks.len(), 1);
-            assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1);
-            assert_eq!(upload_queue.num_inprogress_deletions(), 0);
+            assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
+            assert_eq!(upload_queue.num_inprogress_deletions, 0);
            assert_eq!(
                upload_queue.latest_files_changes_since_metadata_upload_scheduled,
                0
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -8,14 +8,14 @@ use std::collections::HashMap;
 use chrono::NaiveDateTime;
 use pageserver_api::models::AuxFilePolicy;
 use serde::{Deserialize, Serialize};
+use utils::id::TimelineId;

-use super::is_same_remote_layer_path;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::import_pgdata;
 use crate::tenant::Generation;
 use pageserver_api::shard::ShardIndex;
-use utils::id::TimelineId;
+
 use utils::lsn::Lsn;

 /// In-memory representation of an `index_part.json` file
@@ -45,8 +45,10 @@ pub struct IndexPart {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub import_pgdata: Option<import_pgdata::index_part_format::Root>,

-    /// Layer filenames and metadata. For an index persisted in remote storage, all layers must
-    /// exist in remote storage.
+    /// Per layer file name metadata, which can be present for a present or missing layer file.
+    ///
+    /// Older versions of `IndexPart` will not have this property or have only a part of metadata
+    /// that latest version stores.
    pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,

    /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the
@@ -104,7 +106,7 @@ impl IndexPart {

    pub const FILE_NAME: &'static str = "index_part.json";

-    pub fn empty(metadata: TimelineMetadata) -> Self {
+    pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
        IndexPart {
            version: Self::LATEST_VERSION,
            layer_metadata: Default::default(),
@@ -141,17 +143,6 @@ impl IndexPart {
    pub(crate) fn example() -> Self {
        Self::empty(TimelineMetadata::example())
    }
-
-    /// Returns true if the index contains a reference to the given layer (i.e. file path).
-    ///
-    /// TODO: there should be a variant of LayerName for the physical remote path that contains
-    /// information about the shard and generation, to avoid passing in metadata.
-    pub fn references(&self, name: &LayerName, metadata: &LayerFileMetadata) -> bool {
-        let Some(index_metadata) = self.layer_metadata.get(name) else {
-            return false;
-        };
-        is_same_remote_layer_path(name, metadata, name, index_metadata)
-    }
 }

 /// Metadata gathered for each of the layer files.
--- a/Show More
+++ b/Show More