Compare commits

...

156 Commits

Author SHA1 Message Date
Alex Chi Z
3fb6e258dc feat(pageserver): use vectored_get in collect_keyspace
Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-01-28 20:31:15 +01:00
Vlad Lazar
c54cd9e76a storcon: signal LSN wait to pageserver during live migration (#10452)
## Problem

We've seen the ingest connection manager get stuck shortly after a
migration.

## Summary of changes

A speculative mitigation is to use the same mechanism as get page
requests for kicking LSN ingest. The connection manager monitors
LSN waits and queries the broker if no updates are received for the
timeline.

Closes https://github.com/neondatabase/neon/issues/10351
2025-01-28 17:33:07 +00:00
Erik Grinaker
1010b8add4 pageserver: add l0_flush_wait_upload setting (#10534)
## Problem

We need a setting to disable the flush upload wait, to test L0 flush
backpressure in staging.

## Summary of changes

Add `l0_flush_wait_upload` setting.
2025-01-28 17:21:05 +00:00
Folke Behrens
ae4b2af299 fix(proxy): Use correct identifier for usage metrics upload (#10538)
## Problem

The request data and usage metrics S3 requests use the same identifier
shown in logs, causing confusion about what type of upload failed.

## Summary of changes

Use the correct identifier for usage metrics uploads.

neondatabase/cloud#23084
2025-01-28 17:08:17 +00:00
Tristan Partin
15fecb8474 Update axum to 0.8.1 (#10332)
Only a few things that needed updating:

- async_trait was removed
- Message::Text takes a Utf8Bytes object instead of a String

Signed-off-by: Tristan Partin <tristan@neon.tech>
Co-authored-by: Conrad Ludgate <connor@neon.tech>
2025-01-28 15:32:59 +00:00
Erik Grinaker
47677ba578 pageserver: disable L0 backpressure by default (#10535)
## Problem

We'll need further improvements to compaction before enabling L0 flush
backpressure by default. See:
https://neondb.slack.com/archives/C033RQ5SPDH/p1738066068960519?thread_ts=1737818888.474179&cid=C033RQ5SPDH.

Touches #5415.

## Summary of changes

Disable `l0_flush_delay_threshold` by default.
2025-01-28 14:51:30 +00:00
Arpad Müller
83b6bfa229 Re-download layer if its local and on-disk metadata diverge (#10529)
In #10308, we noticed many warnings about the local layer having
different sizes on-disk compared to the metadata.

However, the layer downloader would never redownload layer files if the
sizes or generation numbers change. This is obviously a bug, which we
aim to fix with this PR.

This change also moves the code deciding what to do about a layer to a
dedicated function: before we handled the "routing" via control flow,
but now it's become too complicated and it is nicer to have the
different verdicts for a layer spelled out in a list/match.
2025-01-28 13:39:53 +00:00
Erik Grinaker
ed942b05f7 Revert "pageserver: revert flush backpressure" (#10402)" (#10533)
This reverts commit 9e55d79803.

We'll still need this until we can tune L0 flush backpressure and
compaction. I'll add a setting to disable this separately.
2025-01-28 13:33:58 +00:00
Vlad Lazar
62a717a2ca pageserver: use PS node id for SK appname (#10522)
## Problem

This one is fairly embarrassing. Safekeeper node id was used in the
pageserver application name
when connecting to safekeepers.

## Summary of changes

Use the right node id.

Closes https://github.com/neondatabase/neon/issues/10461
2025-01-28 13:11:51 +00:00
Peter Bendel
c8fbbb9b65 Test ingest_benchmark with different stripe size and also PostgreSQL version 17 (#10510)
We want to verify if pageserver stripe size has an impact on ingest
performance.
We want to verify if ingest performance has improved or regressed with
postgres version 17.

## Summary of changes

- Allow creating a new project with different Postgres versions
- Allow pre-sharding the new project with different stripe sizes instead of
relying on the storage manager to shard_split the project once a threshold
is exceeded

Replaces https://github.com/neondatabase/neon/pull/10509

Test run https://github.com/neondatabase/neon/actions/runs/12986410381
2025-01-27 21:06:05 +00:00
John Spray
d73f4a6470 pageserver: retry wrapper on manifest upload (#10524)
## Problem

On remote storage errors (e.g. an I/O timeout) while uploading the tenant
manifest, all of compaction could fail. This is a problem IRL because we shouldn't
abort compaction on a single IO error, and in tests because it generates
spurious failures.

Related:
https://github.com/orgs/neondatabase/projects/51/views/2?sliceBy%5Bvalue%5D=jcsp&pane=issue&itemId=93692919&issue=neondatabase%7Cneon%7C10389

## Summary of changes

- Use `backoff::retry` when uploading tenant manifest
2025-01-27 21:02:25 +00:00
Heikki Linnakangas
5477d7db93 fast_import: fixes for Postgres v17 (#10414)
Now that the tests are run on v17, they're also run in debug mode, which
is slow. Increase statement_timeout in the test to work around that.
2025-01-27 19:47:49 +00:00
Arpad Müller
eb9832d846 Remove PQ_LIB_DIR env var (#10526)
We no longer need libpq for the build of the storage
controller, as we use `diesel-async` since #10280. Therefore, we remove
the env var that gave cargo/rustc the location for libpq.

Follow-up of #10280
2025-01-27 19:38:18 +00:00
Christian Schwarz
3d36dfe533 fix: noisy broker subscription failed error during storage broker deploys (#10521)
During broker deploys, pageservers log this noisy WARN en masse.

I can trivially reproduce the WARN message in neon_local by SIGKILLing
broker during e.g. `pgbench -i`.

I don't understand why tonic is not detecting the error as
`Code::Unavailable`.

Until we find time to understand that / fix upstream, this PR adds the
error message to the existing list of known error messages that get
demoted to INFO level.

Refs:
-  refs https://github.com/neondatabase/neon/issues/9562
2025-01-27 19:19:55 +00:00
John Spray
ebf44210ba remote_storage: less sensitive timeout logging in ABS listings (#10518)
## Problem

We were logging a warning after a single request timeout, while listing
objects.

Closes: https://github.com/neondatabase/neon/issues/10166

## Summary of changes

- These timeouts are a pretty normal part of life, so back it off to
only log a warning after two in a row.
2025-01-27 17:44:18 +00:00
John Spray
aabf455dfb README: clarify that neon_local is a dev/test tool (#10512)
## Problem

From time to time, folks discover our `control_plane/` folder and make
the (reasonable) mistake of thinking it's a tool for running full-sized
Neon systems, whereas in reality it is a tool for dev/test.

## Summary of changes

- Change control_plane's readme title to "Local Development Control
Plane (`neon_local`)"
- Change "Running local installation" to "Running a local development
environment" in the main readme
2025-01-27 17:24:42 +00:00
John Spray
aec92bfc34 pageserver: decrease utilization MAX_SHARDS (#10489)
## Problem

The intent of this parameter is to have pageservers consider themselves
"full" if they've got lots of shards, even if they have plenty of
capacity. It works, but because we typically successfully oversubscribe
capacity up to 200%, the MAX_SHARDS limit is effectively doubled, so
this 20,000 value ends up meaning 40,000, whereas the original intent
was to limit nodes to ~10000 shards.

## Summary of changes

- Change MAX_SHARDS to 5000, so that a node with 5000 shards reports 100%
utilization, which is equivalent in practice to being considered "half
full" by the storage controller in capacity terms.

This is all a bit subtle and indirect. Originally the limit was baked
into the pageserver with the idea that the pageserver knows better than
the storage controller what its own resources can tolerate, but in
practice it would probably be easier to understand all this if we
just did it controller-side. So there's scope to refactor here in
future.
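
For illustration, a rough sketch of the shard-count arithmetic described above (the real utilization calculation also factors in other resources; `MAX_SHARDS` and the 200% oversubscription figure are taken from this description):

```rust
// Sketch of the shard-count component of node utilization, assuming the
// storage controller is willing to oversubscribe nodes up to 200%.
const MAX_SHARDS: u64 = 5_000;
const OVERSUBSCRIPTION_PCT: u64 = 200;

fn shard_utilization_pct(shard_count: u64) -> u64 {
    shard_count * 100 / MAX_SHARDS
}

fn main() {
    // A node with 5000 shards now reports 100% utilization...
    assert_eq!(shard_utilization_pct(5_000), 100);
    // ...but with 200% oversubscription the controller may still place shards
    // there until it reports 200%, i.e. ~10000 shards in practice.
    assert_eq!(shard_utilization_pct(10_000), OVERSUBSCRIPTION_PCT);
}
```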
2025-01-27 17:03:32 +00:00
Arpad Müller
b0b4b7dd8f storcon: switch to diesel-async and tokio-postgres (#10280)
Switches the storcon away from using diesel's synchronous APIs in favour
of `diesel-async`.

Advantages:

* fewer C dependencies, especially no openssl, which might be behind the
bug: https://github.com/neondatabase/cloud/issues/21010
* Better to have only async than a mix of async plus `spawn_blocking`

We had to turn off usage of the connection pool for migrations, as
diesel migrations don't support async APIs. Thus we still use
`spawn_blocking` in that one place. But this is explicitly done in one
of the `diesel-async` examples.
2025-01-27 14:25:11 +00:00
Mikhail Kot
4dd4096f11 Pgbouncer exporter in compute image (#10503)
https://github.com/neondatabase/cloud/issues/19081
Include pgbouncer_exporter in compute image and run it at port 9127
2025-01-27 14:09:21 +00:00
Erik Grinaker
be718ed121 pageserver: disable L0 flush stalls, tune delay threshold (#10507)
## Problem

In ingest benchmarks, we see L0 compaction delays of over 10 minutes due
to image compaction. We can't stall L0 flushes for that long.

## Summary of changes

Disable L0 flush stalls, and bump the default L0 flush delay threshold
from 20 to 30 L0 layers.
2025-01-25 16:51:54 +00:00
Konstantin Knizhnik
9f1408fdf3 Do not assign max(lsn) to maxLastWrittenLsn in SetLastWrittenLSNForblokv (#10474)
## Problem

See https://github.com/neondatabase/neon/issues/10281

`SetLastWrittenLSNForBlockv` is assigning max(lsn) to
`maxLastWrittenLsn`, while it should contain only the max LSN not present in
the LwLSN cache. This causes unnecessary waits in the pageserver.

## Summary of changes

Restore status-quo for pg17.

Related Postgres PR: https://github.com/neondatabase/postgres/pull/563

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-01-24 14:57:32 +00:00
Conrad Ludgate
7000aaaf75 chore: fix h2 stubgen (#10491)
## Problem

## Summary of changes

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2025-01-24 14:55:48 +00:00
Erik Grinaker
ef2a2555b1 pageserver: tighten compaction failure detection (#10502)
## Problem

If compaction fails, we disable L0 flush stalls to avoid persistent
stalls. However, the logic would unset the failure marker on offload
failures or shutdown. This can lead to sudden L0 flush stalls if we try
and fail to offload a timeline with compaction failures, or if there is
some kind of shutdown race.

Touches #10405.

## Summary of changes

Don't touch the compaction failure marker on offload failures or
shutdown.
2025-01-24 13:55:05 +00:00
Konstantin Knizhnik
d8ab6ddb0f Check if relation has storage in calculate_relation_size (#10477)
## Problem

The parent of a partitioned table has no storage; its relfilelocator is zero.
It can be incorrectly hashed and produce wrong results.

See https://github.com/neondatabase/postgres/pull/518

## Summary of changes

This problem is already addressed in pg17.
Add the same check for all other PG versions.

Postgres PRs:
https://github.com/neondatabase/postgres/pull/566
https://github.com/neondatabase/postgres/pull/565
https://github.com/neondatabase/postgres/pull/564

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-01-24 12:43:52 +00:00
JC Grünhage
dcc437da1d Make promote-images-prod depend on promote-images-dev (#10494)
## Problem
After talking about it with @bayandin again, this should replace
the changes from https://github.com/neondatabase/neon/pull/10475. While
the previous changes worked, they are less visually clear about what
happens, and we might end up in a situation where we update `latest`
but don't actually have the tagged image pushed that contains the same
changes. The latter would result in potentially hard-to-debug
situations.

## Summary of changes
Revert c283aaaf8d and make
promote-images-prod depend on promote-images-dev instead.
2025-01-24 11:03:39 +00:00
a-masterov
c286fea018 Print logs in extensions test in another step to improve readability (#10483)
## Problem
The containers' log output is mixed with the tests' output, so you must
scroll up to find the error.
## Summary of changes
Printing of the containers' logs was moved to a separate step.
2025-01-24 10:44:48 +00:00
Vlad Lazar
de8276488d tests: enable wal reader fanout in tests (#10301)
Note: this has to merge after the release is cut on `2025-01-17` for
compat tests to start passing.

## Problem

SK wal reader fan-out is not enabled in tests by default.

## Summary of changes

Enable it.
2025-01-24 10:34:57 +00:00
Erik Grinaker
ddb9ae1214 pageserver: add compaction backpressure for layer flushes (#10405)
## Problem

There is no direct backpressure for compaction and L0 read
amplification. This allows a large buildup of compaction debt and read
amplification.

Resolves #5415.
Requires #10402.

## Summary of changes

Delay layer flushes based on the number of level 0 delta layers:

* `l0_flush_delay_threshold`: delay flushes such that they take 2x as
long (default `2 * compaction_threshold`).
* `l0_flush_stall_threshold`: stall flushes until level 0 delta layers
drop below threshold (default `4 * compaction_threshold`).

If either threshold is reached, ephemeral layer rolls also synchronously
wait for layer flushes to propagate this backpressure up into WAL
ingestion. This will bound the number of frozen layers to 1 once
backpressure kicks in, since all other frozen layers must flush before
the rolled layer.
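
As a rough illustration of the threshold logic (not the actual pageserver code; `l0_count` and the threshold parameters stand in for the real config fields):

```rust
// Hypothetical sketch of the delay/stall decision described above. `l0_count`
// is the current number of L0 delta layers; the thresholds default to
// 2 * compaction_threshold and 4 * compaction_threshold respectively.
enum FlushBackpressure {
    /// Flush proceeds normally.
    None,
    /// Flush proceeds, but is slowed down to roughly 2x its normal duration.
    Delay,
    /// Flush waits until compaction brings the L0 count back below threshold.
    Stall,
}

fn flush_backpressure(
    l0_count: usize,
    delay_threshold: usize,
    stall_threshold: usize,
) -> FlushBackpressure {
    if l0_count >= stall_threshold {
        FlushBackpressure::Stall
    } else if l0_count >= delay_threshold {
        FlushBackpressure::Delay
    } else {
        FlushBackpressure::None
    }
}
```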

## Analysis

This will significantly change the compute backpressure characteristics.
Recall the three compute backpressure knobs:

* `max_replication_write_lag`: 500 MB (based on Pageserver
`last_received_lsn`).
* `max_replication_flush_lag`: 10 GB (based on Pageserver
`disk_consistent_lsn`).
* `max_replication_apply_lag`: disabled (based on Pageserver
`remote_consistent_lsn`).

Previously, the Pageserver would keep ingesting WAL and build up
ephemeral layers and L0 layers until the compute hit
`max_replication_flush_lag` at 10 GB and began backpressuring. Now, once
we delay/stall WAL ingestion, the compute will begin backpressuring
after `max_replication_write_lag`, i.e. 500 MB. This is probably a good
thing (we're not building up a ton of compaction debt), but we should
consider tuning these settings.

`max_replication_flush_lag` probably doesn't serve a purpose anymore,
and we should consider removing it.

Furthermore, the removal of the upload barrier in #10402 will mean that
we no longer backpressure flushes based on S3 uploads, since
`max_replication_apply_lag` is disabled. We should consider enabling
this as well.

### When and what do we compact?

Default compaction settings:

* `compaction_threshold`: 10 L0 delta layers.
* `compaction_period`: 20 seconds (between each compaction loop check).
* `checkpoint_distance`: 256 MB (size of L0 delta layers).
* `l0_flush_delay_threshold`: 20 L0 delta layers.
* `l0_flush_stall_threshold`: 40 L0 delta layers.

Compaction characteristics:

* Minimum compaction volume: 10 layers * 256 MB = 2.5 GB.
* Additional compaction volume (assuming 128 MB/s WAL): 128 MB/s * 20
seconds = 2.5 GB (10 L0 layers).
* Required compaction bandwidth: 5.0 GB / 20 seconds = 256 MB/s.

### When do we hit `max_replication_write_lag`?

Depending on how fast compaction and flushes happens, the compute will
backpressure somewhere between `l0_flush_delay_threshold` or
`l0_flush_stall_threshold` + `max_replication_write_lag`.

* Minimum compute backpressure lag: 20 layers * 256 MB + 500 MB = 5.6 GB
* Maximum compute backpressure lag: 40 layers * 256 MB + 500 MB = 10.0
GB

This seems like a reasonable range to me.
2025-01-24 09:47:28 +00:00
Erik Grinaker
9e55d79803 Reapply "pageserver: revert flush backpressure" (#10270) (#10402)
This reapplies #10135. Just removing this flush backpressure without
further mitigations caused read amp increases during bulk ingestion
(predictably), so it was reverted. We will replace it by
compaction-based backpressure.

## Problem

In #8550, we made the flush loop wait for uploads after every layer.
This was to avoid unbounded buildup of uploads, and to reduce compaction
debt. However, the approach has several problems:

* It prevents upload parallelism.
* It prevents flush and upload pipelining.
* It slows down ingestion even when there is no need to backpressure.
* It does not directly backpressure based on compaction debt and read
amplification.

We will instead implement compaction-based backpressure in a PR
immediately following this removal (#5415).

Touches #5415.
Touches #10095.

## Summary of changes

Remove waiting on the upload queue in the flush loop.
2025-01-24 08:35:35 +00:00
Alex Chi Z.
8d47a60de2 fix(pageserver): handle dup layers during gc-compaction (#10430)
## Problem

If gc-compaction decides to rewrite an image layer, it can cause
index_part to lose the reference to that layer. In detail:

* Assume there's only one image layer of key 0000...AAAA at LSN 0x100
and generation 0xA in the system.
* gc-compaction kicks in at gc-horizon 0x100, and then produces
0000...AAAA at LSN 0x100 and generation 0xB.
* It submits a compaction result update into the index part that unlinks
0000-AAAA-100-A and adds 0000-AAAA-100-B

On the remote storage / local disk side, this is fine -- it unlinks
things correctly and uploads the new file. However, the
`index_part.json` itself doesn't record generations. The buggy procedure
is as follows:

1. upload the new file
2. update the index part to remove the old file and add the new file
3. remove the new file

Therefore, the correct update result process for gc-compaction should be
as follows:

* When modifying the layer map, delete the old one and upload the new
one.
* When updating the index, add the new one to the index without
deleting the old one.

## Summary of changes

* Modify `finish_gc_compaction` to correctly order insertions and
deletions.
* Update the way gc-compaction uploads the layer files.
* Add new tests.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-01-23 21:54:44 +00:00
Alexey Kondratov
6166482589 feat(compute): Automatically create release PRs (#10495)
We've finally transitioned to using a separate `release-compute` branch.
Now, we can finally automatically create release PRs on Fri and release
them during the following week.

Part of neondatabase/cloud#11698
2025-01-23 20:47:20 +00:00
Arpad Müller
ca6d72ba2a Increase reconciler timeout after shard split (#10490)
Sometimes, especially when the host running the tests is overloaded, we
can run into reconcile timeouts in
`test_timeline_ancestor_detach_idempotent_success`, making the test
flaky. By increasing the timeouts from 30 seconds to 120 seconds, we can
address the flakiness.

Fixes #10464
2025-01-23 16:43:04 +00:00
a-masterov
b6c0f66619 CI(autocomment): add the lfc state (#10121)
## Problem
Currently, the report does not contain the LFC state of the failed
tests.
## Summary of changes
Added the LFC state to the link to the allure report.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2025-01-23 14:52:07 +00:00
Mikhail Kot
3702ec889f Enable postgres_fdw (#10426)
Update compute image to include postgres_fdw #3720
2025-01-23 13:22:31 +00:00
Anastasia Lubennikova
8e8df1b453 Disable logical replication subscribers (#10249)
Drop logical replication subscribers 
before compute starts on a non-main branch.

Add new compute_ctl spec flag: drop_subscriptions_before_start
If it is set, drop all the subscriptions from the compute node
before it starts.

To avoid race on compute start, use new GUC
neon.disable_logical_replication_subscribers
to temporarily disable logical replication workers until we drop the
subscriptions.

Ensure that we drop subscriptions exactly once when endpoint starts on a
new branch.
This is essential because otherwise we may drop not only inherited but
also newly created subscriptions.

We cannot rely only on the spec.drop_subscriptions_before_start flag,
because if for some reason the compute restarts inside the VM,
it will start again with the same spec and flag value.

To handle this, we save the fact of the operation in the database
in the neon.drop_subscriptions_done table.
If the table does not exist, we assume that the operation was never
performed, so we must do it.
If the table exists, we check whether the operation was performed on the
current timeline.
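
For illustration, a hedged sketch of the "drop exactly once" logic using tokio_postgres; the SQL and the marker-table handling here are simplified stand-ins, and the real compute_ctl code additionally deals with replication slots, the `neon.disable_logical_replication_subscribers` GUC, and the timeline check:

```rust
use tokio_postgres::Client;

// Simplified sketch: drop inherited subscriptions only if the marker table
// does not exist yet, then create the marker so a restart won't repeat it.
async fn drop_subscriptions_once(client: &Client) -> Result<(), tokio_postgres::Error> {
    let already_done: bool = client
        .query_one(
            "SELECT to_regclass('neon.drop_subscriptions_done') IS NOT NULL",
            &[],
        )
        .await?
        .get(0);
    if already_done {
        return Ok(());
    }

    for row in client.query("SELECT subname FROM pg_subscription", &[]).await? {
        let subname: String = row.get(0);
        let sql = format!("DROP SUBSCRIPTION IF EXISTS \"{}\"", subname);
        client.batch_execute(&sql).await?;
    }
    client
        .batch_execute("CREATE TABLE neon.drop_subscriptions_done (dropped_at timestamptz)")
        .await?;
    Ok(())
}
```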

fixes: https://github.com/neondatabase/neon/issues/8790
2025-01-23 11:02:15 +00:00
Alex Chi Z.
92d95b08cf fix(pageserver): extend split job key range to the end (#10484)
## Problem

Not really a bug fix, but hopefully can reproduce
https://github.com/neondatabase/neon/issues/10482 more.

If the layer map does not contain layers that end at exactly the end
range of the compaction job, the current split algorithm will produce
the last job that ends at the maximum layer key. This patch extends it
all the way to the compaction job end key.

For example, the user requests a compaction of 0000...FFFF. However, we
only have a layer 0000..3000 in the layer map, and the split job will
have a range of 0000..3000 instead of 0000..FFFF.

This is not a correctness issue but it would be better to fix it so that
we can get consistent job splits.

## Summary of changes

Compaction job split will always cover the full specified key range.
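
A minimal sketch of the clamping, with `Job` as a hypothetical stand-in for the real compaction job type and keys simplified to integers:

```rust
// The last job produced by the split is extended so its end key matches the
// end of the requested compaction range, even if the layer map only contains
// layers ending earlier (e.g. 0000..3000 vs the requested 0000..FFFF).
#[derive(Debug, PartialEq)]
struct Job {
    start: u64,
    end: u64,
}

fn extend_last_job_to_range_end(jobs: &mut Vec<Job>, range_end: u64) {
    if let Some(last) = jobs.last_mut() {
        if last.end < range_end {
            last.end = range_end;
        }
    }
}

fn main() {
    let mut jobs = vec![Job { start: 0x0000, end: 0x3000 }];
    extend_last_job_to_range_end(&mut jobs, 0xFFFF);
    assert_eq!(jobs.last().unwrap().end, 0xFFFF);
}
```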

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-01-23 00:15:46 +00:00
Arpad Müller
0af40b5494 Only churn rows once in test_scrubber_physical_gc_ancestors (#10481)
## Problem

PR #10457 was supposed to fix the flakiness of
`test_scrubber_physical_gc_ancestors`, but instead it made it even more
flaky. However, the original causes of the error disappeared, only to be
replaced by key-not-found errors.

See this for a longer explanation:
https://github.com/neondatabase/neon/issues/10391#issuecomment-2608018967

## Solution

This churns rows once after all compactions, and before we do any
timeline GCs. That way, data remains accessible at older LSNs.
2025-01-22 19:45:12 +00:00
Arpad Müller
c60b91369a Expose safekeeper APIs for creation and deletion (#10478)
Add APIs for timeline creation and deletion to the safekeeper client
crate. Going to be used later in #10440.

Split off from #10440.

Part of https://github.com/neondatabase/neon/issues/9011
2025-01-22 18:52:16 +00:00
a-masterov
f1473dd438 Fix the connection error for extension tests (#10480)
## Problem
The trust connection to the compute required for `pg_anon` was removed.
However, the PGPASSWORD environment variable was not added to
`docker-compose.yml`.
This caused connection errors, which were interpreted as success due to
errors in the bash script.
## Summary of changes
The environment variable was added, and the logic in the bash script was
fixed.
2025-01-22 16:34:57 +00:00
JC Grünhage
c283aaaf8d Tag images from docker-hub in promote-images-prod (#10475)
## Problem

https://github.com/neondatabase/neon/actions/runs/12896686483/job/35961290336#step:5:107
showed that `promote-images-prod` was missing another dependency.

## Summary of changes
Modify `promote-images-prod` to tag based on docker-hub images, so that
`promote-images-prod` does not rely on `promote-images-dev`. The result
should be the exact same, but allows the two jobs to run in parallel.
2025-01-22 16:09:41 +00:00
Vlad Lazar
414ed82c1f pageserver: issue concurrent IO on the read path (#9353)
## Refs

- Epic: https://github.com/neondatabase/neon/issues/9378

Co-authored-by: Vlad Lazar <vlad@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>

## Problem

The read path does its IOs sequentially.
This means that if N values need to be read to reconstruct a page,
we will do N IOs and getpage latency is `O(N*IoLatency)`.

## Solution

With this PR we gain the ability to issue IO concurrently within one
layer visit **and** to move on to the next layer without waiting for IOs
from the previous visit to complete.

This is an evolved version of the work done at the Lisbon hackathon,
cf https://github.com/neondatabase/neon/pull/9002.

## Design

### `will_init` now sourced from disk btree index keys

On the algorithmic level, the only change is that the
`get_values_reconstruct_data`
now sources `will_init` from the disk btree index key (which is
PS-page_cache'd), instead
of from the `Value`, which is only available after the IO completes.

### Concurrent IOs, Submission & Completion 

To separate IO submission from waiting for its completion, while
simultaneously
feature-gating the change, we introduce the notion of an `IoConcurrency`
struct
through which IO futures are "spawned".

An IO is an opaque future, and waiting for completions is handled
through
`tokio::sync::oneshot` channels.
The oneshot Receivers take the place of the `img` and `records` fields
inside `VectoredValueReconstructState`.

When we're done visiting all the layers and submitting all the IOs along
the way
we concurrently `collect_pending_ios` for each value, which means
for each value there is a future that awaits all the oneshot receivers
and then calls into walredo to reconstruct the page image.
Walredo is now invoked concurrently for each value instead of
sequentially.
Walredo itself remains unchanged.

The spawned IO futures are driven to completion by a sidecar tokio task
that
is separate from the task that performs all the layer visiting and
spawning of IOs.
That task receives the IO futures via an unbounded mpsc channel and
drives them to completion inside a `FuturesUnordered`.

(The behavior from before this PR is available through
`IoConcurrency::Sequential`,
which awaits the IO futures in place, without "spawning" or "submitting"
them
anywhere.)
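
For context, a heavily simplified sketch of this submission/completion pattern; the names `IoConcurrencySketch`, `spawn_with_sidecar`, and `spawn_io` are illustrative, not the actual pageserver API:

```rust
use futures::stream::{FuturesUnordered, StreamExt};
use std::future::Future;
use std::pin::Pin;
use tokio::sync::{mpsc, oneshot};

type IoFuture = Pin<Box<dyn Future<Output = ()> + Send>>;

// Clonable handle: IO futures are sent to a sidecar task that drives them
// inside a FuturesUnordered; results come back through oneshot channels.
#[derive(Clone)]
struct IoConcurrencySketch {
    tx: mpsc::UnboundedSender<IoFuture>,
}

impl IoConcurrencySketch {
    fn spawn_with_sidecar() -> Self {
        let (tx, mut rx) = mpsc::unbounded_channel::<IoFuture>();
        tokio::spawn(async move {
            let mut in_flight = FuturesUnordered::new();
            loop {
                tokio::select! {
                    // New opaque IO futures arrive from the submitting task.
                    maybe_io = rx.recv() => match maybe_io {
                        Some(io) => in_flight.push(io),
                        None => break, // all senders dropped
                    },
                    // Drive already-submitted IOs to completion concurrently.
                    Some(()) = in_flight.next(), if !in_flight.is_empty() => {}
                }
            }
            // Drain remaining IOs so none are left dangling.
            while in_flight.next().await.is_some() {}
        });
        Self { tx }
    }

    /// "Spawn" an IO future; its result is delivered via a oneshot receiver.
    fn spawn_io<T, F>(&self, io: F) -> oneshot::Receiver<T>
    where
        T: Send + 'static,
        F: Future<Output = T> + Send + 'static,
    {
        let (res_tx, res_rx) = oneshot::channel();
        let fut: IoFuture = Box::pin(async move {
            let _ = res_tx.send(io.await);
        });
        let _ = self.tx.send(fut);
        res_rx
    }
}
```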

#### Alternatives Explored

A few words on the rationale behind having a sidecar *task* and what
alternatives were considered.

One option is to queue up all IO futures in a FuturesUnordered that is
polled
the first time when we `collect_pending_ios`.

Firstly, the IO futures are opaque, compiler-generated futures that need
to be polled at least once to submit their IO. "At least once" because
tokio-epoll-uring may not be able to submit the IO to the kernel on
first
poll right away.

Second, there are deadlocks if we don't drive the IO futures to
completion
independently of the spawning task.
The reason is that both the IO futures and the spawning task may hold
some
_and_ try to acquire _more_ shared limited resources.
For example, both spawning task and IO future may try to acquire
* a VirtualFile file descriptor cache slot async mutex (observed during
impl)
* a tokio-epoll-uring submission slot (observed during impl)
* a PageCache slot (currently this is not the case but we may move more
code into the IO futures in the future)

Another option is to spawn a short-lived `tokio::task` for each IO
future.
We implemented and benchmarked it during development, but found little
throughput improvement and moderate mean & tail latency degradation.
Concerns about pressure on the tokio scheduler made us discard this
variant.

The sidecar task could be obsoleted if the IOs were not arbitrary code
but a well-defined struct.
However,
1. the opaque futures approach taken in this PR allows leaving the
existing
   code unchanged, which
2. allows us to implement the `IoConcurrency::Sequential` mode for
feature-gating
   the change.

Once the new mode sidecar task implementation is rolled out everywhere,
and `::Sequential` removed, we can think about a descriptive submission
& completion interface.
The problems around deadlocks pointed out earlier will need to be solved
then.
For example, we could eliminate VirtualFile file descriptor cache and
tokio-epoll-uring slots.
The latter has been drafted in
https://github.com/neondatabase/tokio-epoll-uring/pull/63.

See the lengthy doc comment on `spawn_io()` for more details.

### Error handling

There are two error classes during reconstruct data retrieval:
* traversal errors: index lookup, move to next layer, and the like
* value read IO errors

A traversal error fails the entire get_vectored request, as before this
PR.
A value read error only fails that value.

In any case, we preserve the existing behavior that once
`get_vectored` returns, all IOs are done. Panics and failing
to poll `get_vectored` to completion will leave the IOs dangling,
which is safe but shouldn't happen, and so, a rate-limited
log statement will be emitted at warning level.
There is a doc comment on `collect_pending_ios` giving more code-level
details and rationale.

### Feature Gating

The new behavior is opt-in via pageserver config.
The `Sequential` mode is the default.
The only significant change in `Sequential` mode compared to before
this PR is the buffering of results in the `oneshot`s.

## Code-Level Changes

Prep work:
  * Make `GateGuard` clonable.

Core Feature:
* Traversal code: track  `will_init` in `BlobMeta` and source it from
the Delta/Image/InMemory layer index, instead of determining `will_init`
  after we've read the value. This avoids having to read the value to
  determine whether traversal can stop.
* Introduce `IoConcurrency` & its sidecar task.
  * `IoConcurrency` is the clonable handle.
  * It connects to the sidecar task via an `mpsc`.
* Plumb through `IoConcurrency` from high level code to the
  individual layer implementations' `get_values_reconstruct_data`.
  We piggy-back on the `ValuesReconstructState` for this.
   * The sidecar task should be long-lived, so, `IoConcurrency` needs
     to be rooted up "high" in the call stack.
   * Roots as of this PR:
     * `page_service`: outside of pagestream loop
     * `create_image_layers`: when it is called
     * `basebackup` (only auxfiles + replorigin + SLRU segments)
   * Code with no roots that uses `IoConcurrency::sequential`
     * any `Timeline::get` call
       * `collect_keyspace` is a good example
       * follow-up: https://github.com/neondatabase/neon/issues/10460
* `TimelineAdaptor` code used by the compaction simulator, unused in
practice
     * `ingest_xlog_dbase_create`
* Transform Delta/Image/InMemoryLayer to
  * do their values IO in a distinct `async {}` block
  * extend the residence of the Delta/Image layer until the IO is done
  * buffer their results in a `oneshot` channel instead of straight
    in `ValuesReconstructState` 
* the `oneshot` channel is wrapped in `OnDiskValueIo` /
`OnDiskValueIoWaiter`
    types that aid in expressiveness and are used to keep track of
    in-flight IOs so we can print warnings if we leave them dangling.
* Change `ValuesReconstructState` to hold the receiving end of the
 `oneshot` channel aka `OnDiskValueIoWaiter`.
* Change `get_vectored_impl` to `collect_pending_ios` and issue walredo
concurrently, in a `FuturesUnordered`.

Testing / Benchmarking:
* Support queue-depth in pagebench for manual benchmarking.
* Add test suite support for setting concurrency mode ps config
   field via a) an env var and b) via NeonEnvBuilder.
* Hacky helper to have sidecar-based IoConcurrency in tests.
   This will be cleaned up later.

More benchmarking will happen post-merge in nightly benchmarks, plus in
staging/pre-prod.
Some intermediate helpers for manual benchmarking have been preserved in
https://github.com/neondatabase/neon/pull/10466 and will be landed in
later PRs.
(L0 layer stack generator!)

Drive-By:
* test suite actually didn't enable batching by default because
`config.compatibility_neon_binpath` is always Truthy in our CI
environment
  => https://neondb.slack.com/archives/C059ZC138NR/p1737490501941309
* initial logical size calculation wasn't always polled to completion,
which was
  surfaced through the added WARN logs emitted when dropping a 
  `ValuesReconstructState` that still has inflight IOs.
* remove the timing histograms
`pageserver_getpage_get_reconstruct_data_seconds`
and `pageserver_getpage_reconstruct_seconds` because with planning,
value read
IO, and walredo happening concurrently, one can no longer attribute
latency
to any one of them; we'll revisit this when Vlad's work on
tracing/sampling
  through RequestContext lands.
* remove code related to `get_cached_lsn()`.
  The logic around this has been dead at runtime for a long time,
  ever since the removal of the materialized page cache in #8105.

## Testing

Unit tests use the sidecar task by default and run both modes in CI.
Python regression tests and benchmarks also use the sidecar task by
default.
We'll test more in staging and possibly preprod.

# Future Work

Please refer to the parent epic for the full plan.

The next step will be to fold the plumbing of IoConcurrency
into RequestContext so that the function signatures get cleaned up.

Once `Sequential` isn't used anymore, we can take the next
big leap which is replacing the opaque IOs with structs
that have well-defined semantics.

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
2025-01-22 15:30:23 +00:00
Alexey Kondratov
881e351f69 feat(compute): Allow installing both 0.8.0 and 0.7.4 pgvector (#10345)
## Problem

Both these versions are binary compatible, but the way pgvector
structures the SQL files forbids installing 0.7.4 if you have a 0.8.0
distribution. Yet, some users may need a previous version for backward
compatibility, e.g., when restoring a dump.

See this thread for discussion

https://neondb.slack.com/archives/C04DGM6SMTM/p1735911490242919?thread_ts=1731343604.259169&cid=C04DGM6SMTM

## Summary of changes

Put `vector--0.7.4.sql` file into compute image to allow installing this
version as well.

Tested on staging and it seems to be working as expected:
```sql
select * from pg_available_extensions where name = 'vector';
  name  | default_version | installed_version |                       comment                        
--------+-----------------+-------------------+------------------------------------------------------
 vector | 0.8.0           | (null)            | vector data type and ivfflat and hnsw access methods

create extension vector version '0.7.4';

select * from pg_available_extensions where name = 'vector';
  name  | default_version | installed_version |                       comment                        
--------+-----------------+-------------------+------------------------------------------------------
 vector | 0.8.0           | 0.7.4             | vector data type and ivfflat and hnsw access methods

alter extension vector update;

select * from pg_available_extensions where name = 'vector';
  name  | default_version | installed_version |                       comment                        
--------+-----------------+-------------------+------------------------------------------------------
 vector | 0.8.0           | 0.8.0             | vector data type and ivfflat and hnsw access methods

drop extension vector;
create extension vector;

select * from pg_available_extensions where name = 'vector';
  name  | default_version | installed_version |                       comment                        
--------+-----------------+-------------------+------------------------------------------------------
 vector | 0.8.0           | 0.8.0             | vector data type and ivfflat and hnsw access methods
```

If we find out it's a good approach, we can adopt the same for other
extensions with a stable ABI -- support both `current` and `current - 1`
releases.
2025-01-22 12:38:23 +00:00
Christian Schwarz
b31ce14083 initial logical size calculation: always poll to completion (#10471)
# Refs

- extracted from https://github.com/neondatabase/neon/pull/9353

# Problem

Before this PR, when task_mgr shutdown is signalled, e.g. during
pageserver shutdown or Tenant shutdown, initial logical size calculation
stops polling and drops the future that represents the calculation.

This is against the current policy that we poll all futures to
completion.

This became apparent during development of concurrent IO which warns if
we drop a `Timeline::get_vectored` future that still has in-flight IOs.

We may revise the policy in the future, but, right now initial logical
size calculation is the only part of the codebase that doesn't adhere to
the policy, so let's fix it.

## Code Changes

- make the calculation sensitive exclusively to `Timeline::cancel`
- This should be sufficient for all cases of shutdowns; the sensitivity
to task_mgr shutdown is unnecessary.
- this broke the various cancel tests in `test_timeline_size.py`, e.g.,
`test_timeline_initial_logical_size_calculation_cancellation`
- the tests would time out because the await point was not sensitive to
cancellation
- to fix this, refactor `pausable_failpoint` so that it accepts a
cancellation token
- side note: we _really_ should write our own failpoint library; maybe
after we get heap-allocated RequestContext, we can plumb failpoints
through there.
2025-01-22 12:28:26 +00:00
Christian Schwarz
b4d87b9dfe fix(tests): actually enable pipelinig by default in the test suite (#10472)
## Problem

PR #9993 was supposed to enable `page_service_pipelining` by default for
all `NeonEnv`s, but this was ineffective in our CI environment.

Thus, CI Python-based tests and benchmarks, unless explicitly
configuring pipelining, were still using serial protocol handling.

## Analysis

The root cause was that in our CI environment,
`config.compatibility_neon_binpath` is always Truthy.
It's not in local environments, which is why this slipped through in
local testing.

Lesson: always add a log line to pageserver startup and spot-check tests
to ensure the intended default is picked up.

## Summary of changes

Fix it. Since enough time has passed, the compatibility snapshot contains
a recent enough software version so we don't need to worry about
`compatibility_neon_binpath` anymore.

## Future Work

The question of how to add a new default except for compatibility tests,
which is what the broken code was supposed to do, is still unsolved.

Slack discussion:
https://neondb.slack.com/archives/C059ZC138NR/p1737490501941309
2025-01-22 10:10:43 +00:00
Conrad Ludgate
2b49d6ee05 feat: adjust the tonic features to remove axum dependency (#10348)
To help facilitate an upgrade to axum 0.8
(https://github.com/neondatabase/neon/pull/10332#pullrequestreview-2541989619)
this massages the tonic dependency features so that tonic does not
depend on axum.
2025-01-22 09:15:52 +00:00
Erik Grinaker
14e1f89053 pageserver: eagerly notify flush waiters (#10469)
## Problem

Currently, the layer flush loop will continue flushing layers as long as
any are pending, and only notify waiters once there are no further
layers to flush. This can cause waiters to wait longer than necessary,
and potentially starve them if pending layers keep arriving faster than
they can be flushed. The impact of this will increase when we add
compaction backpressure and propagate it up into the WAL receiver.

Extracted from #10405.

## Summary of changes

Break out of the layer flush loop once we've flushed up to the requested
LSN. If further flush requests have arrived in the meanwhile, flushing
will resume immediately after.
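
A minimal sketch of the changed loop shape (with hypothetical helper types; the real flush loop works on frozen in-memory layers and remote uploads):

```rust
#[derive(Clone, Copy, PartialEq, PartialOrd)]
struct Lsn(u64);

// Flush frozen layers until we have reached the LSN the waiters asked for,
// then notify them and break, instead of continuing until nothing is pending.
fn flush_loop_iteration(
    frozen_layer_end_lsns: &mut Vec<Lsn>, // oldest first
    requested_lsn: Lsn,
    notify_waiters: impl Fn(Lsn),
) {
    while let Some(layer_end) = frozen_layer_end_lsns.first().copied() {
        frozen_layer_end_lsns.remove(0); // stand-in for flushing the layer
        if layer_end >= requested_lsn {
            // Waiters only asked for `requested_lsn`: wake them now rather
            // than after all pending layers (which may keep arriving) flush.
            notify_waiters(layer_end);
            break;
        }
    }
}
```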
2025-01-21 22:01:27 +00:00
Erik Grinaker
8a8c656c06 pageserver: add LayerMap::watch_layer0_deltas() (#10470)
## Problem

For compaction backpressure, we need a mechanism to signal when
compaction has reduced the L0 delta layer count below the backpressure
threshold.

Extracted from #10405.

## Summary of changes

Add `LayerMap::watch_level0_deltas()` which returns a
`tokio::sync::Receiver` signalling the current L0 delta layer
count.
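
A small sketch of the idea, assuming a tokio watch channel carries the L0 delta count (names here are illustrative):

```rust
use tokio::sync::watch;

// The layer map publishes the current L0 delta layer count...
struct Level0CounterSketch {
    tx: watch::Sender<usize>,
}

impl Level0CounterSketch {
    fn new() -> Self {
        Self { tx: watch::channel(0).0 }
    }
    fn set(&self, count: usize) {
        self.tx.send_replace(count);
    }
    fn watch_level0_deltas(&self) -> watch::Receiver<usize> {
        self.tx.subscribe()
    }
}

// ...and the backpressure side can await until it drops below a threshold.
async fn wait_below(mut rx: watch::Receiver<usize>, threshold: usize) {
    let _ = rx.wait_for(|count| *count < threshold).await;
}
```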
2025-01-21 21:18:09 +00:00
Erik Grinaker
a75e11cc00 pageserver: return duration from StorageTimeMetricsTimer (#10468)
## Problem

It's sometimes useful to obtain the elapsed duration from a
`StorageTimeMetricsTimer` for purposes beyond just recording it in
metrics (e.g. to log it).

Extracted from #10405.

## Summary of changes

Add `StorageTimeMetricsTimer.elapsed()` and return the duration from
`stop_and_record()`.
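
A minimal sketch of the shape of this change (not the real metrics type):

```rust
use std::time::{Duration, Instant};

struct TimerSketch {
    start: Instant,
}

impl TimerSketch {
    fn start() -> Self {
        Self { start: Instant::now() }
    }
    // New: expose the elapsed time without consuming the timer.
    fn elapsed(&self) -> Duration {
        self.start.elapsed()
    }
    // Record into a metrics sink (stand-in closure here) and also return the
    // duration so the caller can log it.
    fn stop_and_record(self, record: impl FnOnce(Duration)) -> Duration {
        let elapsed = self.elapsed();
        record(elapsed);
        elapsed
    }
}
```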
2025-01-21 20:56:34 +00:00
Alex Chi Z.
7d4bfcdc47 feat(pageserver): add config items for gc-compaction auto trigger (#10455)
## Problem

part of https://github.com/neondatabase/neon/issues/9114

The automatic trigger is already implemented at
https://github.com/neondatabase/neon/pull/10221 but I need to write some
tests and finish my experiments in staging before I can merge it with
confidence. Given that I have some other patches that will modify the
config items, I'd like to get the config items merged first to reduce
conflicts.

## Summary of changes

* add `l2_lsn` to index_part.json -- below that LSN, data have been
processed by gc-compaction
* add a set of gc-compaction auto trigger control items into the config

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-01-21 19:29:38 +00:00
a-masterov
737888e5c9 Remove the tests for pg_anon (#10382)
## Problem
We are removing the `pg_anon` v1 extension from Neon. So we don't need
to test it anymore and can remove the code for simplicity.
## Summary of changes
The code required for testing `pg_anon` is removed.
2025-01-21 19:17:14 +00:00
Gleb Novikov
19bf7b78a0 fast import: basic python test (#10271)
We did not have any tests for the fast_import binary yet.

In this PR I have introduced:
- `FastImport` class and tools for testing in python
- basic test that runs fast import against vanilla postgres and checks
that data is there

Should be merged after https://github.com/neondatabase/neon/pull/10251
2025-01-21 16:50:44 +00:00
Arpad Müller
7e4a39ea53 Fix two flakiness sources in test_scrubber_physical_gc_ancestors (#10457)
We currently have some flakiness in
`test_scrubber_physical_gc_ancestors`, see #10391.

The first flakiness kind is about the reconciler not actually becoming
idle within the timeout of 30 seconds. We see continuous forward
progress so this is likely not a hang. We also see this happen in
parallel to a test failure, so is likely due to runners being
overloaded. Therefore, we increase the timeout.

The second flakiness kind is an assertion failure. This one is a little
bit more tricky, but we saw in the successful run that there was some
advance of the lsn between the compaction ran (which created layer
files) and the gc run. Apparently gc rejects reductions to the single
image layer setting if the cutoff lsn is the same as the lsn of the
image layer: it will claim that that layer is newer than the space
cutoff and therefore skip it, while thinking the old layer (that we want
to delete) is the latest one (so it's not deleted).

We address the second flakiness kind by inserting a tiny amount of WAL
between the compaction and gc. This should hopefully fix things.

Related issue: #10391

(not closing it with the merger of the PR as we'll need to validate that
these changes had the intended effect).

Thanks to Chi for going over this together with me in a call.
2025-01-21 15:40:04 +00:00
JC Grünhage
624a507544 Create Github releases with empty body for now (#10448)
## Problem
When releasing `release-7574`, the Github Release creation failed with
"body is too long" (see
https://github.com/neondatabase/neon/actions/runs/12834025431/job/35792346745#step:5:77).
There's lots of room for improvement of the release notes, but for now
we'll disable them instead.

## Summary of changes
- Disable automatic generation of release notes for Github releases
- Enable creation of Github releases for proxy/compute
2025-01-21 12:45:21 +00:00
Arpad Müller
2ab9f69825 Simplify pageserver_physical_gc function (#10104)
This simplifies the code in `pageserver_physical_gc` a little bit after
the feedback in #10007 that the code is too complicated.

Most importantly, we don't pass around `GcSummary` any more in a
complicated fashion, and we save on async stream-combinator-inception in
one place in favour of `try_stream!{}`.

Follow-up of #10007
2025-01-20 21:57:15 +00:00
Alex Chi Z.
2de2b26c62 feat(pageserver): add reldir migration configs (#10439)
## Problem

Part of #9516 per RFC at https://github.com/neondatabase/neon/pull/10412

## Summary of changes

Adding the necessary config items and index_part items for the large
relation count work.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-01-20 20:44:12 +00:00
Matthias van de Meent
e781cf6dd8 Compute/LFC: Apply limits consistently (#10449)
Otherwise we might hit ERRORs in otherwise safe situations (such as user
queries), which isn't a great user experience.

## Problem

https://github.com/neondatabase/neon/pull/10376

## Summary of changes

Instead of treating internal errors as acceptable, we ensure we don't
exceed our allocated usage.
2025-01-20 18:29:21 +00:00
Christian Schwarz
72130d7d6c fix(page_service / handle): panic when parallel client disconnect & Timeline shutdown (#10445)
## Refs
- fixes https://github.com/neondatabase/neon/issues/10444

## Problem

We're seeing a panic `handles are only shut down once in their lifetime`
in our performance testbed.

## Hypothesis

Annotated code in
https://github.com/neondatabase/neon/issues/10444#issuecomment-2602286415.

```
T1: drop Cache, executes up to (1)
=> HandleInner is now in state ShutDown
T2: Timeline::shutdown => PerTimelineState::shutdown  executes shutdown() again => panics
```

Likely this snuck in during the final touches of #10386, where I narrowed down
the locking rules.

## Summary of changes

Make duplicate shutdowns a no-op.
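
A simplified sketch of the idempotent shutdown (hypothetical names; the real `HandleInner` holds more state):

```rust
use std::sync::Mutex;

enum HandleState {
    Open,
    ShutDown,
}

struct HandleInnerSketch {
    state: Mutex<HandleState>,
}

impl HandleInnerSketch {
    fn shutdown(&self) {
        let mut state = self.state.lock().unwrap();
        if matches!(*state, HandleState::ShutDown) {
            // Already shut down via the other path (Cache drop vs.
            // Timeline::shutdown): the duplicate call is a no-op.
            return;
        }
        *state = HandleState::ShutDown;
        // ...actual teardown would happen here...
    }
}
```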
2025-01-20 17:51:30 +00:00
John Spray
2657b7ec75 rfcs: add sharded ingest RFC (#8754)
## Summary

Whereas currently we send all WAL to all pageserver shards, and each
shard filters out the data that it needs,
in this RFC we add a mechanism to filter the WAL on the safekeeper, so
that each shard receives
only the data it needs.

This will place some extra CPU load on the safekeepers, in exchange for
reducing the network bandwidth
for ingesting WAL back to scaling as O(1) with shard count, rather than
O(N_shards).

Touches #9329.

---------

Co-authored-by: Vlad Lazar <vlalazar.vlad@gmail.com>
Co-authored-by: Vlad Lazar <vlad@neon.tech>
2025-01-20 17:33:07 +00:00
Christian Schwarz
02fc58b878 impr(timeline handles): add more tests covering reference cyle (#10446)
The other tests focus on the external interface usage, while the tests
added in this PR add some testing around HandleInner's lifecycle,
ensuring we don't leak it once either the connection gets dropped or
the per-timeline state is shut down explicitly.
2025-01-20 14:37:24 +00:00
Arpad Müller
b312a3c320 Move DeleteTimelineFlow::prepare to separate function and use enum (#10334)
It was requested by review in #10305 to use an enum or something like it
for distinguishing the different modes instead of two parameters,
because two flags allow four combinations, and two of them don't really
make sense / aren't used.

follow-up of #10305
2025-01-20 12:50:44 +00:00
John Spray
7d761a9d22 storage controller: make chaos less disruptive to AZ locality (#10438)
## Problem

Since #9916, the chaos code is actively fighting the optimizer: tenants
tend to be attached in their preferred AZ, so most chaos migrations were
moving them to a non-preferred AZ.

## Summary of changes

- When picking migrations, prefer to migrate things _toward_ their
preferred AZ when possible. Then pick shards to move the other way when
necessary.

The resulting behavior should be an alternating "back and forth" where
the chaos code migrates things away from home, and then migrates them
back on the next iteration.

The side effect will be that the chaos code actively helps to push
things into their home AZ. That's not contrary to its purpose though: we
mainly just want it to continuously migrate things to exercise
migration+notification code.
2025-01-20 09:47:23 +00:00
John Spray
8bdaee35f3 pageserver: safety checks on validity of uploaded indices (#10403)
## Problem

Occasionally, we encounter bugs in test environments that can be
detected at the point of uploading an index, but we proceed to upload it
anyway and leave a tenant in a broken state that's awkward to handle.

## Summary of changes

- Validate index when submitting it for upload, so that we can see the
issue quickly e.g. in an API invoking compaction
- Validate index before executing the upload, so that we have a hard
enforcement that any code path that tries to upload an index will not
overwrite a valid index with an invalid one.
2025-01-20 09:20:31 +00:00
Arpad Müller
b0f34099f9 Add safekeeper utilization endpoint (#10429)
Add an endpoint to obtain the utilization of a safekeeper. Future
changes to the storage controller can use this endpoint to find the most
suitable safekeepers for newly created timelines, analogously to how
it's done for pageservers already.

Initially we just want to assign by timeline count, then we can iterate
from there.

Part of https://github.com/neondatabase/neon/issues/9011
2025-01-17 21:43:52 +00:00
Vlad Lazar
6975228a76 pageserver: add initdb metrics (#10434)
## Problem

Initdb observability is poor.

## Summary of changes

Add some metrics so we can figure out which part, if any, is slow.

Closes https://github.com/neondatabase/neon/issues/10423
2025-01-17 14:51:33 +00:00
JC Grünhage
053abff71f Fix dependency on neon-image in promote-images-dev (#10437)
## Problem
871e8b325f failed CI on main because a job
ran too soon. This was caused by
ea84ec357f. While `promote-images-dev`
does not inherently need `neon-image`, a few jobs depending on
`promote-images-dev` do need it, and previously had it when it was
`promote-images`, which depended on `test-images`, which in turn
depended on `neon-image`.

## Summary of changes
To ensure jobs depending on `docker.io/neondatabase/neon` images get them,
`promote-images-dev` gets the dependency to `neon-image` back which it
previously had transitively through `test-images`.
2025-01-17 14:21:30 +00:00
Tristan Partin
871e8b325f Use the request ID given by the control plane in compute_ctl (#10418)
Instead of generating our own request ID, we can just use the one
provided by the control plane. In the event that we get a request from a
client which doesn't set X-Request-ID, we just generate one, which
is useful for tracing purposes.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-01-16 20:46:53 +00:00
Christian Schwarz
c47c5f4ace fix(page_service pipelining): tenant cannot shut down because gate kept open while flushing responses (#10386)
# Refs

- fixes https://github.com/neondatabase/neon/issues/10309
- fixup of batching design, first introduced in
https://github.com/neondatabase/neon/pull/9851
- refinement of https://github.com/neondatabase/neon/pull/8339

# Problem

`Tenant::shutdown` was occasionally taking many minutes (sometimes up to
20) in staging and prod if the
`page_service_pipelining.mode="concurrent-futures"` is enabled.

# Symptoms

The issue happens during shard migration between pageservers.
There is page_service unavailability and hence effectively downtime for
customers in the following case:
1. The source (state `AttachedStale`) gets stuck in `Tenant::shutdown`,
waiting for the gate to close.
2. Cplane/Storcon decides to transition the target `AttachedMulti` to
`AttachedSingle`.
3. That transition comes with a bump of the generation number, causing
the `PUT .../location_config` endpoint to do a full `Tenant::shutdown` /
`Tenant::attach` cycle for the target location.
4. That `Tenant::shutdown` on the target gets stuck, waiting for the
gate to close.
5. Eventually the gate closes (`close completed`), correlating with a
`page_service` connection handler logging that it's exiting because of a
network error (`Connection reset by peer` or `Broken pipe`).

While in (4):
- `Tenant::shutdown` is stuck waiting for all `Timeline::shutdown` calls
to complete.
  So, really, this is a `Timeline::shutdown` bug.
- retries from Cplane/Storcon to complete above state transitions, fail
with errors related to the tenant mgr slot being in state
`TenantSlot::InProgress`, the tenant state being
`TenantState::Stopping`, and the timelines being in
`TimelineState::Stopping`, and the `Timeline::cancel` being cancelled.
- Existing (and/or new?) page_service connections log errors `error
reading relation or page version: Not found: Timed out waiting 30s for
tenant active state. Latest state: None`

# Root-Cause

After a lengthy investigation ([internal
write-up](https://www.notion.so/neondatabase/2025-01-09-batching-deadlock-Slow-Log-Analysis-in-Staging-176f189e00478050bc21c1a072157ca4?pvs=4))
I arrived at the following root cause.

The `spsc_fold` channel (`batch_tx`/`batch_rx`) that connects the
Batcher and Executor stages of the pipelined mode was storing a `Handle`
and thus `GateGuard` of the Timeline that was not shutting down.
The design assumption with pipelining was that this would always be a
short transient state.
However, that was incorrect: the Executor was stuck on writing/flushing
an earlier response into the connection to the client, i.e., socket
write being slow because of TCP backpressure.

The probable scenario of how we end up in that case:
1. Compute backend process sends a continuous stream of getpage prefetch
requests into the connection, but never reads the responses (why this
happens: see Appendix section).
2. Batch N is processed by Batcher and Executor, up to the point where
Executor starts flushing the response.
3. Batch N+1 is processed by Batcher and queued in the `spsc_fold`.
4. Executor is still waiting for batch N flush to finish.
5. Batcher eventually hits the `TimeoutReader` error (10min).
From here on it waits on the
`spsc_fold.send(Err(QueryError(TimeoutReader_error)))`
which will never finish because the batch already inside the `spsc_fold`
is not
being read by the Executor, because the Executor is still stuck in the
flush.
   (This state is not observable at our default `info` log level)
6. Eventually, Compute backend process is killed (`close()` on the
socket) or Compute as a whole gets killed (probably no clean TCP
shutdown happening in that case).
7. Eventually, Pageserver TCP stack learns about (6) through RST packets
and the Executor's flush() call fails with an error.
8. The Executor exits, dropping `cancel_batcher` and its end of the
spsc_fold.
   This wakes Batcher, causing the `spsc_fold.send` to fail.
   Batcher exits.
   The pipeline shuts down as intended.
We return from `process_query` and log the `Connection reset by peer` or
`Broken pipe` error.

The following diagram visualizes the wait-for graph at (5)

```mermaid
flowchart TD
   Batcher --spsc_fold.send(TimeoutReader_error)--> Executor
   Executor --flush batch N responses--> socket.write_end
   socket.write_end --wait for TCP window to move forward--> Compute
```

# Analysis

By holding the GateGuard inside the `spsc_fold` open, the pipelining
implementation
violated the principle established in
(https://github.com/neondatabase/neon/pull/8339).
That is, that `Handle`s must only be held across an await point if that
await point
is sensitive to the `<Handle as Deref<Target=Timeline>>::cancel` token.

In this case, we were holding the Handle inside the `spsc_fold` while
awaiting the
`pgb_writer.flush()` future.

One may jump to the conclusion that we should simply peek into the
spsc_fold to get
that Timeline cancel token and be sensitive to it during flush, then.

But that violates another principle of the design from
https://github.com/neondatabase/neon/pull/8339.
That is, that the page_service connection lifecycle and the Timeline
lifecycles must be completely decoupled.
It must be possible to shut down one shard without shutting down the
page_service connection, because on that single connection we might be
serving other shards attached to this pageserver.
(The current compute client opens separate connections per shard, but,
there are plans to change that.)

# Solution

This PR adds a `handle::WeakHandle` struct that does _not_ hold the
timeline gate open.
It must be `upgrade()`d to get a `handle::Handle`.
That `handle::Handle` _does_ hold the timeline gate open.

The batch queued inside the `spsc_fold` only holds a `WeakHandle`.
We only upgrade it while calling into the various `handle_` methods,
i.e., while interacting with the `Timeline` via `<Handle as
Deref<Target=Timeline>>`.
All that code has always been required to be (and is!) sensitive to
`Timeline::cancel`, and therefore we're guaranteed to bail from it
quickly when `Timeline::shutdown` starts.
We will drop the `Handle` immediately, before we start
`pgb_writer.flush()`ing the responses.
Thereby letting go of our hold on the `GateGuard`, allowing the timeline
shutdown to complete while the page_service handler remains intact.
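
A heavily simplified sketch of the split (the real types live in `mod handle` and hold a timeline gate guard rather than the plain flag used here):

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Weak};

struct TimelineSketch {
    // Stand-in for "the timeline gate has been closed / shutdown started".
    shut_down: AtomicBool,
}

/// Stored in the batch inside the spsc_fold: does NOT keep the timeline alive
/// or its gate open.
struct WeakHandleSketch {
    timeline: Weak<TimelineSketch>,
}

/// Held only while actually calling into the Timeline.
struct HandleSketch {
    timeline: Arc<TimelineSketch>,
}

impl WeakHandleSketch {
    fn upgrade(&self) -> Option<HandleSketch> {
        let timeline = self.timeline.upgrade()?;
        if timeline.shut_down.load(Ordering::Acquire) {
            // Shutdown has started; refuse to re-enter so we don't block it.
            return None;
        }
        Some(HandleSketch { timeline })
    }
}

impl HandleSketch {
    fn downgrade(&self) -> WeakHandleSketch {
        WeakHandleSketch { timeline: Arc::downgrade(&self.timeline) }
    }
}
```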

# Code Changes

* Reproducer & Regression Test
* Developed and proven to reproduce the issue in
https://github.com/neondatabase/neon/pull/10399
* Add a `Test` message to the pagestream protocol (`cfg(feature =
"testing")`).
* Drive-by minimal improvement to the parsing code, we now have a
`PagestreamFeMessageTag`.
* Refactor `pageserver/client` to allow sending and receiving
`page_service` requests independently.
  * Add a Rust helper binary to produce situation (4) from above
* Rationale: (4) and (5) are the same bug class, we're holding a gate
open while `flush()`ing.
* Add a Python regression test that uses the helper binary to
demonstrate the problem.
* Fix
   * Introduce and use `WeakHandle` as explained earlier.
* Replace the `shut_down` atomic with two enum states for `HandleInner`,
wrapped in a `Mutex`.
* To make `WeakHandle::upgrade()` and `Handle::downgrade()`
cache-efficient:
     * Wrap the `Types::Timeline` in an `Arc`
     * Wrap the `GateGuard` in an `Arc`
* The separate `Arc`s enable uncontended cloning of the timeline
reference in `upgrade()` and `downgrade()`.
If we instead cloned a shared `Arc<Timeline>` directly, different connection
handlers would be hitting the same cache line on every upgrade()/downgrade(),
causing contention.
* Please read the updated module-level comment in `mod handle` for details.

# Testing & Performance

The reproducer test that failed before the changes now passes, and
obviously other tests are passing as well.

We'll do more testing in staging, where the issue happens every ~4h if
chaos migrations are enabled in storcon.

Existing perf testing will be sufficient; no perf degradation is
expected.
There are a few more allocations due to the added `Arc`s, but they're
low-frequency.

# Appendix: Why Compute Sometimes Doesn't Read Responses

Remember, the whole problem surfaced because flush() was slow because
Compute was not reading responses. Why is that?

In short, the way the compute works, it only advances the page_service
protocol processing when it has an interest in data, i.e., when the
pagestore smgr is called to return pages.

Thus, if compute issues a bunch of requests as part of prefetch but then
it turns out it can service the query without reading those pages, it
may very well happen that these messages stay in the TCP connection
until the next smgr read happens, either in that session, or possibly in
another session.

If there are too many unread responses in the TCP connection, the kernel
on the pageserver side is going to backpressure into userspace, resulting
in our stuck flush().

All of this stems from the way vanilla Postgres does prefetching and
"async IO":
it issues `fadvise()` to make the kernel do the IO in the background,
buffering results in the kernel page cache.
It then consumes the results through synchronous `read()` system calls,
which hopefully will be fast because of the `fadvise()`.
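
For illustration, here is a minimal sketch of that pattern (written in
Rust via the `libc` crate and assuming Linux, standing in for what
vanilla Postgres does in C):

```rust
use std::os::unix::fs::FileExt;
use std::os::unix::io::AsRawFd;

fn prefetch_then_read(file: &std::fs::File, offset: i64, len: usize) -> std::io::Result<Vec<u8>> {
    // "Prefetch": hint the kernel to start reading this range in the background.
    unsafe {
        libc::posix_fadvise(file.as_raw_fd(), offset, len as i64, libc::POSIX_FADV_WILLNEED);
    }
    // ...later, if (and only if) the pages are actually needed, do the
    // synchronous read, which hopefully hits the kernel page cache:
    let mut buf = vec![0u8; len];
    file.read_exact_at(&mut buf, offset as u64)?;
    Ok(buf)
}
```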

If it turns out that some / all of the prefetch results are not needed,
Postgres will not be issuing those `read()` system calls.
The kernel will eventually react to that by reusing page cache pages
that hold completed prefetched data.
Uncompleted prefetch requests may or may not be processed -- it's up to
the kernel.

In Neon, the smgr + Pageserver together take on the role of the kernel
in above paragraphs.
In the current implementation, all prefetches are sent as GetPage
requests to Pageserver.
The responses are only processed in the places where vanilla Postgres
would do the synchronous `read()` system call.
If we never get to that, the responses are queued inside the TCP
connection, which, once buffers run full, will backpressure into
Pageserver's sending code, i.e., the `pgb_writer.flush()` that was the
root cause of the problems we're fixing in this PR.
2025-01-16 20:34:02 +00:00
Tristan Partin
b0838a68e5 Enable pgx_ulid on Postgres 17 (#10397)
The extension now supports Postgres 17. The release also seems to be
binary compatible with the previous version.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-01-16 19:49:04 +00:00
John Spray
8f2ebc0684 tests: stabilize test_storage_controller_node_deletion (#10420)
## Problem

`test_storage_controller_node_deletion` sometimes failed because shards
were moving around during timeline creation, and neon_local isn't
tolerant of that. The movements were unexpected because the shards had
only just been created.

This was a regression from #9916

Closes: #10383 

## Summary of changes

- Make this test use multiple AZs -- this makes the storage controller's
scheduling reliably stable

Why this works: in #9916 , I made a simplifying assumption that we would
have multiple AZs to get nice stable scheduling -- it's much easier,
because each tenant has a well defined primary+secondary location when
they have an AZ preference and nodes have different AZs. Everything
still works if you don't have multiple AZs, but you just have this quirk
that sometimes the optimizer can disagree with initial scheduling, so
once in a while a shard moves after being created -- annoying for tests,
harmless IRL.
2025-01-16 19:00:16 +00:00
Vlad Lazar
3a285a046b pageserver: include node id when subscribing to SK (#10432)
## Problem

All pageserver have the same application name which makes it hard to
distinguish them.

## Summary of changes

Include the node id in the application name sent to the safekeeper. This
should give us more visibility in logs. There are a few metrics that will
increase in cardinality by `pageserver_count`, but that's fine.
2025-01-16 18:51:56 +00:00
John Spray
da13154791 storcon: revise fill logic to prioritize AZ (#10411)
## Problem

Node fills were limited to moving (total shards / node_count) shards. In
systems that aren't perfectly balanced already, that leads us to skip
migrating some of the shards that belong on this node, generating work
for the optimizer later to gradually move them back.

## Summary of changes

- Where a shard has a preferred AZ and is currently attached outside
this AZ, then always promote it during fill, irrespective of target fill
count
2025-01-16 17:33:46 +00:00
John Spray
2e13a3aa7a storage controller: handle legacy TenantConf in consistency_check (#10422)
## Problem

We were comparing serialized configs from the database with serialized
configs from memory. If fields have been added/removed to TenantConfig,
this generates spurious consistency errors. This is fine in test
environments, but limits the usefulness of this debug API in the field.

Closes: https://github.com/neondatabase/neon/issues/10369

## Summary of changes

- Do a decode/encode cycle on the config before comparing it, so that it
will have exactly the expected fields.
2025-01-16 16:56:44 +00:00
Alex Chi Z.
cccc196848 refactor(pageserver): make partitioning an ArcSwap (#10377)
## Problem

gc-compaction needs the partitioning data to decide the job split. This
refactor allows concurrent access/computing the partitioning.

## Summary of changes

Make `partitioning` an ArcSwap so that others can access the
partitioning while we compute it. Fully eliminate the `repartition is
called concurrently` warning when gc-compaction is going on.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-01-16 15:33:37 +00:00
Arpad Müller
e436dcad57 Rename "disabled" safekeeper scheduling policy to "pause" (#10410)
Rename the safekeeper scheduling policy "disabled" to "pause".

A rename was requested in
https://github.com/neondatabase/neon/pull/10400#discussion_r1916259124,
as the "disabled" policy is meant to be analogous to the "pause" policy
for pageservers.

Also simplify the `SkSchedulingPolicyArg::from_str` function, relying on
the `from_str` implementation of `SkSchedulingPolicy`. Latter is used
for the database format as well, so it is quite stable. If we ever want
to change the UI, we'll need to duplicate the function again but this is
cheap.
2025-01-16 14:30:49 +00:00
John Spray
21d7b6a258 tests: refactor test_tenant_delete_races_timeline_creation (#10425)
## Problem

Threads spawned in `test_tenant_delete_races_timeline_creation` are not
joined before the test ends, and can generate
`PytestUnhandledThreadExceptionWarning` in other tests.


https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10419/12805365523/index.html#/testresult/53a72568acd04dbd

## Summary of changes

- Wrap threads in ThreadPoolExecutor which will join them before the
test ends
- Remove a spurious deletion call -- the background thread doing
deletion ought to succeed.
2025-01-16 14:11:33 +00:00
JC Grünhage
86dbc44db1 CI: Run check-codestyle-rust as part of pre-merge-checks (#10387)
## Problem

When multiple changes are grouped in a merge group to be merged as part
of the merge queue, the changes might individually pass
`check-codestyle-rust` but not in their combined form.

## Summary of changes

- Move `check-codestyle-rust` into a reusable workflow that is called
from its previous location in `build_and_test.yml`, and additionally
call it from `pre_merge_checks.yml`. The additional call does not run on
ARM, only x86, to ensure the merge queue continues being responsive.
- Trigger `pre_merge_checks.yml` on PRs that change any of the workflows
running in `pre_merge_checks.yml`, so that we get feedback on those
early and not only after trying to merge those changes.
2025-01-16 09:20:24 +00:00
Tristan Partin
58f6af6c9a Clean up compute_ctl extension server code (#10417) 2025-01-16 08:35:36 +00:00
Matthias van de Meent
7be971081a Make sure we request pages with a known-flushed LSN. (#10413)
This should fix the largest source of flakyness of
test_nbtree_pagesplit_cycleid.

## Problem

https://github.com/neondatabase/neon/issues/10390

## Summary of changes

By using a guaranteed-flushed LSN, we ensure that PS won't have to wait
forever.

(If it does wait forever, we know the issue can't be with Compute's WAL)
2025-01-16 08:34:11 +00:00
Arseny Sher
6fe4c6798f Add START_WAL_PUSH proto_version and allow_timeline_creation options. (#10406)
## Problem

As part of https://github.com/neondatabase/neon/issues/8614 we need to
pass options to START_WAL_PUSH.

## Summary of changes

Add two options. `allow_timeline_creation`, default true, which when set
to false disables implicit timeline creation in the connection from compute. Eventually
such creation will be forbidden completely, but as we migrate to
configurations we need to support both: current mode and configurations
enabled where creation by compute is disabled.

`proto_version` specifies the compute <-> sk protocol version. We currently
have it in the first greeting package as well, but I plan to change the tag
size from u64 to u8, which would make it hard to use. The command is a more
appropriate place for it anyway.
2025-01-16 08:01:19 +00:00
Matthias van de Meent
2eda484ef6 prefetch: Read more frequently from TCP buffer (#10394)
This reduces pressure on the OS TCP read buffer by increasing how often
we read data out of the receive buffer, and increasing the
number of bytes we can pull from that buffer when we do read.

## Problem

A backend may not always consume its prefetch data quick enough

## Summary of changes

We add a new function `prefetch_pump_state` which pulls as many prefetch
requests from the OS TCP receive buffer as possible, but without
blocking.

This reduces pressure on OS-level TCP buffers, increasing
throughput by limiting the throttling caused by full TCP buffers.
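
A minimal sketch of the "pump without blocking" idea (in Rust with a
non-blocking `std::net::TcpStream`; the real `prefetch_pump_state` lives
in the compute's C code):

```rust
use std::io::{ErrorKind, Read};
use std::net::TcpStream;

// Drain whatever the OS TCP receive buffer already holds into `sink`,
// but never block waiting for more data to arrive.
fn pump_receive_buffer(stream: &mut TcpStream, sink: &mut Vec<u8>) -> std::io::Result<usize> {
    stream.set_nonblocking(true)?;
    let mut total = 0;
    let mut buf = [0u8; 16 * 1024];
    loop {
        match stream.read(&mut buf) {
            Ok(0) => break, // connection closed
            Ok(n) => {
                sink.extend_from_slice(&buf[..n]);
                total += n;
            }
            Err(e) if e.kind() == ErrorKind::WouldBlock => break, // nothing more buffered
            Err(e) => return Err(e),
        }
    }
    stream.set_nonblocking(false)?;
    Ok(total)
}
```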
2025-01-16 02:43:47 +00:00
Mikhail Kot
c7429af8a0 Enable dblink (#10358)
Update compute image to include dblink #3720
2025-01-15 22:29:18 +00:00
Alex Chi Z.
a753349cb0 feat(pageserver): validate data integrity during gc-compaction (#10131)
## Problem

part of https://github.com/neondatabase/neon/issues/9114
part of investigation of
https://github.com/neondatabase/neon/issues/10049

## Summary of changes

* If `cfg!(test) or cfg!(feature = testing)`, then we will always try
generating an image to ensure the history is replayable, but not put the
image layer into the final layer results, therefore discovering wrong
key history before we hit a read error.
* I suspect it's easier to trigger some races if gc-compaction is
continuously run on a timeline, so I increased the frequency to twice
per 10 churns.
* Also, create branches in gc-compaction smoke tests to get more test
coverage.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Arpad Müller <arpad@neon.tech>
2025-01-15 22:04:06 +00:00
Gleb Novikov
55a68b28a2 fast import: restore to neondb (not postgres) database (#10251)
## Problem

`postgres` is system database at neon, so we need to do `pg_restore`
into `neondb` instead

https://github.com/neondatabase/cloud/issues/22100

## Summary of changes

Changed fast_import a little bit:
1. After a successful connection, create `neondb` in the postgres instance
2. Changed the restore connstring to use the new db
3. Added an optional `source_connection_string`, which allows skipping
`s3_prefix` and just connecting directly.
4. Added `-i` that stops the process until sigterm

## TODO
- [x] test image in cplane e2e
- [ ] Change import job image back to latest after this merged (partial
revert of https://github.com/neondatabase/cloud/pull/22338)
2025-01-15 20:51:09 +00:00
John Spray
fb0e2acb2f pageserver: add page_trace API for debugging (#10293)
## Problem

When a pageserver is receiving high rates of requests, we don't have a
good way to efficiently discover what the client's access pattern is.

Closes: https://github.com/neondatabase/neon/issues/10275

## Summary of changes

- Add
`/v1/tenant/x/timeline/y/page_trace?size_limit_bytes=...&time_limit_secs=...`
API, which returns a binary buffer.
- Add `pagectl page-trace` tool to decode and analyze the output.

---------

Co-authored-by: Erik Grinaker <erik@neon.tech>
2025-01-15 19:07:22 +00:00
Arpad Müller
efaec6cdf8 Add endpoint and storcon cli cmd to set sk scheduling policy (#10400)
Implementing the last missing endpoint of #9981, this adds support to
set the scheduling policy of an individual safekeeper, as specified in
the RFC. However, unlike in the RFC we call the endpoint
`scheduling_policy` not `status`

Closes #9981.

As for why not use the upsert endpoint for this: we want to have the
safekeeper upsert endpoint be used for testing and for deploying new
safekeepers, but not for changes of the scheduling policy. We don't want
to change any of the other fields when marking a safekeeper as
decommissioned for example, so we'd have to first fetch them only to
then specify them again. Of course one can also design an endpoint where
one can omit any field and it doesn't get modified, but it's still not
great for observability to put everything into one big "change something
about this safekeeper" endpoint.
2025-01-15 18:15:30 +00:00
Tristan Partin
3d41069dc4 Update pgrx in extension builds to 0.12.9 (#10372)
Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-01-15 16:26:58 +00:00
Vlad Lazar
dbebede7bf safekeeper: fan out from single wal reader to multiple shards (#10190)
## Problem

Safekeepers currently decode and interpret WAL for each shard
separately.
This is wasteful in terms of CPU and memory usage - we've seen this in
profiles.

## Summary of changes

Fan-out interpreted WAL to multiple shards.
The basic idea is that WAL decoding and interpretation happen in a separate
tokio task and senders attach to it. Senders only receive batches
concerning their shard and only past the LSN they've last seen.

Fan-out is gated behind the `wal_reader_fanout` safekeeper flag
(disabled by default for now).

When fan-out is enabled, it might be desirable to control the absolute
delta between the
current position and a new shard's desired position (i.e. how far behind
or ahead a shard may be).
`max_delta_for_fanout` is a new optional safekeeper flag which dictates
whether to create a new
WAL reader or attach to the existing one. By default, this behaviour is
disabled. Let's consider enabling
it if we spot the need for it in the field.

## Testing

Tests passed [here](https://github.com/neondatabase/neon/pull/10301)
with wal reader fanout enabled
as of
34f6a71718.

Related: https://github.com/neondatabase/neon/issues/9337
Epic: https://github.com/neondatabase/neon/issues/9329
2025-01-15 15:33:54 +00:00
Tristan Partin
3e529f124f Remove leading slashes when downloading remote files (#10396)
Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-01-15 15:29:52 +00:00
Arseny Sher
05a71c7d6a safekeeper: add membership configuration switch endpoint (#10241)
## Problem

https://github.com/neondatabase/neon/issues/9965

## Summary of changes

Add a safekeeper http endpoint to switch the membership configuration. Also
add it to the python client for tests, and add a simple test itself.
2025-01-15 14:16:04 +00:00
Alexander Bayandin
b9464865b6 benchmarks: report successful runs to slack as well (#10393)
## Problem

Successful `benchmarks` runs don't have enough visibility

Ref https://neondb.slack.com/archives/C069Z2199DL/p1736868055094539

## Summary of changes
- Report both successful and failed `benchmarks` to Slack
- Update `slackapi/slack-github-action` action
2025-01-15 13:05:05 +00:00
Vlad Lazar
1577430408 safekeeper: decode and interpret for multiple shards in one go (#10201)
## Problem

Currently, we call `InterpretedWalRecord::from_bytes_filtered`
from each shard. To serve multiple shards at the same time,
the API needs to allow for enquiring about multiple shards.

## Summary of changes

This commit tweaks it in a pretty brute-force way. Naively, we could
just generate the shard for a key, but pre- and post-split shards
may be subscribed at the same time, so doing it efficiently is more
complex.
2025-01-15 11:10:24 +00:00
Erik Grinaker
05d17a10ae rfc: add CPU and heap profiling RFC (#10085)
This document proposes a standard cross-team pattern for CPU and memory
profiling across applications and languages, using the
[pprof](https://github.com/google/pprof) profile format.

It enables both ad hoc profiles via HTTP endpoints, and continuous
profiling across the fleet via [Grafana Cloud
Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/).
Continuous profiling incurs an overhead of about 0.1% CPU usage and 3%
slower heap allocations.


[Rendered](https://github.com/neondatabase/neon/blob/erik/profiling-rfc/docs/rfcs/040-profiling.md)

Touches #9534.
Touches https://github.com/neondatabase/cloud/issues/14888.
2025-01-15 10:35:38 +00:00
Arseny Sher
2d0ea08524 Add safekeeper membership conf to control file. (#10196)
## Problem

https://github.com/neondatabase/neon/issues/9965

## Summary of changes

Add the safekeeper membership configuration struct itself and store it in
the control file. In passing, also add a creation timestamp to the control
file (there were cases where I wanted it in the past).

Remove the obsolete, unused PersistedPeerInfo struct from the control file
(still keep it in control_file_upgrade.rs to have it in the old upgrade code).

Remove the binary representation of cfile in the roundtrip test.
Updating it is annoying, and we still test the actual roundtrip.

Also add configuration to timeline creation http request, currently used
only in one python test. In passing, slightly change the meaning of the LSNs
in the request: normally start_lsn is passed (the same as ancestor_start_lsn
in the similar pageserver call), but we allow specifying a higher commit_lsn
for manual intervention if needed. Also, when an LSN is given, initialize
term_history with it.
2025-01-15 09:45:58 +00:00
Arseny Sher
c98cbbeac1 Add migration details to safekeeper membership RFC. (#10272)
## Problem

https://github.com/neondatabase/neon/pull/8455 wasn't specific enough on
migration from current situation to enabling generations.

## Summary of changes

Describe the missing parts, including the control plane pushing the
generation to compute, which also defines whether generations are
enabled -- a non-zero value enables them.
2025-01-15 09:41:49 +00:00
John Spray
47c1640acc storage controller: pagination for tenant listing API (#10365)
## Problem

For large deployments, the `control/v1/tenant` listing API can time out
transmitting a monolithic serialized response.

## Summary of changes

- Add `limit` and `start_after` parameters to listing API
- Update storcon_cli to use these parameters and limit requests to 1000
items at a time
2025-01-14 21:37:32 +00:00
Erik Grinaker
6debb49b87 pageserver: coalesce index uploads when possible (#10248)
## Problem

With upload queue reordering in #10218, we can easily get into a
situation where multiple index uploads are queued back to back, which
can't be parallelized. This will happen e.g. when multiple layer flushes
enqueue layer/index/layer/index/... and the layers skip the queue and
are uploaded in parallel.

These index uploads will incur serial S3 roundtrip latencies, and may
block later operations.

Touches #10096.

## Summary of changes

When multiple back-to-back index uploads are ready to upload, only
upload the most recent index and drop the rest.
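
A hypothetical sketch of the coalescing idea (operation names are
illustrative, not the actual upload queue types): when the next ready
operations are several index uploads in a row, only the newest one needs
to go out, since each index supersedes the previous one.

```rust
use std::collections::VecDeque;

#[derive(Debug)]
enum UploadOp {
    Layer(String), // stand-in for a layer file upload
    Index(u64),    // stand-in for an index upload, tagged with a sequence number
}

// Pop the first ready op; if it's an index, swallow any directly
// following index uploads and return only the most recent one.
fn coalesce_index_uploads(ready: &mut VecDeque<UploadOp>) -> Option<UploadOp> {
    let mut op = ready.pop_front()?;
    while matches!(op, UploadOp::Index(_)) && matches!(ready.front(), Some(UploadOp::Index(_))) {
        op = ready.pop_front().unwrap();
    }
    Some(op)
}
```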
2025-01-14 21:10:17 +00:00
Erik Grinaker
e58e29e639 pageserver: limit number of upload queue tasks (#10384)
## Problem

The upload queue can currently schedule an arbitrary number of tasks.
This can both spawn an unbounded number of Tokio tasks, and also
significantly slow down upload queue scheduling as it's quadratic in
number of operations.

Touches #10096.

## Summary of changes

Limit the number of inprogress tasks to the remote storage upload
concurrency. While this concurrency limit is shared across all tenants,
there's certainly no point in scheduling more than this -- we could even
consider setting the limit lower, but don't for now to avoid
artificially constraining tenants.
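
A minimal sketch of bounding in-progress tasks with a semaphore (assuming
`tokio`; names are illustrative, not the actual remote storage code):

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

// Spawn one task per upload, but never more than `max_inprogress` at a time.
async fn spawn_bounded_uploads(uploads: Vec<String>, max_inprogress: usize) {
    let semaphore = Arc::new(Semaphore::new(max_inprogress));
    let mut handles = Vec::new();
    for upload in uploads {
        let permit = Arc::clone(&semaphore).acquire_owned().await.unwrap();
        handles.push(tokio::spawn(async move {
            let _permit = permit; // released when the task finishes
            // ... perform the actual upload here ...
            println!("uploading {upload}");
        }));
    }
    for h in handles {
        let _ = h.await;
    }
}
```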
2025-01-14 18:01:14 +00:00
Heikki Linnakangas
d36112d20f Simplify compute dockerfile by setting PATH just once (#10357)
By setting PATH in the 'pg-build' layer, all the extension build layers
will inherit. No need to pass PG_CONFIG to all the various make
invocations either: once pg_config is in PATH, the Makefiles will pick
it up from there.
2025-01-14 17:02:35 +00:00
Erik Grinaker
ffaa52ff5d pageserver: reorder upload queue when possible (#10218)
## Problem

The upload queue currently sees significant head-of-line blocking. For
example, index uploads act as upload barriers, and for every layer flush
we schedule a layer and index upload, which effectively serializes layer
uploads.

Resolves #10096.

## Summary of changes

Allow upload queue operations to bypass the queue if they don't conflict
with preceding operations, increasing parallelism.

NB: the upload queue currently schedules an explicit barrier after every
layer flush as well (see #8550). This must be removed to enable
parallelism. This will require a better mechanism for compaction
backpressure, see e.g. #8390 or #5415.
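
A hypothetical sketch of the bypass rule (operation names are
illustrative, and the conflict rule here is deliberately conservative):
an operation may skip ahead only if it conflicts with none of the
operations still queued ahead of it.

```rust
enum QueuedOp {
    UploadLayer(String),
    DeleteLayer(String),
    UploadIndex,
}

fn conflicts(a: &QueuedOp, b: &QueuedOp) -> bool {
    use QueuedOp::*;
    match (a, b) {
        // Index uploads reference layers, so they order against everything.
        (UploadIndex, _) | (_, UploadIndex) => true,
        // Operations touching the same layer file must stay ordered.
        (UploadLayer(x) | DeleteLayer(x), UploadLayer(y) | DeleteLayer(y)) => x == y,
    }
}

fn can_bypass(op: &QueuedOp, ahead: &[QueuedOp]) -> bool {
    ahead.iter().all(|prev| !conflicts(op, prev))
}
```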
2025-01-14 16:31:59 +00:00
John Spray
aa7323a384 storage controller: quality of life improvements for AZ handling (#10379)
## Problem

Since https://github.com/neondatabase/neon/pull/9916, the preferred AZ
of a tenant is much more impactful, and we would like to make it more
visible in tooling.

## Summary of changes

- Include AZ in node describe API
- Include AZ info in node & tenant outputs in CLI
- Add metrics for per-node shard counts, labelled by AZ
- Add a CLI for setting preferred AZ on a tenant
- Extend AZ-setting API+CLI to handle None for clearing preferred AZ
2025-01-14 15:30:43 +00:00
Christian Schwarz
2466a2f977 page_service: throttle individual requests instead of the batched request (#10353)
## Problem

Before this PR, the pagestream throttle was applied weighted on a
per-batch basis.
This had several problems:

1. The throttle occurrence counters were only bumped by `1` instead of
`batch_size`.
2. The throttle wait time aggregator metric only counted one wait time,
irrespective
of `batch_size`. That makes sense in some ways of looking at it but not
in others.
3. If the last request in the batch runs into the throttle, the other
requests in the
batch are also throttled, i.e., over-throttling happens (theoretical,
didn't measure
   it in practice).

## Solution

It occurred to me that we can simply push the throttling upwards into
`pagestream_read_message`.

This has the added benefit that in pipeline mode, the `executor` stage
will, if it is idle,
steal whatever requests already made it into the `spsc_fold` and execute
them; before this
change, that was not the case - the throttling happened in the
`executor` stage instead of
the `batcher` stage.
   
## Code Changes

The changes in this PR:

1. Lifting up the throttling into the `pagestream_read_message` method.
2. Move the throttling metrics out of the `Throttle` type into
`SmgrOpMetrics`.
Unlike the other smgr metrics, throttling is per-tenant, hence the Arc.
3. Refactor the `SmgrOpTimer` implementation to account for the new
observation states,
   and simplify its design.
4. Drive-by fix of the flush time metrics: they were using the same `now` in the
`observe_guard` every time.

The `SmgrOpTimer` is now a state machine.
Each observation point moves the state machine forward.
If a timer object is dropped early some "pair"-like metrics still
require an increment or observation.
That's done in the Drop implementation, by driving the state machine to
completion.
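
A hypothetical sketch of that pattern (illustrative only, not the real
`SmgrOpTimer`): dropping the timer early drives the state machine through
the remaining observation points so that paired metrics stay balanced.

```rust
enum TimerState {
    Received,
    Throttled,
    Executed,
    Flushed,
}

struct OpTimer {
    state: TimerState,
}

impl OpTimer {
    fn observe_throttle_done(&mut self) {
        // ... record throttle wait time metric here ...
        self.state = TimerState::Throttled;
    }
    fn observe_execution_done(&mut self) {
        // ... record execution time metric here ...
        self.state = TimerState::Executed;
    }
    fn observe_flush_done(&mut self) {
        // ... record flush time metric here ...
        self.state = TimerState::Flushed;
    }
}

impl Drop for OpTimer {
    // If the timer is dropped early, advance through the remaining states
    // so "pair"-like metrics (started/finished counters) stay balanced.
    fn drop(&mut self) {
        loop {
            match self.state {
                TimerState::Received => self.observe_throttle_done(),
                TimerState::Throttled => self.observe_execution_done(),
                TimerState::Executed => self.observe_flush_done(),
                TimerState::Flushed => break,
            }
        }
    }
}
```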
2025-01-14 15:28:01 +00:00
Alex Chi Z.
9bdb14c1c0 fix(pageserver): ensure initial image layers have correct key ranges (#10374)
## Problem

Discovered during the relation dir refactor work.

If we do not create images as in this patch, we would get two sets of
image layers:

```
0000...METADATA_KEYS
0000...REL_KEYS
```

They overlap at the same LSN and would cause data loss for relation
keys. This doesn't happen in prod because initial image layer generation
is never called, but it's better to fix it to avoid future issues with the
reldir refactors.

## Summary of changes

* Consolidate create_image_layers call into a single one.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-01-14 15:27:48 +00:00
Conrad Ludgate
df4abd8b14 fix: force-refresh azure identity token (#10378)
## Problem

Because of https://github.com/Azure/azure-sdk-for-rust/issues/1739, our
identity token file was not being refreshed. This caused our uploads to
start failing when the storage token expired.

## Summary of changes

Drop and recreate the remote storage config every time we upload in
order to force reload the identity token file.
2025-01-14 12:53:32 +00:00
Konstantin Knizhnik
a039f8381f Optimize vector get last written LSN (#10360)
## Problem

See https://github.com/neondatabase/neon/issues/10281

pg17 performs extra lock/unlock operation when fetching LwLSN.

## Summary of changes

Perform all lookups under one lock, moving initialization of not found
keys to separate loop.

Related Postgres PR:
https://github.com/neondatabase/postgres/pull/553

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-01-14 05:54:30 +00:00
Tristan Partin
430b556b34 Update postgres-exporter and sql_exporter in computes (#10349)
The postgres-exporter was much further out of date, but let's just bump
both.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-01-14 00:44:39 +00:00
Konstantin Knizhnik
1783501eaa Increase max connection for replica to prevent test flukyness (#10306)
## Problem

See https://github.com/neondatabase/neon/issues/10167
A too-small number of `max_connections` (2) can cause failures of the
test_physical_replication_config_mismatch_too_many_known_xids test

## Summary of changes

Increase `max_connections` to 5

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-01-13 20:01:03 +00:00
John Spray
fd1368d31e storcon: rework scheduler optimisation, prioritize AZ (#9916)
## Problem

We want to do a more robust job of scheduling tenants into their home
AZ: https://github.com/neondatabase/neon/issues/8264.

Closes:  https://github.com/neondatabase/neon/issues/8969

## Summary of changes

### Scope

This PR combines prioritizing AZ with a larger rework of how we do
optimisation. The rationale is that just bumping AZ in the order of
Score attributes is a very tiny change: the interesting part is lining
up all the optimisation logic to respect this properly, which means
rewriting it to use the same scores as the scheduler, rather than the
fragile hand-crafted logic that we had before. Separating these changes
out is possible, but would involve doing two rounds of test updates
instead of one.

### Scheduling optimisation

`TenantShard`'s `optimize_attachment` and `optimize_secondary` methods
now both use the scheduler to pick a new "favourite" location. Then
there is some refined logic for whether + how to migrate to it:
- To decide if a new location is sufficiently "better", we generate
scores using some projected ScheduleContexts that exclude the shard
under consideration, so that we avoid migrating from a node with
AffinityScore(2) to a node with AffinityScore(1), only to migrate back
later.
- Score types get a `for_optimization` method so that when we compare
scores, we will only do an optimisation if the scores differ by their
highest-ranking attributes, not just because one pageserver is lower in
utilization. Eventually we _will_ want a mode that does this, but doing
it here would make scheduling logic unstable and harder to test, and to
do this correctly one needs to know the size of the tenant that one is
migrating.
- When we find a new attached location that we would like to move to, we
will create a new secondary location there, even if we already had one
on some other node. This handles the case where we have a home AZ A, and
want to migrate the attachment between pageservers in that AZ while
retaining a secondary location in some other AZ as well.
- A unit test is added for
https://github.com/neondatabase/neon/issues/8969, which is implicitly
fixed by reworking optimisation to use the same scheduling scores as
scheduling.
2025-01-13 19:33:00 +00:00
Alex Chi Z.
e9ed53b14f feat(pageserver): support inherited sparse keyspace (#10313)
## Problem

In preparation for https://github.com/neondatabase/neon/issues/9516. We
need to store rel size and directory data in the sparse keyspace, but it
does not support inheritance yet.

## Summary of changes

Add a new type of keyspace "sparse but inherited" into the system.

On the read path: we don't remove the key range when we descend into the
ancestor. The search will stop when (1) the full key range is covered by
image layers (which has already been implemented before), or (2) we
reach the end of the ancestor chain.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-01-13 15:43:01 +00:00
Conrad Ludgate
a338aee132 feat(local_proxy): use ed25519 signatures with pg_session_jwt (#10290)
Generally, ed25519 seems to be much preferred to P256 for cryptographic
strength nowadays, and it is finally NIST-approved. We should use it
where we can, as it's also faster than P256.

This PR makes the re-signed JWTs between local_proxy and pg_session_jwt
use ed25519.

This does introduce a new dependency on ed25519, but I do recall some
Neon Authorise customers asking for support for ed25519, so I am
justifying this dependency addition in the context that we can then
introduce support for customer ed25519 keys

sources:
* https://csrc.nist.gov/pubs/fips/186-5/final subsection 7 (EdDSA)
* https://datatracker.ietf.org/doc/html/rfc8037#section-3.1
2025-01-13 15:20:46 +00:00
Heikki Linnakangas
96243af651 Stop building unnecessary extension tarballs (#10355)
We build "custom extensions" from a different repository nowadays.
2025-01-13 15:01:13 +00:00
John Spray
ef8bfacd6b storage controller: API + CLI for migrating secondary locations (#10284)
## Problem

Currently, if we want to move a secondary there isn't a neat way to do
that: we just have migration API for the attached location, and it is
only clean to use that if you've manually created a secondary via
pageserver API in the place you're going to move it to.

Secondary migration API enables:
- Moving the secondary somewhere because we would like to later move the
attached location there.
- Move the secondary location because we just want to reclaim some disk
space from its current location.

## Summary of changes

- Add `/migrate_secondary` API
- Add `tenant-shard-migrate-secondary` CLI
- Add tests for above
2025-01-13 14:52:43 +00:00
Konstantin Knizhnik
ceacc29609 Start with minimal prefetch distance to minimize prefetch overhead for exact or limited index scans (#10359)
## Problem

See https://neondb.slack.com/archives/C04DGM6SMTM/p1736526089437179

In the case of index scan queries with a LIMIT clause, multiple backends can
concurrently send a large number of duplicated prefetch requests which
are not stored in the LFC and so actually do useless work.

The current implementation of index prefetch starts with the maximal prefetch
distance (10 by default now) when there are no key bounds, so for queries
with a LIMIT clause like `select * from T order by pk limit 1` compute can
send a lot of useless prefetch requests to the pageserver.

## Summary of changes

Always start with the minimal prefetch distance even if there are no key
boundaries.

Related Postgres PRs:
https://github.com/neondatabase/postgres/pull/552
https://github.com/neondatabase/postgres/pull/551
https://github.com/neondatabase/postgres/pull/550
https://github.com/neondatabase/postgres/pull/549

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-01-13 14:26:11 +00:00
Erik Grinaker
b31ed0acd1 utils: add ?force=true hint for CPU profiler (#10368)
This makes it less annoying to try to take a CPU profile when a
continuous profile is already running.
2025-01-13 14:23:42 +00:00
Alexander Bayandin
b2d0e1a519 Link OpenSSL dynamically (#10302)
## Problem
Statically linked OpenSSL is buggy in multithreaded environment:
- https://github.com/neondatabase/cloud/issues/16155
- https://github.com/neondatabase/neon/issues/8275

## Summary of changes
- Link OpenSSL dynamically (revert OpenSSL part from
https://github.com/neondatabase/neon/pull/8074)

Before:
```
ldd /usr/local/v17/lib/libpq.so
        linux-vdso.so.1 (0x0000ffffb5ce4000)
        libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffffb5c10000)
        libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffffb5650000)
        /lib/ld-linux-aarch64.so.1 (0x0000ffffb5ca7000)
```

After:
```
ldd /usr/local/v17/lib/libpq.so
        linux-vdso.so.1 (0x0000ffffbf3e8000)
        libssl.so.3 => /lib/aarch64-linux-gnu/libssl.so.3 (0x0000ffffbf260000)
        libcrypto.so.3 => /lib/aarch64-linux-gnu/libcrypto.so.3 (0x0000ffffbec00000)
        libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffffbf1c0000)
        libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffffbea50000)
        /lib/ld-linux-aarch64.so.1 (0x0000ffffbf3ab000)
```
2025-01-13 14:13:02 +00:00
John Spray
d1bc36f536 storage controller: fix retries of compute hook notifications while a secondary node is offline (#10352)
## Problem

We would sometimes fail to retry compute notifications:
1. Try and send, set compute_notify_failure if we can't
2. On next reconcile, reconcile() fails for some other reason (e.g.
tried to talk to an offline node), and we fail the `result.is_ok() &&
must_notify` condition around the re-sending.

Closes: https://github.com/neondatabase/cloud/issues/22612

## Summary of changes

- Clarify the meaning of the reconcile result: it should be Ok(()) if
configuring attached location worked, even if secondary or detach
locations cannot be reached.
- Skip trying to talk to secondaries if they're offline
- Even if reconcile fails and we can't send the compute notification (we
can't send it because we're not sure if it's really attached), make sure
we save the `compute_notify_failure` flag so that subsequent reconciler
runs will try again
- Add a regression test for the above
2025-01-13 13:31:57 +00:00
Erik Grinaker
0b9032065e utils: allow 60-second CPU profiles (#10367)
Taking continuous profiles every 20 seconds is likely too expensive (in
dollar terms). Let's try 60-second profiles. We can now interrupt
running profiles via `?force=true`, so this should be fine.
2025-01-13 13:14:23 +00:00
Heikki Linnakangas
09fe3b025c Add a websockets tunnel and a test for the proxy's websockets support. (#3823)
For testing the proxy's websockets support.

I wrote this to test https://github.com/neondatabase/neon/issues/3822.
Unfortunately, that bug can *not* be reproduced with this tunnel. The
bug only appears when the client pipelines the first query with the
authentication messages. The tunnel doesn't do that.

---

Update (@conradludgate 2025-01-10):

We have since added some websocket tests, but they manually implemented
a very simplistic setup of the postgres protocol. Introducing the tunnel
would make more complex testing simpler in the future.

---------

Co-authored-by: Conrad Ludgate <conradludgate@gmail.com>
2025-01-13 11:35:39 +00:00
John Spray
12053cf832 storage controller: improve consistency_check_api (#10363)
## Problem

Limitations found while using this to investigate
https://github.com/neondatabase/neon/issues/10234:
- If we hit a node consistency issue, we drop out and don't check shards
for consistency
- The messages printed after a shard consistency issue are huge, and
grafana appears to drop them.

## Summary of changes

- Defer node consistency errors until the end of the function, so that
we always proceed to check shards for consistency
- Print out smaller log lines that just point out the diffs between
expected and persistent state
2025-01-13 11:18:14 +00:00
Conrad Ludgate
de199d71e1 chore: Address lints introduced in rust 1.85.0 beta (#10340)
With a new beta build of the rust compiler, it's good to check out the
new lints. Either to find false positives, or find flaws in our code.
Additionally, it helps reduce the effort required to update to 1.85 in 6
weeks.
2025-01-13 10:34:36 +00:00
Erik Grinaker
22a6460010 libs/utils: add force parameter for /profile/cpu (#10361)
## Problem

It's only possible to take one CPU profile at a time. With Grafana
continuous profiling, a (low-frequency) CPU profile will always be
running, making it hard to take an ad hoc CPU profile at the same time.

Resolves #10072.

## Summary of changes

Add a `force` parameter for `/profile/cpu` which will end and return an
already running CPU profile, starting a new one for the current caller.
2025-01-13 10:01:18 +00:00
Erik Grinaker
cd982a82ec pageserver,safekeeper: increase heap profiling frequency to 2 MB (#10362)
## Problem

Currently, heap profiles sample every 1 MB of allocations. Taking
a profile stack trace takes about 1 µs, and allocating 1 MB takes about
15 µs, so the overhead is about 6.7%, which is a bit high. This is a
fixed cost regardless of whether heap profiles are actually accessed.

## Summary of changes

Increase the heap profiling sample interval from 1 MB to 2 MB, which
reduces the overhead to about 3.3%. This seems acceptable, considering
performance-sensitive code will avoid allocations as far as possible
anyway.
2025-01-13 09:44:59 +00:00
Heikki Linnakangas
8327f68043 Minor cleanup of extension build commands (#10356)
There used to be some pg version dependencies in these extensions, but
now that there isn't, follow the simpler pattern used in other
extensions. No change in the produced images.
2025-01-11 17:39:27 +00:00
Heikki Linnakangas
846e8fdce4 Remove obsolete hnsw extension (#8008)
This has been deprecated and disabled for new installations for a long
time. Let's remove it for good.
2025-01-11 14:20:50 +00:00
Heikki Linnakangas
70a3bf37a0 Stop building 'compute-tools' image (#10333)
It's been unused from time immemorial.

---------

Co-authored-by: Matthias van de Meent <matthias@neon.tech>
2025-01-11 13:09:55 +00:00
Arpad Müller
23c0748cdd Remove active column (#10335)
We don't need or want the `active` column. Remove it. Vlad pointed out
that this is safe.

Thanks to the separation of the schemata in earlier PRs, this is easy.

follow-up of #10205

Part of https://github.com/neondatabase/neon/issues/9981
2025-01-11 02:52:45 +00:00
Alex Chi Z.
b5d54ba52a refactor(pageserver): move queue logic to compaction.rs (#10330)
## Problem

close https://github.com/neondatabase/neon/issues/10031, part of
https://github.com/neondatabase/neon/issues/9114

## Summary of changes

Move the compaction job generation to `compaction.rs`, thus making the
code more readable and debuggable. We now also return the running job
through the get-compaction-job API, whereas before we only returned
scheduled jobs.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-01-10 20:53:00 +00:00
Christian Schwarz
58332cb361 pageserver: remove unused metric pageserver_layers_visited_per_read_global (#10141)
As of commit "pageserver: remove legacy read path" (#8601) we always use
vectored get, which has a separate metric.
2025-01-10 20:35:50 +00:00
Christian Schwarz
9b43204893 fix(page_service): Timeline::gate held open while throttling (#10314)
When we moved throttling up from Timeline::get into page_service,
we stopped being sensitive to `Timeline::cancel`, even though we're
holding a Handle and thus a guard on the `Timeline::gate` open.

This PR rectifies the situation.

Refs

- Found while investigating #10309 (hung detach because gate kept open),
  but not expected to be the root cause of that issue because the
  affected tenants are not being throttled according to their metrics.
2025-01-10 19:21:01 +00:00
Christian Schwarz
cdd34dfc12 impr(utils/spsc_fold): add another test case (#10319)
Wondered about the case covered here while investigating #10309.
2025-01-10 19:19:48 +00:00
Vlad Lazar
3cd5034eac storcon: don't assume host:port format in storcon client (#10347)
## Problem

https://github.com/neondatabase/infra/pull/2725 updated the scrubber to
use a storcon endpoint that is not in `host:port` format. That breaks
when unwrapping the port.

## Summary of changes

Support both `host:port` and `host` formats for the storcon api.
2025-01-10 18:35:16 +00:00
Cheng Chen
425b777840 chore(compute): pg_mooncake v0.1.0 (#10337)
## Problem
Upgrade pg_mooncake to v0.1.0

## Summary of changes
2025-01-10 16:38:13 +00:00
John Spray
4398051385 tests: smaller datasets in LFC tests (#10346)
## Problem

These two tests came up in #9537 as doing multi-gigabyte I/O, and from
inspection of the tests it doesn't seem like they need that to fulfil
their purpose.

## Summary of changes

- In test_local_file_cache_unlink, run fewer background threads with a
smaller number of rows. These background threads AFAICT exist to make
sure some I/O is going on while we unlink the LFC directory, but 5
threads should be enough for "some".
- In test_lfc_resize, tweak the test to validate that the cache size is
larger than the final size before resizing it, so that we're sure we're
writing enough data to really be doing something. Then decrease the
pgbench scale.
2025-01-10 15:53:23 +00:00
Folke Behrens
71bca6f580 poetry: Update packaging for poetry v2 (#10344)
## Problem

When poetry v2 (released Jan 5) is used it needs `packaging.metadata`
module, but we downgrade `packaging` to 23.0. `packaging==23.1`
introduced the metadata submodule.

## Summary of changes

Update `packaging` to 24.2.
2025-01-10 14:32:26 +00:00
John Spray
105f66c4ce tests: move test_parallel_copy into performance tree (#10343)
## Problem

This test writes ~5GB of data. It is not suitable to run in parallel
with all the other small tests in test_runner/regress.

via #9537 

## Summary of changes

- Move test_parallel_copy into the performance directory, so that it
does not run in parallel with other tests
2025-01-10 13:57:26 +00:00
John Spray
0d4fce2d35 tests: refine how compat snapshot is generated (#10342)
## Problem

I noticed in https://github.com/neondatabase/neon/pull/9537 that tests
which work with compat snapshots were writing several hundred MB of
data, which isn't really necessary.

Also, the snapshots are large but don't have the proper variety of
storage format features, e.g. they could just have L0 deltas.

## Summary of changes

- Use smaller scale factor and runtime to generate less data
- Configure a small layer size and use force image layer generation so
that our output contains L1 deltas and image layers, and has a decent
number of entries in the layer map
2025-01-10 13:57:23 +00:00
Erik Grinaker
2b8ea1e768 utils: add flamegraph for heap profiles (#10223)
## Problem

Unlike CPU profiles, the `/profile/heap` endpoint can't automatically
generate SVG flamegraphs. This requires the user to install and use
`pprof` tooling, which is unnecessary and annoying.

Resolves #10203.

## Summary of changes

Add `format=svg` for the `/profile/heap` route, and generate an SVG
flamegraph using the `inferno` crate, similarly to what `pprof-rs`
already does for CPU profiles.
2025-01-10 12:14:29 +00:00
Christian Schwarz
db00eb41a1 fix(spsc_fold): potentially missing wakeup when send()ing in state SenderWaitsForReceiverToConsume (#10318)
# Problem

Before this PR, there were cases where send() in state
SenderWaitsForReceiverToConsume would never be woken up
by the receiver, because it never registered with `wake_sender`.

Example Scenario 1: we stop polling a send() future A that was waiting
for the receiver to consume. We drop A and create a new send() future B.
B would return Poll::Pending and never regsister a waker.

Example Scenario 2: a send() future A transitions from HasData
to SenderWaitsForReceiverToConsume. This registers the context X
with `wake_sender`. But before the Receiver consumes the data,
we poll A from a different context Y.
The state is still SenderWaitsForReceiverToConsume, but we wouldn't
register the new context with `wake_sender`.
When the Receiver comes around to consume and `wake_sender.notify()`s,
it wakes the old context X instead of Y.

# Fix

Register the waker in the case where we're polled in
state `SenderWaitsForReceiverToConsume`.
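
A hypothetical sketch of the fix (not the real `spsc_fold` code): every
poll of the send future in that state re-registers the current task's
waker, so the latest poller is the one that gets woken.

```rust
use std::task::{Context, Poll, Waker};

enum State<T> {
    NoData,
    HasData(T),
    SenderWaitsForReceiverToConsume(T),
}

struct Inner<T> {
    state: State<T>,
    wake_sender: Option<Waker>,
}

fn poll_send<T>(inner: &mut Inner<T>, cx: &mut Context<'_>) -> Poll<()> {
    match inner.state {
        // Simplified: the interesting case is the waiting state below.
        State::NoData | State::HasData(_) => Poll::Ready(()),
        State::SenderWaitsForReceiverToConsume(_) => {
            // The fix: always store the most recent waker, even if an
            // earlier poll from a different context already registered one.
            inner.wake_sender = Some(cx.waker().clone());
            Poll::Pending
        }
    }
}
```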

# Relation to #10309

I found this bug while investigating #10309.
There was never proof that this bug here is the root cause for #10309.
In the meantime we found a more probable hypothesis
for the root cause than what is being fixed here.
Regardless, let's walk through my thought process about
how it might have been relevant:

There (in page_service), Scenario 1 does not apply because
we poll the send() future to completion.

Scenario 2 (`tokio::join!`) also does not apply with the
current `tokio::join!()` impl, because it will just poll each
future every time, each with the same context.
Although if we ever used something like a FuturesUnordered anywhere,
that will be using a different context, so, in that case,
the bug might materialize.

Regarding tokio & spurious poll in general:
@conradludgate is not aware of any spurious wakeup cases in current
tokio,
but within a `tokio::join!()`, any wake meant for one future will poll
all
the futures, so that can appear as a spurious wake up to the N-1 futures
of the `tokio::join!()`.
2025-01-10 11:06:03 +00:00
Conrad Ludgate
735c66dc65 fix(proxy): propagate the existing ComputeUserInfo to connect for cancellation (#10322)
## Problem

We were incorrectly constructing the ComputeUserInfo, used for
cancellation checks, based on the return parameters from postgres. This
didn't contain the correct info.

## Summary of changes

Propagate down the existing ComputeUserInfo.
2025-01-10 09:36:51 +00:00
Folke Behrens
77660f3d88 proxy: Fix parsing of UnknownTopic with payload (#10339)
## Problem

When the proxy receives a `Notification` with an unknown topic it's
supposed to use the `UnknownTopic` unit variant. Unfortunately, in
adjacently tagged enums serde will not simply ignore the configured
content if found and try to deserialize a map/object instead.

## Summary of changes

* Use a custom deserialize function to ignore variant content.
* Add a little unit test covering both cases.
2025-01-10 09:12:31 +00:00
Folke Behrens
b6205af4a5 Update tracing/otel crates (#10311)
Update the tracing(-x) and opentelemetry(-x) crates.

Some breaking changes require updating our code:
* Initialization is done via builders now

https://github.com/open-telemetry/opentelemetry-rust/blob/main/opentelemetry-otlp/CHANGELOG.md#0270
* Errors from OTel SDK are logged via tracing crate as well.

https://github.com/open-telemetry/opentelemetry-rust/blob/main/opentelemetry/CHANGELOG.md#0270
2025-01-10 08:48:03 +00:00
Arpad Müller
6149ac8834 Handle race between auto-offload and unarchival (#10305)
## Problem

Auto-offloading as requested by the compaction task is racy with
unarchival, in that the compaction task might attempt to offload an
unarchived timeline. By that point it will already have set the timeline
to the `Stopping` state however, which makes it unusable for any
purpose. For example:

1. compaction task decides to offload timeline
2. timeline gets unarchived
3. `offload_timeline` gets called by compaction task
  * sets timeline's state to `Stopping`
  * realizes that the timeline is no longer archived and can't be offloaded, errors out
4. endpoint can't be started as the timeline is `Stopping` and thus
'can't be found'.

A future iteration of the compaction task can't "heal" this state either
as the timeline will still not be archived, same goes for other
automatic stuff. The only way to heal this is a tenant detach+attach, or
alternatively a pageserver restart.

Furthermore, the compaction task is especially amenable for such races
as it first stores `can_offload` into a variable, figures out whether
compaction is needed (which takes some time), and only then does it
attempt an offload operation: the time difference between "check" and
"use" is non-trivially small.

To make it even worse, we start the compaction task right after attach
of a tenant, and it is a common pattern by pageserver users to attach a
tenant to then immediately unarchive a timeline, so that an endpoint can
be started.

## Solutions not adopted

The simplest solution is to move the `can_offload` check to right before
attempting the offload. But this is not a good solution, as no lock
is held between that check and timeline shutdown. So races would still
be possible, just less likely.

I explored using the timeline state for this, as in adding an additional
enum variant. But `Timeline::set_state` is racy (#10297).

## Adopted solution

We use the lock on the timeline's upload queue as an arbiter: either
unarchival gets to it first and sours the state for auto-offloading, or
auto-offloading shuts it down, which stops any parallel unarchival in
its tracks. The key part is not releasing the upload queue's lock
between the check whether the timeline is archived or not, and shutting
it down (the actual implementation only sets `shutting_down` but it has
the same effect on `initialized_mut()` as a full shutdown). The rest of
the patch is stuff that follows from this.

We also move the part where we set the state to `Stopping` to after that
arbiter has decided the fate of the timeline. For deletions, we do keep
it inside `DeleteTimelineFlow::prepare` however, so that it is called
with all of the the timelines locks held that the function allocates
(timelines lock most importantly). This is only a precautionary measure
however, as I didn't want to analyze deletion related code for possible
races.
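
A hypothetical sketch of the arbiter idea (names and types are
illustrative, not the actual pageserver code): the archived check and the
souring of the upload queue happen under a single lock acquisition, so a
concurrent unarchival cannot slip in between them.

```rust
use std::sync::Mutex;

struct UploadQueue {
    is_archived: bool,
    shutting_down: bool,
}

struct TimelineSketch {
    upload_queue: Mutex<UploadQueue>,
}

/// Returns true if auto-offload may proceed for this timeline.
fn try_claim_for_offload(tl: &TimelineSketch) -> bool {
    let mut q = tl.upload_queue.lock().unwrap();
    if !q.is_archived {
        // Unarchival won the race; leave the timeline alone.
        return false;
    }
    // Sour the queue: a concurrent unarchival will now observe the queue
    // as shutting down and bail out instead of proceeding on a dying timeline.
    q.shutting_down = true;
    true
}
```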

## Future changes

It might make sense to move `can_offload` to right before the offload
attempt. Maybe some other properties might have changed as well.
Although this will not be perfect either as no lock is held. I want to
keep it out of this change to emphasize that this move wasn't the main
reason we are race free now.

Fixes #10220
2025-01-09 20:41:49 +00:00
Tristan Partin
49756a0d01 Implement compute_ctl management API in Axum (#10099)
This is a refactor to create better abstractions related to our
management server. It cleans up the code, and prepares everything for
authorized communication to and from the control plane.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-01-09 20:08:26 +00:00
Arpad Müller
99b5a6705f Update rust to 1.84.0 (#10328)
We keep the practice of keeping the compiler up to date, pointing to the
latest release. This is done by many other projects in the Rust
ecosystem as well.

[Release notes](https://releases.rs/docs/1.84.0/).

Prior update was in #9926.
2025-01-09 18:29:09 +00:00
Alexey Kondratov
f37eeb56ad fix(compute_ctl): Resolve issues with dropping roles having dangling permissions (#10299)
## Problem

In Postgres, one cannot drop a role if it has any dependent objects in
the DB. In `compute_ctl`, we automatically reassign all dependent
objects in every DB to the corresponding DB owner. Yet, it seems that it
doesn't help with some implicit permissions. The issue is reproduced by
installing a `postgis` extension because it creates some views and
tables in the public schema.

## Summary of changes

Added a repro test without using a `postgis`: i) create a role via
`compute_ctl` (with `neon_superuser` grant); ii) create a test role, a
table in schema public, and grant permissions via the role in
`neon_superuser`.

To fix the issue, I added a new `compute_ctl` code that removes such
dangling permissions before dropping the role. It's done in the least
invasive way, i.e., only touches the schema public, because i) that's
the problem we had with PostGIS; ii) it creates a smaller chance of
messing anything up and getting a stuck operation again, just for a
different reason.

Properly, any API-based catalog operations should fail gracefully and
provide an actionable error and status code to the control plane,
allowing the latter to unwind the operation and propagate an error
message and hint to the user. In this sense, it's aligned with another
feature request https://github.com/neondatabase/cloud/issues/21611

Resolve neondatabase/cloud#13582
2025-01-09 16:39:53 +00:00
Arpad Müller
bebc46e713 Add scheduling_policy column to safekeepers table (#10205)
Add a `scheduling_policy` column to the safekeepers table of the storage
controller.

Part of #9981
2025-01-09 15:55:02 +00:00
John Spray
ad51622568 remote_storage: enable Azure connection pooling by default (#10324)
## Problem

Initially we defaulted this to zero to reduce risk. We have now been
using pooling in staging for some time without issues, so let's make it
the default for anyone using this software without setting the config
explicitly.

Closes: https://github.com/neondatabase/cloud/issues/20971

## Summary of changes

- Set Azure blob storage connection pool size to 8 by default
2025-01-09 15:34:06 +00:00
John Spray
ac6cca17ac storcon: don't log a heartbeat error during shutdown (#10325)
## Problem

Occasionally we see an unexpected error like:
```
ERROR spawn_heartbeat_driver: Failed to update node state 1 after heartbeat round: Shutting down\n')
Hint: use scripts/check_allowed_errors.sh to test any new allowed_error you add
```


https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10324/12690404952/index.html#/testresult/63406a0687bf6eca

## Summary of changes

- Explicitly handle ApiError::ShuttingDown as a no-op when mutating node
status
2025-01-09 15:33:44 +00:00
Alex Chi Z.
640ac4fc9e fix(pageserver): report timestamp is in the past if the key is missing (#10210)
## Problem

If for some reason we already garbage-collected the data under an LSN
but the caller uses a past LSN for the find_time_cutoff function, we
will now report a missing key error and GC will never proceed.

Note that missing key error can also happen if the key is really missing
(i.e., during the past offload incidents)

## Summary of changes

Make sure GC proceeds by bumping the LSN. When time_cutoff=None, we will
not increase the time_cutoff (it will be set to latest_gc_cutoff). If we
really need to bump the GC LSN for maintenance purpose, we need a
separate API to do that.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-01-09 14:43:20 +00:00
Konstantin Knizhnik
20c40eb733 Add response tag to getpage request in V3 protocol version (#8686)
## Problem

We have several serious data corruption incidents caused by mismatch of
get-age requests:
https://neondb.slack.com/archives/C07FJS4QF7V/p1723032720164359

We hope that the problem is fixed now. But it is better to prevent such
kind of problems in future.

Part of https://github.com/neondatabase/cloud/issues/16472

## Summary of changes

This PR introduce new V3 version of compute<->pageserver protocol,
adding tag to getpage response.
So now compute is able to check if it really gets response to the
requested page.

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2025-01-09 13:12:04 +00:00
Vlad Lazar
f4739d49e3 pageserver: tweak interpreted ingest record metrics (#10291)
## Problem
The filtered record metric doesn't make sense for interpreted ingest. 

## Summary of changes
While of dubious utility in the first place, this patch replaces it with
received-records and observed-records metrics for interpreted ingest
(a sketch follows the list):
* received records cause the pageserver to do _something_: write a
key-value pair to storage, update some metadata, or flush pending
modifications
* observed records are a shard 0 concept and contain only the key metadata
used in tracking relation sizes (received records include observed
records)
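
A hedged sketch of the two counters using the `prometheus` crate directly; the metric names are made up, and the real pageserver wires these through its own metrics plumbing:

```rust
use prometheus::{IntCounter, Registry};

fn main() -> Result<(), prometheus::Error> {
    let registry = Registry::new();

    // Received records make the pageserver do something: write a key-value
    // pair, update metadata, or flush pending modifications.
    let received = IntCounter::new(
        "wal_records_received_total",
        "WAL records received by interpreted ingest",
    )?;
    // Observed records (shard 0 only) carry just the key metadata used for
    // relation size tracking; they are a subset of received records.
    let observed = IntCounter::new(
        "wal_records_observed_total",
        "WAL records observed for relation size tracking",
    )?;

    registry.register(Box::new(received.clone()))?;
    registry.register(Box::new(observed.clone()))?;

    received.inc();
    observed.inc();
    println!("received={} observed={}", received.get(), observed.get());
    Ok(())
}
```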
2025-01-09 12:31:02 +00:00
Arseny Sher
030ab1c0e8 TLA+ spec for safekeeper membership change (#9966)
## Problem

We want to define the algorithm for safekeeper membership change.

## Summary of changes

Add a spec for it, several models, and logs of checking them.

ref https://github.com/neondatabase/neon/issues/8699
2025-01-09 12:26:17 +00:00
John Spray
5baa4e7f0a docker: don't set LD_LIBRARY_PATH (#10321)
## Problem

This was causing the storage controller to still use the neon-built libpq
instead of the vanilla libpq.

Since https://github.com/neondatabase/neon/pull/10269 we have a vanilla
postgres in the system path -- anything that wants a postgres library
will use that.

## Summary of changes

- Remove LD_LIBRARY_PATH assignment in Dockerfile
2025-01-09 11:47:55 +00:00
Tristan Partin
5b2751397d Refactor MigrationRunner::run_migrations() to call a helper (#10232)
This will make it easier to add per-db migrations, such as that for
CVE-2024-4317.

Link: https://www.postgresql.org/support/security/CVE-2024-4317/
Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-01-09 07:05:07 +00:00
Ivan Efremov
fcfff72454 impr(proxy): Decouple ip_allowlist from the CancelClosure (#10199)
This PR decouples the IP allowlist from CancelClosure, allowing for more
scalable and flexible IP restrictions and enabling the future use of
Redis-based CancelMap storage.

Changes:
- Introduce a new BackendAuth async trait that retrieves the IP allowlist
through the existing authentication methods (see the sketch below);
- Improve cancellation error handling by instrumenting the async
cancel_session() rather than dropping it;
- Set and store the IP allowlist for SCRAM Proxy so the IP allowance check
is performed consistently.
 
 Relates to #9660
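
A minimal sketch of such a trait and a cancellation check built on it, assuming the usual workspace dependencies (tokio, anyhow, ipnet); the names and signatures are assumptions, not the actual proxy API:

```rust
use std::net::IpAddr;

use ipnet::IpNet;

// Hypothetical trait in the spirit of the described BackendAuth.
trait BackendAuth {
    async fn get_allowed_ips(&self, endpoint: &str) -> anyhow::Result<Vec<IpNet>>;
}

struct ConsoleBackend;

impl BackendAuth for ConsoleBackend {
    async fn get_allowed_ips(&self, _endpoint: &str) -> anyhow::Result<Vec<IpNet>> {
        // In the real proxy this would go through the existing authentication
        // methods; here we just return a fixed allowlist.
        Ok(vec!["10.0.0.0/8".parse()?])
    }
}

// Cancellation no longer needs the allowlist stored in the CancelClosure:
// it can be fetched through the auth backend when a cancel request arrives.
async fn cancel_allowed<B: BackendAuth>(auth: &B, endpoint: &str, peer: IpAddr) -> anyhow::Result<bool> {
    let allowlist = auth.get_allowed_ips(endpoint).await?;
    Ok(allowlist.iter().any(|net| net.contains(&peer)))
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let ok = cancel_allowed(&ConsoleBackend, "ep-example", "10.1.2.3".parse()?).await?;
    println!("cancel allowed: {ok}");
    Ok(())
}
```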
2025-01-08 19:34:53 +00:00
Anastasia Lubennikova
0ad0db6ff8 compute: dropdb DROP SUBSCRIPTION fix (#10066)
## Problem
A project gets stuck if a database with subscriptions is deleted via the
API / UI.

https://github.com/neondatabase/cloud/issues/18646

## Summary of changes
Before dropping the database, drop all the subscriptions in it.
Do not drop the slot on the publisher, because we have no guarantee that
the slot still exists or that the publisher is reachable.

Add a `DropSubscriptionsForDeletedDatabases` phase to run these operations
in all databases we're about to delete.
Ignore the error if the database does not exist.
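
A hedged sketch of the SQL sequence this implies for each subscription (identifier quoting simplified); setting `slot_name = NONE` detaches the slot so `DROP SUBSCRIPTION` does not contact the publisher:

```rust
// Build the statements to run in a database that is about to be dropped.
fn drop_subscription_statements(subname: &str) -> Vec<String> {
    vec![
        format!("ALTER SUBSCRIPTION \"{subname}\" DISABLE;"),
        // NONE detaches the replication slot, leaving it untouched on the publisher.
        format!("ALTER SUBSCRIPTION \"{subname}\" SET (slot_name = NONE);"),
        format!("DROP SUBSCRIPTION \"{subname}\";"),
    ]
}

fn main() {
    // These would be executed in each database we're about to delete.
    for stmt in drop_subscription_statements("my_sub") {
        println!("{stmt}");
    }
}
```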
2025-01-08 18:55:04 +00:00
John Spray
68d8acfd05 storage controller: don't hold detached tenants in memory (#10264)
## Problem

Typical deployments of neon have some tenants that stay in use
continuously, plus a background churning population of tenants that are
created, fall idle, and are configured to the Detached state.
Currently, this churn of short-lived tenants results in an
ever-increasing memory footprint.

Closes: https://github.com/neondatabase/neon/issues/9712

## Summary of changes

- At startup, filter to only load shards that don't have Detached policy
- In process_result, check if a tenant's shards are all Detached and
observed=={}, and if so drop them from memory
- In tenant_location_conf and other tenant mutators, load the tenants'
shards on-demand if they are not present
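
A hedged, simplified sketch of the "all shards Detached and observed=={}" check described above; the types are hypothetical, not the storage controller's real ones:

```rust
use std::collections::{BTreeMap, HashMap};

// Hypothetical, simplified shard state for illustration only.
#[allow(dead_code)]
#[derive(PartialEq)]
enum PlacementPolicy { Attached, Detached }

struct ShardState {
    policy: PlacementPolicy,
    // Pageserver locations the controller believes still hold this shard.
    observed: HashMap<u64, String>,
}

// Drop a tenant's shards from memory once every shard is Detached and
// nothing is observed on any pageserver (mirrors the described check).
fn maybe_drop_tenant(tenants: &mut BTreeMap<String, Vec<ShardState>>, tenant_id: &str) {
    let fully_detached = tenants
        .get(tenant_id)
        .map(|shards| {
            shards
                .iter()
                .all(|s| s.policy == PlacementPolicy::Detached && s.observed.is_empty())
        })
        .unwrap_or(false);
    if fully_detached {
        tenants.remove(tenant_id);
    }
}

fn main() {
    let mut tenants = BTreeMap::new();
    tenants.insert(
        "tenant-a".to_string(),
        vec![ShardState { policy: PlacementPolicy::Detached, observed: HashMap::new() }],
    );
    maybe_drop_tenant(&mut tenants, "tenant-a");
    assert!(!tenants.contains_key("tenant-a"));
    println!("tenant-a dropped from memory");
}
```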
2025-01-08 18:12:09 +00:00
315 changed files with 20158 additions and 7407 deletions

View File

@@ -25,3 +25,4 @@ config-variables:
- PGREGRESS_PG17_PROJECT_ID
- SLACK_ON_CALL_QA_STAGING_STREAM
- DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
- SLACK_ON_CALL_STORAGE_STAGING_STREAM

View File

@@ -17,6 +17,31 @@ inputs:
compute_units:
description: '[Min, Max] compute units'
default: '[1, 1]'
# settings below only needed if you want the project to be sharded from the beginning
shard_split_project:
description: 'by default new projects are not shard-split, specify true to shard-split'
required: false
default: 'false'
admin_api_key:
description: 'Admin API Key needed for shard-splitting. Must be specified if shard_split_project is true'
required: false
shard_count:
description: 'Number of shards to split the project into, only applies if shard_split_project is true'
required: false
default: '8'
stripe_size:
description: 'Stripe size, optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. Default is 128 MiB, only applies if shard_split_project is true'
required: false
default: '32768'
psql_path:
description: 'Path to psql binary - it is caller responsibility to provision the psql binary'
required: false
default: '/tmp/neon/pg_install/v16/bin/psql'
libpq_lib_path:
description: 'Path to directory containing libpq library - it is caller responsibility to provision the libpq library'
required: false
default: '/tmp/neon/pg_install/v16/lib'
outputs:
dsn:
@@ -63,6 +88,23 @@ runs:
echo "project_id=${project_id}" >> $GITHUB_OUTPUT
echo "Project ${project_id} has been created"
if [ "${SHARD_SPLIT_PROJECT}" = "true" ]; then
# determine tenant ID
TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"`
echo "Splitting project ${project_id} with tenant_id ${TENANT_ID} into $((SHARD_COUNT)) shards with stripe size $((STRIPE_SIZE))"
echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split"
echo "with body {\"new_shard_count\": $((SHARD_COUNT)), \"new_stripe_size\": $((STRIPE_SIZE))}"
# we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set)
curl -X PUT \
"https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/shard_split" \
-H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
-d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}"
fi
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
@@ -70,3 +112,9 @@ runs:
POSTGRES_VERSION: ${{ inputs.postgres_version }}
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }}
ADMIN_API_KEY: ${{ inputs.admin_api_key }}
SHARD_COUNT: ${{ inputs.shard_count }}
STRIPE_SIZE: ${{ inputs.stripe_size }}
PSQL: ${{ inputs.psql_path }}
LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }}

View File

@@ -158,8 +158,6 @@ jobs:
- name: Run cargo build
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
# Do install *before* running rust tests because they might recompile the
@@ -217,8 +215,6 @@ jobs:
env:
NEXTEST_RETRIES: 3
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib
export LD_LIBRARY_PATH
@@ -229,8 +225,13 @@ jobs:
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
# run pageserver tests with different settings
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
for get_vectored_concurrent_io in sequential sidecar-task; do
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
${cov_prefix} \
cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
done
done
# Run separate tests for real S3
@@ -314,6 +315,7 @@ jobs:
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ inputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
# Temporary disable this step until we figure out why it's so flaky

View File

@@ -0,0 +1,91 @@
name: Check Codestyle Rust
on:
workflow_call:
inputs:
build-tools-image:
description: "build-tools image"
required: true
type: string
archs:
description: "Json array of architectures to run on"
type: string
defaults:
run:
shell: bash -euxo pipefail {0}
jobs:
check-codestyle-rust:
strategy:
matrix:
arch: ${{ fromJson(inputs.archs) }}
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
container:
image: ${{ inputs.build-tools-image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
- name: Cache cargo deps
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
# This will catch compiler & clippy warnings in all feature combinations.
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
# NB: keep clippy args in sync with ./run_clippy.sh
#
# The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
# #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
# time just for that, so skip "clippy --release".
- run: |
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
echo "No clippy args found in .neon_clippy_args"
exit 1
fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() }}
run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies
if: ${{ !cancelled() }}
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() }}
run: cargo deny check --hide-inclusion-graph

View File

@@ -235,7 +235,7 @@ jobs:
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Run cargo build (only for v17)
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release -j$(sysctl -n hw.ncpu)
run: cargo build --all --release -j$(sysctl -n hw.ncpu)
- name: Check that no warnings are produced (only for v17)
run: ./run_clippy.sh

View File

@@ -164,77 +164,11 @@ jobs:
check-codestyle-rust:
needs: [ check-permissions, build-build-tools-image ]
strategy:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
container:
image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
- name: Cache cargo deps
uses: actions/cache@v4
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
# This will catch compiler & clippy warnings in all feature combinations.
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
# NB: keep clippy args in sync with ./run_clippy.sh
#
# The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
# #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
# time just for that, so skip "clippy --release".
- run: |
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
echo "No clippy args found in .neon_clippy_args"
exit 1
fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Check documentation generation
run: cargo doc --workspace --no-deps --document-private-items
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() }}
run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies
if: ${{ !cancelled() }}
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() }}
run: cargo deny check --hide-inclusion-graph
uses: ./.github/workflows/_check-codestyle-rust.yml
with:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
archs: '["x64", "arm64"]'
secrets: inherit
build-and-test-locally:
needs: [ tag, build-build-tools-image ]
@@ -346,25 +280,22 @@ jobs:
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
report-benchmarks-failures:
report-benchmarks-results-to-slack:
needs: [ benchmarks, create-test-report ]
if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
permissions:
id-token: write # aws-actions/configure-aws-credentials
statuses: write
contents: write
pull-requests: write
if: github.ref_name == 'main' && !cancelled() && contains(fromJSON('["success", "failure"]'), needs.benchmarks.result)
runs-on: ubuntu-22.04
steps:
- uses: slackapi/slack-github-action@v1
- uses: slackapi/slack-github-action@v2
with:
channel-id: C060CNA47S9 # on-call-staging-storage-stream
slack-message: |
Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}>
<${{ needs.create-test-report.outputs.report-url }}|Allure report>
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
method: chat.postMessage
token: ${{ secrets.SLACK_BOT_TOKEN }}
payload: |
channel: "${{ vars.SLACK_ON_CALL_STORAGE_STAGING_STREAM }}"
text: |
Benchmarks on main: *${{ needs.benchmarks.result }}*
- <${{ needs.create-test-report.outputs.report-url }}|Allure report>
- <${{ github.event.head_commit.url }}|${{ github.sha }}>
create-test-report:
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
@@ -728,30 +659,6 @@ jobs:
tags: |
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
# We pick 16, because that builds on debian 11 with older glibc (and is
# thus compatible with newer glibc), rather than 17 on Debian 12, as
# that isn't guaranteed to be compatible with Debian 11
if: matrix.version.pg == 'v16'
uses: docker/build-push-action@v6
with:
target: compute-tools-image
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
DEBIAN_VERSION=${{ matrix.version.debian }}
provenance: false
push: true
pull: true
file: compute/compute-node.Dockerfile
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
tags: |
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
compute-node-image:
needs: [ compute-node-image-arch, tag ]
permissions:
@@ -794,14 +701,6 @@ jobs:
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
- name: Create multi-arch compute-tools image
if: matrix.version.pg == 'v16'
run: |
docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \
-t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
@@ -817,12 +716,6 @@ jobs:
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
- name: Push multi-arch compute-tools image to ECR
if: matrix.version.pg == 'v16'
run: |
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \
neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}
vm-compute-node-image:
needs: [ check-permissions, tag, compute-node-image ]
runs-on: [ self-hosted, large ]
@@ -927,11 +820,11 @@ jobs:
- name: Print logs and clean up
if: always()
run: |
docker compose -f ./docker-compose/docker-compose.yml logs || 0
docker compose -f ./docker-compose/docker-compose.yml down
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true
docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down
promote-images-dev:
needs: [ check-permissions, tag, vm-compute-node-image ]
needs: [ check-permissions, tag, vm-compute-node-image, neon-image ]
runs-on: ubuntu-22.04
permissions:
@@ -966,7 +859,7 @@ jobs:
done
promote-images-prod:
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
needs: [ check-permissions, tag, test-images, promote-images-dev ]
runs-on: ubuntu-22.04
if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
@@ -1001,9 +894,6 @@ jobs:
docker buildx imagetools create -t $repo/neon:latest \
$repo/neon:${{ needs.tag.outputs.build-tag }}
docker buildx imagetools create -t $repo/compute-tools:latest \
$repo/compute-tools:${{ needs.tag.outputs.build-tag }}
for version in ${VERSIONS}; do
docker buildx imagetools create -t $repo/compute-node-${version}:latest \
$repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }}
@@ -1032,7 +922,7 @@ jobs:
- name: Copy all images to prod ECR
if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
run: |
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
for image in neon {vm-,}compute-node-{v14,v15,v16,v17}; do
docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
done
@@ -1044,7 +934,7 @@ jobs:
with:
client_id: ${{ vars.AZURE_DEV_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -1056,7 +946,7 @@ jobs:
with:
client_id: ${{ vars.AZURE_PROD_CLIENT_ID }}
image_tag: ${{ needs.tag.outputs.build-tag }}
images: neon compute-tools vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
images: neon vm-compute-node-v14 vm-compute-node-v15 vm-compute-node-v16 vm-compute-node-v17 compute-node-v14 compute-node-v15 compute-node-v16 compute-node-v17
registry_name: ${{ vars.AZURE_PROD_REGISTRY_NAME }}
subscription_id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
tenant_id: ${{ vars.AZURE_TENANT_ID }}
@@ -1188,12 +1078,6 @@ jobs:
console.log(`Tag ${tag} created successfully.`);
}
// TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok
if (context.ref !== 'refs/heads/release') {
console.log(`GitHub release skipped for ${context.ref}.`);
return;
}
try {
const existingRelease = await github.rest.repos.getReleaseByTag({
owner: context.repo.owner,
@@ -1212,7 +1096,8 @@ jobs:
owner: context.repo.owner,
repo: context.repo.repo,
tag_name: tag,
generate_release_notes: true,
// TODO: Automate release notes properly
generate_release_notes: false,
});
console.log(`Release for tag ${tag} created successfully.`);
}

View File

@@ -28,7 +28,24 @@ jobs:
strategy:
fail-fast: false # allow other variants to continue even if one fails
matrix:
target_project: [new_empty_project, large_existing_project]
include:
- target_project: new_empty_project_stripe_size_2048
stripe_size: 2048 # 16 MiB
postgres_version: 16
- target_project: new_empty_project_stripe_size_32768
stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold
# while here it is sharded from the beginning with a shard size of 256 MiB
postgres_version: 16
- target_project: new_empty_project
stripe_size: null # run with neon defaults which will shard split only when reaching the threshold
postgres_version: 16
- target_project: new_empty_project
stripe_size: null # run with neon defaults which will shard split only when reaching the threshold
postgres_version: 17
- target_project: large_existing_project
stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project
postgres_version: 16
max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
permissions:
contents: write
statuses: write
@@ -67,17 +84,21 @@ jobs:
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Create Neon Project
if: ${{ matrix.target_project == 'new_empty_project' }}
if: ${{ startsWith(matrix.target_project, 'new_empty_project') }}
id: create-neon-project-ingest-target
uses: ./.github/actions/neon-project-create
with:
region_id: aws-us-east-2
postgres_version: 16
postgres_version: ${{ matrix.postgres_version }}
compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
shard_split_project: ${{ matrix.stripe_size != null && 'true' || 'false' }}
admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }}
shard_count: 8
stripe_size: ${{ matrix.stripe_size }}
- name: Initialize Neon project
if: ${{ matrix.target_project == 'new_empty_project' }}
if: ${{ startsWith(matrix.target_project, 'new_empty_project') }}
env:
BENCHMARK_INGEST_TARGET_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }}
NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
@@ -130,7 +151,7 @@ jobs:
test_selection: performance/test_perf_ingest_using_pgcopydb.py
run_in_parallel: false
extra_params: -s -m remote_cluster --timeout 86400 -k test_ingest_performance_using_pgcopydb
pg_version: v16
pg_version: v${{ matrix.postgres_version }}
save_perf_report: true
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
@@ -146,7 +167,7 @@ jobs:
${PSQL} "${BENCHMARK_INGEST_TARGET_CONNSTR}" -c "\dt+"
- name: Delete Neon Project
if: ${{ always() && matrix.target_project == 'new_empty_project' }}
if: ${{ always() && startsWith(matrix.target_project, 'new_empty_project') }}
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}

View File

@@ -114,7 +114,7 @@ jobs:
run: make walproposer-lib -j$(nproc)
- name: Produce the build stats
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
run: cargo build --all --release --timings -j$(nproc)
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4

View File

@@ -1,6 +1,12 @@
name: Pre-merge checks
on:
pull_request:
paths:
- .github/workflows/_check-codestyle-python.yml
- .github/workflows/_check-codestyle-rust.yml
- .github/workflows/build-build-tools-image.yml
- .github/workflows/pre-merge-checks.yml
merge_group:
branches:
- main
@@ -17,8 +23,10 @@ jobs:
runs-on: ubuntu-22.04
outputs:
python-changed: ${{ steps.python-src.outputs.any_changed }}
rust-changed: ${{ steps.rust-src.outputs.any_changed }}
steps:
- uses: actions/checkout@v4
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
id: python-src
with:
@@ -30,11 +38,25 @@ jobs:
poetry.lock
pyproject.toml
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
id: rust-src
with:
files: |
.github/workflows/_check-codestyle-rust.yml
.github/workflows/build-build-tools-image.yml
.github/workflows/pre-merge-checks.yml
**/**.rs
**/Cargo.toml
Cargo.toml
Cargo.lock
- name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
env:
PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
RUST_CHANGED_FILES: ${{ steps.rust-src.outputs.all_changed_files }}
run: |
echo "${PYTHON_CHANGED_FILES}"
echo "${RUST_CHANGED_FILES}"
build-build-tools-image:
if: needs.get-changed-files.outputs.python-changed == 'true'
@@ -55,6 +77,16 @@ jobs:
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
secrets: inherit
check-codestyle-rust:
if: needs.get-changed-files.outputs.rust-changed == 'true'
needs: [ get-changed-files, build-build-tools-image ]
uses: ./.github/workflows/_check-codestyle-rust.yml
with:
# `-bookworm-x64` suffix should match the combination in `build-build-tools-image`
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64
archs: '["x64"]'
secrets: inherit
# To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
# Currently we require 2 jobs (checks with exact name):
# - conclusion
@@ -67,6 +99,7 @@ jobs:
needs:
- get-changed-files
- check-codestyle-python
- check-codestyle-rust
runs-on: ubuntu-22.04
steps:
- name: Create fake `neon-cloud-e2e` check

View File

@@ -3,8 +3,9 @@ name: Create Release Branch
on:
schedule:
# It should be kept in sync with if-condition in jobs
- cron: '0 6 * * FRI' # Storage release
- cron: '0 6 * * THU' # Proxy release
- cron: '0 6 * * FRI' # Storage release
- cron: '0 7 * * FRI' # Compute release
workflow_dispatch:
inputs:
create-storage-release-branch:
@@ -55,7 +56,7 @@ jobs:
ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
create-compute-release-branch:
if: inputs.create-compute-release-branch
if: ${{ github.event.schedule == '0 7 * * FRI' || inputs.create-compute-release-branch }}
permissions:
contents: write

Cargo.lock (generated, 566 lines changed): file diff suppressed because it is too large.

View File

@@ -65,7 +65,7 @@ aws-smithy-types = "1.2"
aws-credential-types = "1.2.0"
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
aws-types = "1.3"
axum = { version = "0.7.5", features = ["ws"] }
axum = { version = "0.8.1", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.70"
@@ -110,6 +110,7 @@ hyper-util = "0.1"
tokio-tungstenite = "0.21.0"
indexmap = "2"
indoc = "2"
inferno = "0.12.0"
ipnet = "2.10.0"
itertools = "0.10"
itoa = "1.0.11"
@@ -126,10 +127,10 @@ notify = "6.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.26"
opentelemetry_sdk = "0.26"
opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.26"
opentelemetry = "0.27"
opentelemetry_sdk = "0.27"
opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.27"
parking_lot = "0.12"
parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
@@ -143,7 +144,7 @@ rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] }
reqwest-middleware = "0.4"
reqwest-retry = "0.7"
routerify = "3"
@@ -186,11 +187,13 @@ tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
toml = "0.8"
toml_edit = "0.22"
tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
tower-service = "0.3.2"
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
tower = { version = "0.5.2", default-features = false }
tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
tower-service = "0.3.3"
tracing = "0.1"
tracing-error = "0.2"
tracing-opentelemetry = "0.27"
tracing-opentelemetry = "0.28"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }

View File

@@ -45,7 +45,7 @@ COPY --chown=nonroot . .
ARG ADDITIONAL_RUSTFLAGS
RUN set -e \
&& PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
--bin pg_sni_router \
--bin pageserver \
--bin pagectl \
@@ -71,6 +71,7 @@ RUN set -e \
ca-certificates \
# System postgres for use with client libraries (e.g. in storage controller)
postgresql-15 \
openssl \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
&& useradd -d /data neon \
&& chown -R neon:neon /data
@@ -103,11 +104,6 @@ RUN mkdir -p /data/.neon/ && \
> /data/.neon/pageserver.toml && \
chown -R neon:neon /data/.neon
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
# that want a particular postgres version will select it explicitly: this is just a default.
ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib
VOLUME ["/data"]
USER neon
EXPOSE 6400

View File

@@ -3,7 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
# Where to install Postgres, default is ./pg_install, maybe useful for package managers
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
OPENSSL_PREFIX_DIR := /usr/local/openssl
ICU_PREFIX_DIR := /usr/local/icu
#
@@ -26,11 +25,9 @@ endif
ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
# Exclude static build openssl, icu for local build (MacOS, Linux)
# Only keep for build type release and debug
PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
PG_CONFIGURE_OPTS += --with-icu
PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
endif
UNAME_S := $(shell uname -s)
@@ -67,8 +64,6 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
# Force cargo not to print progress bar
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"

View File

@@ -21,8 +21,10 @@ The Neon storage engine consists of two major components:
See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information.
## Running local installation
## Running a local development environment
Neon can be run on a workstation for small experiments and to test code changes, by
following these instructions.
#### Installing dependencies on Linux
1. Install build dependencies and other applicable packages
@@ -238,7 +240,7 @@ postgres=# select * from t;
> cargo neon stop
```
More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).
More advanced usages can be found at [Local Development Control Plane (`neon_local`))](./control_plane/README.md).
#### Handling build failures

View File

@@ -115,7 +115,7 @@ RUN set -e \
# Keep the version the same as in compute/compute-node.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
ENV SQL_EXPORTER_VERSION=0.16.0
ENV SQL_EXPORTER_VERSION=0.17.0
RUN curl -fsSL \
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
--output sql_exporter.tar.gz \
@@ -190,21 +190,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
&& make install \
&& rm -rf ../lcov.tar.gz
# Compile and install the static OpenSSL library
ENV OPENSSL_VERSION=1.1.1w
ENV OPENSSL_PREFIX=/usr/local/openssl
RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
cd /tmp && \
tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
cd /tmp/openssl-${OPENSSL_VERSION} && \
./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \
make -j "$(nproc)" && \
make install && \
cd /tmp && \
rm -rf /tmp/openssl-${OPENSSL_VERSION}
# Use the same version of libicu as the compute nodes so that
# clusters created using inidb on pageserver can be used by computes.
#
@@ -258,7 +243,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.83.0
ENV RUSTC_VERSION=1.84.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG RUSTFILT_VERSION=0.2.1

View File

@@ -66,6 +66,10 @@ RUN cd postgres && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
# Enable some of contrib extensions
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgres_fdw.control && \
file=/usr/local/pgsql/share/extension/postgres_fdw--1.0.sql && [ -e $file ] && \
echo 'GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO neon_superuser;' >> $file && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
@@ -104,16 +108,18 @@ RUN cd postgres && \
esac; \
done;
# Set PATH for all the subsequent build steps
ENV PATH="/usr/local/pgsql/bin:$PATH"
#########################################################################################
#
# Layer "postgis-build"
# Build PostGIS from the upstream PostGIS mirror.
#
#########################################################################################
FROM build-deps AS postgis-build
FROM pg-build AS postgis-build
ARG DEBIAN_VERSION
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install --no-install-recommends --no-install-suggests -y \
gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
@@ -151,8 +157,6 @@ RUN case "${DEBIAN_VERSION}" in \
DESTDIR=/sfcgal ninja install -j $(getconf _NPROCESSORS_ONLN) && \
ninja clean && cp -R /sfcgal/* /
ENV PATH="/usr/local/pgsql/bin:$PATH"
# Postgis 3.5.0 supports v17
RUN case "${PG_VERSION}" in \
"v17") \
@@ -170,7 +174,6 @@ RUN case "${PG_VERSION}" in \
wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \
echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
./autogen.sh && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -220,11 +223,7 @@ RUN case "${PG_VERSION}" in \
cmake -GNinja -DCMAKE_BUILD_TYPE=Release .. && \
ninja -j $(getconf _NPROCESSORS_ONLN) && \
ninja -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
#########################################################################################
#
@@ -232,9 +231,8 @@ RUN case "${PG_VERSION}" in \
# Build plv8
#
#########################################################################################
FROM build-deps AS plv8-build
FROM pg-build AS plv8-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch
@@ -269,7 +267,6 @@ RUN case "${PG_VERSION}" in \
# generate and copy upgrade scripts
mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \
cp upgrade/* /usr/local/pgsql/share/extension/ && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
@@ -296,9 +293,8 @@ RUN case "${PG_VERSION}" in \
# Build h3_pg
#
#########################################################################################
FROM build-deps AS h3-pg-build
FROM pg-build AS h3-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v4.1.0 - Jan 18, 2023
@@ -319,7 +315,6 @@ RUN mkdir -p /h3/usr/ && \
RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \
echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \
mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \
@@ -331,17 +326,16 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3
# compile unit extension
#
#########################################################################################
FROM build-deps AS unit-pg-build
FROM pg-build AS unit-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release 7.9 - Sep 15, 2024
RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -O postgresql-unit.tar.gz && \
echo "e46de6245dcc8b2c2ecf29873dbd43b2b346773f31dd5ce4b8315895a052b456 postgresql-unit.tar.gz" | sha256sum --check && \
mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
# unit extension's "create extension" script relies on absolute install path to fill some reference tables.
# We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path.
# This one-liner removes pgsql/ part of the path.
@@ -355,9 +349,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.9.tar.gz -
# compile pgvector extension
#
#########################################################################################
FROM build-deps AS vector-pg-build
FROM pg-build AS vector-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY compute/patches/pgvector.patch /pgvector.patch
@@ -370,9 +363,11 @@ COPY compute/patches/pgvector.patch /pgvector.patch
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \
echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
wget https://github.com/pgvector/pgvector/raw/refs/tags/v0.7.4/sql/vector.sql -O ./sql/vector--0.7.4.sql && \
echo "10218d05dc02299562252a9484775178b14a1d8edb92a2d1672ef488530f7778 ./sql/vector--0.7.4.sql" | sha256sum --check && \
patch -p1 < /pgvector.patch && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
#########################################################################################
@@ -381,16 +376,15 @@ RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O
# compile pgjwt extension
#
#########################################################################################
FROM build-deps AS pgjwt-pg-build
FROM pg-build AS pgjwt-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# doesn't use releases, last commit f3d82fd - Mar 2, 2023
RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \
echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \
mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
#########################################################################################
@@ -399,17 +393,16 @@ RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71
# compile hypopg extension
#
#########################################################################################
FROM build-deps AS hypopg-pg-build
FROM pg-build AS hypopg-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# HypoPG 1.4.1 supports v17
# last release 1.4.1 - Apr 28, 2024
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \
echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \
mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
#########################################################################################
@@ -418,17 +411,16 @@ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypo
# compile pg_hashids extension
#
#########################################################################################
FROM build-deps AS pg-hashids-pg-build
FROM pg-build AS pg-hashids-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v1.2.1 -Jan 12, 2018
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \
mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
#########################################################################################
@@ -437,9 +429,8 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
# compile rum extension
#
#########################################################################################
FROM build-deps AS rum-pg-build
FROM pg-build AS rum-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY compute/patches/rum.patch /rum.patch
@@ -450,8 +441,8 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
patch -p1 < /rum.patch && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
#########################################################################################
@@ -460,17 +451,16 @@ RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0fea
# compile pgTAP extension
#
#########################################################################################
FROM build-deps AS pgtap-pg-build
FROM pg-build AS pgtap-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# pgtap 1.3.3 supports v17
# last release v1.3.3 - Apr 8, 2024
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \
echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \
mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
#########################################################################################
@@ -479,17 +469,16 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgta
# compile ip4r extension
#
#########################################################################################
FROM build-deps AS ip4r-pg-build
FROM pg-build AS ip4r-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v2.4.2 - Jul 29, 2023
RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \
echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \
mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control
#########################################################################################
@@ -498,17 +487,16 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i
# compile Prefix extension
#
#########################################################################################
FROM build-deps AS prefix-pg-build
FROM pg-build AS prefix-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v1.2.10 - Jul 5, 2023
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \
echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \
mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
#########################################################################################
@@ -517,17 +505,16 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p
# compile hll extension
#
#########################################################################################
FROM build-deps AS hll-pg-build
FROM pg-build AS hll-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v2.18 - Aug 29, 2023
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \
echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \
mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
#########################################################################################
@@ -536,17 +523,16 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
# compile plpgsql_check extension
#
#########################################################################################
FROM build-deps AS plpgsql-check-pg-build
FROM pg-build AS plpgsql-check-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# plpgsql_check v2.7.11 supports v17
# last release v2.7.11 - Sep 16, 2024
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \
echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
#########################################################################################
@@ -555,11 +541,8 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz
# compile timescaledb extension
#
#########################################################################################
FROM build-deps AS timescaledb-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
FROM pg-build AS timescaledb-pg-build
ARG PG_VERSION
ENV PATH="/usr/local/pgsql/bin:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
@@ -590,11 +573,8 @@ RUN case "${PG_VERSION}" in \
# compile pg_hint_plan extension
#
#########################################################################################
FROM build-deps AS pg-hint-plan-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
FROM pg-build AS pg-hint-plan-pg-build
ARG PG_VERSION
ENV PATH="/usr/local/pgsql/bin:$PATH"
# version-specific, has separate releases for each version
RUN case "${PG_VERSION}" in \
@@ -632,14 +612,12 @@ RUN case "${PG_VERSION}" in \
# compile pg_cron extension
#
#########################################################################################
FROM build-deps AS pg-cron-pg-build
FROM pg-build AS pg-cron-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# This is an experimental extension that we do not support on prod yet.
# !Do not remove!
# We set it in shared_preload_libraries and computes will fail to start if library is not found.
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
@@ -653,9 +631,8 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O
# compile rdkit extension
#
#########################################################################################
FROM build-deps AS rdkit-pg-build
FROM pg-build AS rdkit-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install --no-install-recommends --no-install-suggests -y \
@@ -673,7 +650,13 @@ RUN apt update && \
# Use new version only for v17
# because Release_2024_09_1 has some backward incompatible changes
# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1
ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
# XXX: /usr/local/pgsql/bin is already in PATH, and that should be enough to find
# pg_config. For some reason the rdkit cmake script doesn't work with just that,
# however. By also adding /usr/local/pgsql, it works, which is weird because there
# are no executables in that directory.
ENV PATH="/usr/local/pgsql:$PATH"
RUN case "${PG_VERSION}" in \
"v17") \
export RDKIT_VERSION=Release_2024_09_1 \
@@ -726,13 +709,11 @@ RUN case "${PG_VERSION}" in \
# compile pg_uuidv7 extension
#
#########################################################################################
FROM build-deps AS pg-uuidv7-pg-build
FROM pg-build AS pg-uuidv7-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v1.6.0 - Oct 9, 2024
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \
echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
@@ -746,13 +727,11 @@ RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz
# compile pg_roaringbitmap extension
#
#########################################################################################
FROM build-deps AS pg-roaringbitmap-pg-build
FROM pg-build AS pg-roaringbitmap-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# not version-specific
# last release v0.5.4 - Jun 28, 2022
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \
echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \
mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \
@@ -766,16 +745,14 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
# compile pg_semver extension
#
#########################################################################################
FROM build-deps AS pg-semver-pg-build
FROM pg-build AS pg-semver-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# Release 0.40.0 breaks backward compatibility with previous versions
# see release note https://github.com/theory/pg-semver/releases/tag/v0.40.0
# Use new version only for v17
#
# last release v0.40.0 - Jul 22, 2024
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
"v17") \
export SEMVER_VERSION=0.40.0 \
@@ -802,13 +779,11 @@ RUN case "${PG_VERSION}" in \
# compile pg_embedding extension
#
#########################################################################################
FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
FROM pg-build AS pg-embedding-pg-build
# This is our extension, support stopped in favor of pgvector
# TODO: deprecate it
ARG PG_VERSION
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PG_EMBEDDING_VERSION=0.3.5 \
@@ -829,26 +804,19 @@ RUN case "${PG_VERSION}" in \
# compile anon extension
#
#########################################################################################
FROM build-deps AS pg-anon-pg-build
FROM pg-build AS pg-anon-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# This is an experimental extension, never got to real production.
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in "v17") \
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
esac && \
wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control
#########################################################################################
#
@@ -856,9 +824,8 @@ RUN case "${PG_VERSION}" in "v17") \
# This layer is used to build `pgrx` deps
#
#########################################################################################
FROM build-deps AS rust-extensions-build
FROM pg-build AS rust-extensions-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
@@ -866,7 +833,7 @@ RUN apt update && \
useradd -ms /bin/bash nonroot -b /home
ENV HOME=/home/nonroot
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
USER nonroot
WORKDIR /home/nonroot
@@ -893,9 +860,8 @@ USER root
# and eventually get merged with `rust-extensions-build`
#
#########################################################################################
FROM build-deps AS rust-extensions-build-pgrx12
FROM pg-build AS rust-extensions-build-pgrx12
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
@@ -903,7 +869,7 @@ RUN apt update && \
useradd -ms /bin/bash nonroot -b /home
ENV HOME=/home/nonroot
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
ENV PATH="/home/nonroot/.cargo/bin:$PATH"
USER nonroot
WORKDIR /home/nonroot
@@ -911,7 +877,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
cargo install --locked --version 0.12.6 cargo-pgrx && \
cargo install --locked --version 0.12.9 cargo-pgrx && \
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
USER root
@@ -948,19 +914,19 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \
\
cd exts/rag && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \
\
cd ../rag_bge_small_en_v15 && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
cargo pgrx install --release --features remote_onnx && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \
\
cd ../rag_jina_reranker_v1_tiny_en && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
cargo pgrx install --release --features remote_onnx && \
@@ -976,29 +942,17 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p
FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build
ARG PG_VERSION
# version 0.3.3 supports v17
# last release v0.3.3 - Oct 16, 2024
#
# there were no breaking changes
# so we can use the same version for all postgres versions
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16" | "v17") \
export PG_JSONSCHEMA_VERSION=0.3.3 \
export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \
echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \
RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar.gz -O pg_jsonschema.tar.gz && \
echo "40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac pg_jsonschema.tar.gz" | sha256sum --check && \
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
# see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
# The `unsafe-postgres` feature allows building pgx extensions
# against Postgres forks that decided to change their ABI name (like us).
# With that we can build extensions with stock pgx, without forking them.
# As this feature is new, a few manual version bumps were required.
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx-tests = "0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
@@ -1012,24 +966,12 @@ RUN case "${PG_VERSION}" in \
FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build
ARG PG_VERSION
# version 1.5.9 supports v17
# last release v1.5.9 - Oct 16, 2024
#
# there were no breaking changes
# so we can use the same version for all postgres versions
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16" | "v17") \
export PG_GRAPHQL_VERSION=1.5.9 \
export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \
echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \
RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \
echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \
cargo pgrx install --release && \
# it's needed to enable extension because it uses untrusted C language
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
@@ -1050,33 +992,58 @@ ARG PG_VERSION
RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \
echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
# TODO update pgrx version in the pg_tiktoken repo and remove this line
sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \
sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \
sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \
sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
#########################################################################################
#
# Layer "pg-pgx-ulid-build"
# Compile "pgx_ulid" extension
# Compile "pgx_ulid" extension for v16 and below
#
#########################################################################################
FROM rust-extensions-build AS pg-pgx-ulid-build
ARG PG_VERSION
# doesn't support v17 yet
# https://github.com/pksunkara/pgx_ulid/pull/52
RUN case "${PG_VERSION}" in "v17") \
echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \
RUN case "${PG_VERSION}" in \
"v14" | "v15" | "v16") \
;; \
*) \
echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
;; \
esac && \
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \
echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control
#########################################################################################
#
# Layer "pg-pgx-ulid-pgrx12-build"
# Compile "pgx_ulid" extension for v17 and up
#
#########################################################################################
FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build
ARG PG_VERSION
RUN case "${PG_VERSION}" in \
"v17") \
;; \
*) \
echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \
;; \
esac && \
wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \
echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control
#########################################################################################
#
@@ -1091,10 +1058,14 @@ ARG PG_VERSION
# NOTE: local_proxy depends on the version of pg_session_jwt
# Do not update without approval from the proxy team
# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \
echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \
sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \
sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \
cargo pgrx install --release
#########################################################################################
@@ -1104,13 +1075,11 @@ RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2
#
#########################################################################################
FROM build-deps AS wal2json-pg-build
FROM pg-build AS wal2json-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# wal2json wal2json_2_6 supports v17
# last release wal2json_2_6 - Apr 25, 2024
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \
echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \
mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \
@@ -1123,13 +1092,11 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.
# compile pg_ivm extension
#
#########################################################################################
FROM build-deps AS pg-ivm-build
FROM pg-build AS pg-ivm-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# pg_ivm v1.9 supports v17
# last release v1.9 - Jul 31
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \
echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \
mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
@@ -1143,13 +1110,11 @@ RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_iv
# compile pg_partman extension
#
#########################################################################################
FROM build-deps AS pg-partman-build
FROM pg-build AS pg-partman-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# should support v17 https://github.com/pgpartman/pg_partman/discussions/693
# last release 5.1.0 Apr 2, 2024
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \
echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \
mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \
@@ -1165,24 +1130,12 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz
#########################################################################################
FROM rust-extensions-build AS pg-mooncake-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# The topmost commit in the `neon` branch at the time of writing this
# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af
ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
'v14') \
echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \
esac && \
git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \
cd pg_mooncake-src && \
git checkout "${PG_MOONCAKE_VERSION}" && \
git submodule update --init --depth 1 --recursive && \
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \
make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \
RUN wget https://github.com/Mooncake-Labs/pg_mooncake/releases/download/v0.1.0/pg_mooncake-0.1.0.tar.gz -O pg_mooncake.tar.gz && \
echo "eafd059b77f541f11525eb8affcd66a176968cbd8fe7c0d436e733f2aa4da59f pg_mooncake.tar.gz" | sha256sum --check && \
mkdir pg_mooncake-src && cd pg_mooncake-src && tar xzf ../pg_mooncake.tar.gz --strip-components=1 -C . && \
make release -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control
#########################################################################################
@@ -1192,11 +1145,8 @@ RUN case "${PG_VERSION}" in \
#
#########################################################################################
FROM build-deps AS pg-repack-build
FROM pg-build AS pg-repack-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/reorg/pg_repack/archive/refs/tags/ver_1.5.2.tar.gz -O pg_repack.tar.gz && \
echo '4516cad42251ed3ad53ff619733004db47d5755acac83f75924cd94d1c4fb681 pg_repack.tar.gz' | sha256sum --check && \
@@ -1238,6 +1188,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1267,20 +1218,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_rmgr \
-s install && \
case "${PG_VERSION}" in \
"v14" | "v15") \
;; \
"v16" | "v17") \
echo "Skipping HNSW for PostgreSQL ${PG_VERSION}" && exit 0 \
;; \
*) \
echo "unexpected PostgreSQL version" && exit 1 \
;; \
esac && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/hnsw \
-s install
#########################################################################################
@@ -1297,17 +1234,6 @@ USER nonroot
COPY --chown=nonroot . .
RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy
#########################################################################################
#
# Final compute-tools image
#
#########################################################################################
FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/fast_import /usr/local/bin/fast_import
#########################################################################################
#
# Layer "pgbouncer"
@@ -1340,15 +1266,16 @@ RUN set -e \
#########################################################################################
#
# Layers "postgres-exporter" and "sql-exporter"
# Layers "postgres-exporter", "pgbouncer-exporter", and "sql-exporter"
#
#########################################################################################
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
FROM quay.io/prometheuscommunity/postgres-exporter:v0.16.0 AS postgres-exporter
FROM quay.io/prometheuscommunity/pgbouncer-exporter:v0.10.2 AS pgbouncer-exporter
# Keep the version the same as in build-tools.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
FROM burningalchemist/sql_exporter:0.17.0 AS sql-exporter
#########################################################################################
#
@@ -1426,9 +1353,6 @@ COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
#pg_anon is not supported yet for pg v17 so, don't fail if nothing found
COPY --from=pg-anon-pg-build /pg_anon.tar.g? /ext-src
COPY compute/patches/pg_anon.patch /ext-src
COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
RUN cd /ext-src/ && for f in *.tar.gz; \
@@ -1439,9 +1363,6 @@ RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
RUN case "${PG_VERSION}" in "v17") \
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
esac && patch -p1 </ext-src/pg_anon.patch
RUN patch -p1 </ext-src/pg_cron.patch
ENV PATH=/usr/local/pgsql/bin:$PATH
ENV PGHOST=compute
@@ -1482,6 +1403,7 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
# Metrics exporter binaries and configuration files
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
COPY --from=pgbouncer-exporter /bin/pgbouncer_exporter /bin/pgbouncer_exporter
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml

View File

@@ -19,6 +19,8 @@ max_prepared_statements=0
admin_users=postgres
unix_socket_dir=/tmp/
unix_socket_mode=0777
; required for pgbouncer_exporter
ignore_startup_parameters=extra_float_digits
;; Disable connection logging. It produces a lot of logs that no one looks at,
;; and we can get similar log entries from the proxy too. We had incidents in

View File

@@ -1,8 +1,24 @@
diff --git a/Makefile b/Makefile
index 7a4b88c..56678af 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,10 @@ EXTVERSION = 0.8.0
MODULE_big = vector
DATA = $(wildcard sql/*--*--*.sql)
-DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql
+# This change is needed to install different per-version SQL files
+# like pgvector--0.8.0.sql and pgvector--0.7.4.sql
+# The corresponding file is downloaded during the Docker image build process
+DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql sql/vector--0.7.4.sql
OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o
HEADERS = src/halfvec.h src/sparsevec.h src/vector.h
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
index dcfb2bd..d5189ee 100644
index b667478..fc1897c 100644
--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
@@ -843,9 +843,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
@@ -20,7 +36,7 @@ index dcfb2bd..d5189ee 100644
/* Close relations within worker */
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
@@ -1100,12 +1108,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
SeedRandom(42);
#endif

View File

@@ -27,6 +27,10 @@ commands:
user: nobody
sysvInitAction: respawn
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
- name: pgbouncer-exporter
user: postgres
sysvInitAction: respawn
shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"'
- name: sql-exporter
user: nobody
sysvInitAction: respawn

View File

@@ -27,6 +27,10 @@ commands:
user: nobody
sysvInitAction: respawn
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
- name: pgbouncer-exporter
user: postgres
sysvInitAction: respawn
shell: '/bin/pgbouncer_exporter --pgBouncer.connectionString="postgres:///pgbouncer?host=/tmp&port=6432&dbname=pgbouncer&user=pgbouncer"'
- name: sql-exporter
user: nobody
sysvInitAction: respawn

View File

@@ -15,6 +15,7 @@ aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-sdk-kms.workspace = true
anyhow.workspace = true
axum = { workspace = true, features = [] }
camino.workspace = true
chrono.workspace = true
cfg-if.workspace = true
@@ -22,7 +23,7 @@ clap.workspace = true
fail.workspace = true
flate2.workspace = true
futures.workspace = true
hyper0 = { workspace = true, features = ["full"] }
http.workspace = true
metrics.workspace = true
nix.workspace = true
notify.workspace = true
@@ -37,6 +38,8 @@ serde_with.workspace = true
serde_json.workspace = true
signal-hook.workspace = true
tar.workspace = true
tower.workspace = true
tower-http.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
@@ -48,6 +51,7 @@ tracing-subscriber.workspace = true
tracing-utils.workspace = true
thiserror.workspace = true
url.workspace = true
uuid.workspace = true
prometheus.workspace = true
postgres_initdb.workspace = true

View File

@@ -60,7 +60,7 @@ use compute_tools::compute::{
};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::get_pg_version_string;
use compute_tools::http::api::launch_http_server;
use compute_tools::http::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
@@ -111,11 +111,6 @@ fn main() -> Result<()> {
fn init() -> Result<(String, clap::ArgMatches)> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
opentelemetry::global::set_error_handler(|err| {
tracing::info!("OpenTelemetry error: {err}");
})
.expect("global error handler lock poisoned");
let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
thread::spawn(move || {
for sig in signals.forever() {
@@ -493,7 +488,10 @@ fn start_postgres(
let mut pg = None;
if !prestartup_failed {
pg = match compute.start_compute() {
Ok(pg) => Some(pg),
Ok(pg) => {
info!(postmaster_pid = %pg.0.id(), "Postgres was started");
Some(pg)
}
Err(err) => {
error!("could not start the compute node: {:#}", err);
compute.set_failed_status(err);
@@ -591,6 +589,8 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
// propagate to Postgres and it will be shut down as well.
let mut exit_code = None;
if let Some((mut pg, logs_handle)) = pg {
info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");

View File

@@ -17,7 +17,7 @@
//!
//! # Local Testing
//!
//! - Comment out most of the pgxns in The Dockerfile.compute-tools to speed up the build.
//! - Comment out most of the pgxns in compute-node.Dockerfile to speed up the build.
//! - Build the image with the following command:
//!
//! ```bash
@@ -31,7 +31,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use clap::Parser;
use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
use nix::unistd::Pid;
use tracing::{info, info_span, warn, Instrument};
use tracing::{error, info, info_span, warn, Instrument};
use utils::fs_ext::is_directory_empty;
#[path = "fast_import/aws_s3_sync.rs"]
@@ -41,16 +41,25 @@ mod child_stdio_to_log;
#[path = "fast_import/s3_uri.rs"]
mod s3_uri;
const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600);
const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300);
#[derive(clap::Parser)]
struct Args {
#[clap(long)]
working_directory: Utf8PathBuf,
#[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
s3_prefix: s3_uri::S3Uri,
s3_prefix: Option<s3_uri::S3Uri>,
#[clap(long)]
source_connection_string: Option<String>,
#[clap(short, long)]
interactive: bool,
#[clap(long)]
pg_bin_dir: Utf8PathBuf,
#[clap(long)]
pg_lib_dir: Utf8PathBuf,
#[clap(long)]
pg_port: Option<u16>, // port to run postgres on, 5432 is default
}
#[serde_with::serde_as]
@@ -67,6 +76,13 @@ enum EncryptionSecret {
KMS { key_id: String },
}
// copied from pageserver_api::config::defaults::DEFAULT_LOCALE to avoid dependency just for a constant
const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
"C"
} else {
"C.UTF-8"
};
#[tokio::main]
pub(crate) async fn main() -> anyhow::Result<()> {
utils::logging::init(
@@ -77,30 +93,74 @@ pub(crate) async fn main() -> anyhow::Result<()> {
info!("starting");
let Args {
working_directory,
s3_prefix,
pg_bin_dir,
pg_lib_dir,
} = Args::parse();
let args = Args::parse();
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
// Validate arguments
if args.s3_prefix.is_none() && args.source_connection_string.is_none() {
anyhow::bail!("either s3_prefix or source_connection_string must be specified");
}
if args.s3_prefix.is_some() && args.source_connection_string.is_some() {
anyhow::bail!("only one of s3_prefix or source_connection_string can be specified");
}
let spec: Spec = {
let spec_key = s3_prefix.append("/spec.json");
let s3_client = aws_sdk_s3::Client::new(&aws_config);
let object = s3_client
.get_object()
.bucket(&spec_key.bucket)
.key(spec_key.key)
.send()
.await
.context("get spec from s3")?
.body
.collect()
.await
.context("download spec body")?;
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
let working_directory = args.working_directory;
let pg_bin_dir = args.pg_bin_dir;
let pg_lib_dir = args.pg_lib_dir;
let pg_port = args.pg_port.unwrap_or_else(|| {
info!("pg_port not specified, using default 5432");
5432
});
// Initialize AWS clients only if s3_prefix is specified
let (aws_config, kms_client) = if args.s3_prefix.is_some() {
let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
let kms = aws_sdk_kms::Client::new(&config);
(Some(config), Some(kms))
} else {
(None, None)
};
// Get source connection string either from S3 spec or direct argument
let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix {
let spec: Spec = {
let spec_key = s3_prefix.append("/spec.json");
let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
let object = s3_client
.get_object()
.bucket(&spec_key.bucket)
.key(spec_key.key)
.send()
.await
.context("get spec from s3")?
.body
.collect()
.await
.context("download spec body")?;
serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
};
match spec.encryption_secret {
EncryptionSecret::KMS { key_id } => {
let mut output = kms_client
.unwrap()
.decrypt()
.key_id(key_id)
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
spec.source_connstring_ciphertext_base64,
))
.send()
.await
.context("decrypt source connection string")?;
let plaintext = output
.plaintext
.take()
.context("get plaintext source connection string")?;
String::from_utf8(plaintext.into_inner())
.context("parse source connection string as utf8")?
}
}
} else {
args.source_connection_string.unwrap()
};
match tokio::fs::create_dir(&working_directory).await {
@@ -123,15 +183,6 @@ pub(crate) async fn main() -> anyhow::Result<()> {
.await
.context("create pgdata directory")?;
//
// Setup clients
//
let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
let kms_client = aws_sdk_kms::Client::new(&aws_config);
//
// Initialize pgdata
//
let pgbin = pg_bin_dir.join("postgres");
let pg_version = match get_pg_version(pgbin.as_ref()) {
PostgresMajorVersion::V14 => 14,
@@ -142,7 +193,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded
postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
superuser,
locale: "en_US.UTF-8", // XXX: this shouldn't be hard-coded,
locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
pg_version,
initdb_bin: pg_bin_dir.join("initdb").as_ref(),
library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
@@ -159,6 +210,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
let mut postgres_proc = tokio::process::Command::new(pgbin)
.arg("-D")
.arg(&pgdata_dir)
.args(["-p", &format!("{pg_port}")])
.args(["-c", "wal_level=minimal"])
.args(["-c", "shared_buffers=10GB"])
.args(["-c", "max_wal_senders=0"])
@@ -170,8 +222,15 @@ pub(crate) async fn main() -> anyhow::Result<()> {
.args(["-c", &format!("max_parallel_workers={nproc}")])
.args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
.args(["-c", &format!("max_worker_processes={nproc}")])
.args(["-c", "effective_io_concurrency=100"])
.args([
"-c",
&format!(
"effective_io_concurrency={}",
if cfg!(target_os = "macos") { 0 } else { 100 }
),
])
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()
@@ -185,44 +244,58 @@ pub(crate) async fn main() -> anyhow::Result<()> {
)
.instrument(info_span!("postgres")),
);
// Create neondb database in the running postgres
let restore_pg_connstring =
format!("host=localhost port=5432 user={superuser} dbname=postgres");
format!("host=localhost port={pg_port} user={superuser} dbname=postgres");
let start_time = std::time::Instant::now();
loop {
let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await;
if res.is_ok() {
info!("postgres is ready, could connect to it");
break;
if start_time.elapsed() > PG_WAIT_TIMEOUT {
error!(
"timeout exceeded: failed to poll postgres and create database within 10 minutes"
);
std::process::exit(1);
}
match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await {
Ok((client, connection)) => {
// Spawn the connection handling task to maintain the connection
tokio::spawn(async move {
if let Err(e) = connection.await {
warn!("connection error: {}", e);
}
});
match client.simple_query("CREATE DATABASE neondb;").await {
Ok(_) => {
info!("created neondb database");
break;
}
Err(e) => {
warn!(
"failed to create database: {}, retying in {}s",
e,
PG_WAIT_RETRY_INTERVAL.as_secs_f32()
);
tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
continue;
}
}
}
Err(_) => {
info!(
"postgres not ready yet, retrying in {}s",
PG_WAIT_RETRY_INTERVAL.as_secs_f32()
);
tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await;
continue;
}
}
}
//
// Decrypt connection string
//
let source_connection_string = {
match spec.encryption_secret {
EncryptionSecret::KMS { key_id } => {
let mut output = kms_client
.decrypt()
.key_id(key_id)
.ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
spec.source_connstring_ciphertext_base64,
))
.send()
.await
.context("decrypt source connection string")?;
let plaintext = output
.plaintext
.take()
.context("get plaintext source connection string")?;
String::from_utf8(plaintext.into_inner())
.context("parse source connection string as utf8")?
}
}
};
//
// Start the work
//
let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb");
let dumpdir = working_directory.join("dumpdir");
@@ -256,6 +329,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
.arg(&source_connection_string)
// how we run it
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir)
.kill_on_drop(true)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
@@ -289,6 +363,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
.arg(&dumpdir)
// how we run it
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir)
.kill_on_drop(true)
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
@@ -310,6 +385,12 @@ pub(crate) async fn main() -> anyhow::Result<()> {
}
}
// If interactive mode, wait for Ctrl+C
if args.interactive {
info!("Running in interactive mode. Press Ctrl+C to shut down.");
tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
}
info!("shutdown postgres");
{
nix::sys::signal::kill(
@@ -325,21 +406,24 @@ pub(crate) async fn main() -> anyhow::Result<()> {
.context("wait for postgres to shut down")?;
}
info!("upload pgdata");
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
.await
.context("sync dump directory to destination")?;
info!("write status");
{
let status_dir = working_directory.join("status");
std::fs::create_dir(&status_dir).context("create status directory")?;
let status_file = status_dir.join("pgdata");
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
.context("write status file")?;
aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
// Only sync if s3_prefix was specified
if let Some(s3_prefix) = args.s3_prefix {
info!("upload pgdata");
aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
.await
.context("sync status directory to destination")?;
.context("sync dump directory to destination")?;
info!("write status");
{
let status_dir = working_directory.join("status");
std::fs::create_dir(&status_dir).context("create status directory")?;
let status_file = status_dir.join("pgdata");
std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
.context("write status file")?;
aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
.await
.context("sync status directory to destination")?;
}
}
Ok(())
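(Editorial aside, not part of the diff above: the new argument handling in this file enforces "exactly one of `s3_prefix` / `source_connection_string`" with manual `bail!` checks near the top of `main`. As a rough sketch, assuming the project's clap version supports these derive attributes, the same rule can be expressed declaratively; field names mirror the diff, and `S3Uri` is replaced with `String` to keep the example self-contained.)
```rust
use clap::Parser;

/// Minimal sketch: encode the "exactly one of the two sources" rule in clap itself
/// instead of the manual bail!() checks shown in the diff.
#[derive(Parser)]
struct Args {
    /// Mutually exclusive with --source-connection-string, but one of the two is required.
    #[clap(long, conflicts_with = "source_connection_string",
           required_unless_present = "source_connection_string")]
    s3_prefix: Option<String>,
    #[clap(long)]
    source_connection_string: Option<String>,
}

fn main() {
    let args = Args::parse();
    println!(
        "s3_prefix={:?} source_connection_string={:?}",
        args.s3_prefix, args.source_connection_string
    );
}
```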

View File

@@ -36,11 +36,11 @@ pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<Cat
#[derive(Debug, thiserror::Error)]
pub enum SchemaDumpError {
#[error("Database does not exist.")]
#[error("database does not exist")]
DatabaseDoesNotExist,
#[error("Failed to execute pg_dump.")]
#[error("failed to execute pg_dump")]
IO(#[from] std::io::Error),
#[error("Unexpected error.")]
#[error("unexpected I/O error")]
Unexpected,
}

View File

@@ -15,7 +15,7 @@ use std::time::Instant;
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use compute_api::spec::{PgIdent, Role};
use compute_api::spec::{Database, PgIdent, Role};
use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
@@ -41,12 +41,14 @@ use crate::local_proxy;
use crate::pg_helpers::*;
use crate::spec::*;
use crate::spec_apply::ApplySpecPhase::{
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSuperUser,
DropInvalidDatabases, DropRoles, HandleNeonExtension, HandleOtherExtensions,
RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon,
CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
RunInEachDatabase,
};
use crate::spec_apply::PerDatabasePhase;
use crate::spec_apply::PerDatabasePhase::{
ChangeSchemaPerms, DeleteDBRoleReferences, HandleAnonExtension,
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension,
};
use crate::spec_apply::{apply_operations, MutableApplyContext, DB};
use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -338,6 +340,15 @@ impl ComputeNode {
self.state.lock().unwrap().status
}
pub fn get_timeline_id(&self) -> Option<TimelineId> {
self.state
.lock()
.unwrap()
.pspec
.as_ref()
.map(|s| s.timeline_id)
}
// Remove `pgdata` directory and create it again with right permissions.
fn create_pgdata(&self) -> Result<()> {
// Ignore removal error, likely it is a 'No such file or directory (os error 2)'.
@@ -834,7 +845,7 @@ impl ComputeNode {
conf
}
async fn get_maintenance_client(
pub async fn get_maintenance_client(
conf: &tokio_postgres::Config,
) -> Result<tokio_postgres::Client> {
let mut conf = conf.clone();
@@ -927,6 +938,48 @@ impl ComputeNode {
.map(|role| (role.name.clone(), role))
.collect::<HashMap<String, Role>>();
// Check if we need to drop subscriptions before starting the endpoint.
//
// It is important to do this operation exactly once when the endpoint starts on a new branch.
// Otherwise, we may drop subscriptions that were newly created rather than inherited.
//
// We cannot rely only on the spec.drop_subscriptions_before_start flag,
// because if for some reason the compute restarts inside the VM,
// it will start again with the same spec and flag value.
//
// To handle this, we record the fact that the operation was performed
// in the neon.drop_subscriptions_done table.
// If the table does not exist, we assume that the operation was never performed, so we must do it.
// If the table exists, we check whether the operation was already performed on the current timeline.
//
let mut drop_subscriptions_done = false;
if spec.drop_subscriptions_before_start {
let timeline_id = self.get_timeline_id().context("timeline_id must be set")?;
let query = format!("select 1 from neon.drop_subscriptions_done where timeline_id = '{}'", timeline_id);
info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);
drop_subscriptions_done = match
client.simple_query(&query).await {
Ok(result) => {
matches!(&result[0], postgres::SimpleQueryMessage::Row(_))
},
Err(e) =>
{
match e.code() {
Some(&SqlState::UNDEFINED_TABLE) => false,
_ => {
// We don't expect any other error here, except for the schema/table not existing
error!("Error checking if drop subscription operation was already performed: {}", e);
return Err(e.into());
}
}
}
}
};
let jwks_roles = Arc::new(
spec.as_ref()
.local_proxy_config
@@ -943,6 +996,78 @@ impl ComputeNode {
dbs: databases,
}));
// Apply special pre drop database phase.
// NOTE: we use the code of RunInEachDatabase phase for parallelism
// and connection management, but we don't really run it in *each* database,
// only in databases, we're about to drop.
info!("Applying PerDatabase (pre-dropdb) phase");
let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
// Run the phase for each database that we're about to drop.
let db_processes = spec
.delta_operations
.iter()
.flatten()
.filter_map(move |op| {
if op.action.as_str() == "delete_db" {
Some(op.name.clone())
} else {
None
}
})
.map(|dbname| {
let spec = spec.clone();
let ctx = ctx.clone();
let jwks_roles = jwks_roles.clone();
let mut conf = conf.as_ref().clone();
let concurrency_token = concurrency_token.clone();
// We only need dbname field for this phase, so set other fields to dummy values
let db = DB::UserDB(Database {
name: dbname.clone(),
owner: "cloud_admin".to_string(),
options: None,
restrict_conn: false,
invalid: false,
});
debug!("Applying per-database phases for Database {:?}", &db);
match &db {
DB::SystemDB => {}
DB::UserDB(db) => {
conf.dbname(db.name.as_str());
}
}
let conf = Arc::new(conf);
let fut = Self::apply_spec_sql_db(
spec.clone(),
conf,
ctx.clone(),
jwks_roles.clone(),
concurrency_token.clone(),
db,
[DropLogicalSubscriptions].to_vec(),
);
Ok(spawn(fut))
})
.collect::<Vec<Result<_, anyhow::Error>>>();
for process in db_processes.into_iter() {
let handle = process?;
if let Err(e) = handle.await? {
// Handle the error case where the database does not exist
// We do not check whether the DB exists or not in the deletion phase,
// so we shouldn't be strict about it in pre-deletion cleanup as well.
if e.to_string().contains("does not exist") {
warn!("Error dropping subscription: {}", e);
} else {
return Err(e);
}
};
}
for phase in [
CreateSuperUser,
DropInvalidDatabases,
@@ -950,6 +1075,7 @@ impl ComputeNode {
CreateAndAlterRoles,
RenameAndDeleteDatabases,
CreateAndAlterDatabases,
CreateSchemaNeon,
] {
info!("Applying phase {:?}", &phase);
apply_operations(
@@ -962,7 +1088,7 @@ impl ComputeNode {
.await?;
}
info!("Applying RunInEachDatabase phase");
info!("Applying RunInEachDatabase2 phase");
let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
let db_processes = spec
@@ -990,6 +1116,17 @@ impl ComputeNode {
}
let conf = Arc::new(conf);
let mut phases = vec![
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
];
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
info!("Adding DropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
phases.push(DropLogicalSubscriptions);
}
let fut = Self::apply_spec_sql_db(
spec.clone(),
conf,
@@ -997,6 +1134,7 @@ impl ComputeNode {
jwks_roles.clone(),
concurrency_token.clone(),
db,
phases,
);
Ok(spawn(fut))
@@ -1008,12 +1146,20 @@ impl ComputeNode {
handle.await??;
}
for phase in vec![
let mut phases = vec![
HandleOtherExtensions,
HandleNeonExtension,
HandleNeonExtension, // This step depends on CreateSchemaNeon
CreateAvailabilityCheck,
DropRoles,
] {
];
// This step depends on CreateSchemaNeon
if spec.drop_subscriptions_before_start && !drop_subscriptions_done {
info!("Adding FinalizeDropLogicalSubscriptions phase because drop_subscriptions_before_start is set");
phases.push(FinalizeDropLogicalSubscriptions);
}
for phase in phases {
debug!("Applying phase {:?}", &phase);
apply_operations(
spec.clone(),
@@ -1043,16 +1189,13 @@ impl ComputeNode {
jwks_roles: Arc<HashSet<String>>,
concurrency_token: Arc<tokio::sync::Semaphore>,
db: DB,
subphases: Vec<PerDatabasePhase>,
) -> Result<()> {
let _permit = concurrency_token.acquire().await?;
let mut client_conn = None;
for subphase in [
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
] {
for subphase in subphases {
apply_operations(
spec.clone(),
ctx.clone(),
@@ -1386,6 +1529,14 @@ impl ComputeNode {
Ok(())
},
)?;
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
if config::line_in_file(
&postgresql_conf_path,
"neon.disable_logical_replication_subscribers=false",
)? {
info!("updated postgresql.conf to set neon.disable_logical_replication_subscribers=false");
}
self.pg_reload_conf()?;
}
self.post_apply_config()?;
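(Editorial aside, not part of the diff above: the `drop_subscriptions_before_start` comment earlier in this file describes per-timeline bookkeeping through a `neon.drop_subscriptions_done` table. Below is a minimal, hypothetical sketch of that bookkeeping with `tokio_postgres`. The existence check mirrors the code in the diff; the "mark done" half and the table DDL are assumptions, since the `FinalizeDropLogicalSubscriptions` phase itself is not shown in this compare.)
```rust
use tokio_postgres::{error::SqlState, Client, SimpleQueryMessage};

/// Returns true if the drop-subscriptions step already ran for this timeline.
/// A missing table is treated as "never performed", as in the diff above.
async fn drop_subscriptions_done(client: &Client, timeline_id: &str) -> anyhow::Result<bool> {
    // Same simple_query style as the diff; timeline_id is a hex ID, not user input.
    let query = format!(
        "select 1 from neon.drop_subscriptions_done where timeline_id = '{timeline_id}'"
    );
    match client.simple_query(&query).await {
        Ok(rows) => Ok(matches!(rows.first(), Some(SimpleQueryMessage::Row(_)))),
        Err(e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => Ok(false),
        Err(e) => Err(e.into()),
    }
}

/// Hypothetical finalize step: record that the drop ran on this timeline.
async fn mark_drop_subscriptions_done(client: &Client, timeline_id: &str) -> anyhow::Result<()> {
    client
        .simple_query(
            "create table if not exists neon.drop_subscriptions_done (timeline_id text)",
        )
        .await?;
    client
        .simple_query(&format!(
            "insert into neon.drop_subscriptions_done values ('{timeline_id}')"
        ))
        .await?;
    Ok(())
}
```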

View File

@@ -129,6 +129,13 @@ pub fn write_postgres_conf(
writeln!(file, "neon.extension_server_port={}", extension_server_port)?;
if spec.drop_subscriptions_before_start {
writeln!(file, "neon.disable_logical_replication_subscribers=true")?;
} else {
// be explicit about the default value
writeln!(file, "neon.disable_logical_replication_subscribers=false")?;
}
// This is essential to keep this line at the end of the file,
// because it is intended to override any settings above.
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;

View File
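(Editorial aside: the deleted file below is the old hyper-based HTTP router for compute_ctl. Per the Cargo.toml hunk earlier in this compare, the hyper0 dependency gives way to axum/tower, and `main.rs` now imports `compute_tools::http::launch_http_server`. The replacement module is not part of this compare, so the following is only a hypothetical sketch of what an axum-style router for the same endpoints could look like. Route paths come from the removed code; the port, handler bodies, and state wiring are assumptions.)
```rust
use axum::{routing::get, Json, Router};
use serde_json::{json, Value};

// Stub handlers: the real ones serialize ComputeStatusResponse / startup metrics
// from the shared compute state.
async fn status() -> Json<Value> {
    Json(json!({ "status": "running" }))
}

async fn metrics_json() -> Json<Value> {
    Json(json!({}))
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let app = Router::new()
        .route("/status", get(status))
        .route("/metrics.json", get(metrics_json));
    // Port is assumed for illustration only.
    let listener = tokio::net::TcpListener::bind("0.0.0.0:3080").await?;
    axum::serve(listener, app).await?;
    Ok(())
}
```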

@@ -1,606 +0,0 @@
use std::convert::Infallible;
use std::net::IpAddr;
use std::net::Ipv6Addr;
use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
use crate::catalog::SchemaDumpError;
use crate::catalog::{get_database_schema, get_dbs_and_roles};
use crate::compute::forward_termination_signal;
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
use crate::installed_extensions;
use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest};
use compute_api::responses::{
ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError,
SetRoleGrantsResponse,
};
use anyhow::Result;
use hyper::header::CONTENT_TYPE;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use metrics::proto::MetricFamily;
use metrics::Encoder;
use metrics::TextEncoder;
use tokio::task;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, warn};
use tracing_utils::http::OtelName;
use utils::failpoint_support::failpoints_handler;
use utils::http::error::ApiError;
use utils::http::request::must_get_query_param;
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
ComputeStatusResponse {
start_time: state.start_time,
tenant: state
.pspec
.as_ref()
.map(|pspec| pspec.tenant_id.to_string()),
timeline: state
.pspec
.as_ref()
.map(|pspec| pspec.timeline_id.to_string()),
status: state.status,
last_active: state.last_active,
error: state.error.clone(),
}
}
// Service function to handle all available routes.
async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
//
// NOTE: The URI path is currently included in traces. That's OK because
// it doesn't contain any variable parts or sensitive information. But
// please keep that in mind if you change the routing here.
//
match (req.method(), req.uri().path()) {
// Serialized compute state.
(&Method::GET, "/status") => {
debug!("serving /status GET request");
let state = compute.state.lock().unwrap();
let status_response = status_response_from_state(&state);
Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
}
// Startup metrics in JSON format. Keep /metrics reserved for a possible
// future use for Prometheus metrics format.
(&Method::GET, "/metrics.json") => {
info!("serving /metrics.json GET request");
let metrics = compute.state.lock().unwrap().metrics.clone();
Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
}
// Prometheus metrics
(&Method::GET, "/metrics") => {
debug!("serving /metrics GET request");
// When we call TextEncoder::encode() below, it will immediately
// return an error if a metric family has no metrics, so we need to
// preemptively filter out metric families with no metrics.
let metrics = installed_extensions::collect()
.into_iter()
.filter(|m| !m.get_metric().is_empty())
.collect::<Vec<MetricFamily>>();
let encoder = TextEncoder::new();
let mut buffer = vec![];
if let Err(err) = encoder.encode(&metrics, &mut buffer) {
let msg = format!("error handling /metrics request: {err}");
error!(msg);
return render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR);
}
match Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, encoder.format_type())
.body(Body::from(buffer))
{
Ok(response) => response,
Err(err) => {
let msg = format!("error handling /metrics request: {err}");
error!(msg);
render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// Collect Postgres current usage insights
(&Method::GET, "/insights") => {
info!("serving /insights GET request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!("compute is not running, current status: {:?}", status);
error!(msg);
return Response::new(Body::from(msg));
}
let insights = compute.collect_insights().await;
Response::new(Body::from(insights))
}
(&Method::POST, "/check_writability") => {
info!("serving /check_writability POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for check_writability request: {:?}",
status
);
error!(msg);
return Response::new(Body::from(msg));
}
let res = crate::checker::check_writability(compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
Err(e) => {
error!("check_writability failed: {}", e);
Response::new(Body::from(e.to_string()))
}
}
}
(&Method::POST, "/extensions") => {
info!("serving /extensions POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for extensions request: {:?}",
status
);
error!(msg);
return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
}
let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
let request = serde_json::from_slice::<ExtensionInstallRequest>(&request).unwrap();
let res = compute
.install_extension(&request.extension, &request.database, request.version)
.await;
match res {
Ok(version) => render_json(Body::from(
serde_json::to_string(&ExtensionInstallResult {
extension: request.extension,
version,
})
.unwrap(),
)),
Err(e) => {
error!("install_extension failed: {}", e);
render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::GET, "/info") => {
let num_cpus = num_cpus::get_physical();
info!("serving /info GET request. num_cpus: {}", num_cpus);
Response::new(Body::from(
serde_json::json!({
"num_cpus": num_cpus,
})
.to_string(),
))
}
// Accept spec in JSON format and request compute configuration. If
// anything goes wrong after we set the compute status to `ConfigurationPending`
// and update compute state with new spec, we basically leave compute
// in the potentially wrong state. That said, it's control-plane's
// responsibility to watch compute state after reconfiguration request
// and to clean restart in case of errors.
(&Method::POST, "/configure") => {
info!("serving /configure POST request");
match handle_configure_request(req, compute).await {
Ok(msg) => Response::new(Body::from(msg)),
Err((msg, code)) => {
error!("error handling /configure request: {msg}");
render_json_error(&msg, code)
}
}
}
(&Method::POST, "/terminate") => {
info!("serving /terminate POST request");
match handle_terminate_request(compute).await {
Ok(()) => Response::new(Body::empty()),
Err((msg, code)) => {
error!("error handling /terminate request: {msg}");
render_json_error(&msg, code)
}
}
}
(&Method::GET, "/dbs_and_roles") => {
info!("serving /dbs_and_roles GET request",);
match get_dbs_and_roles(compute).await {
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
Err(_) => {
render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::GET, "/database_schema") => {
let database = match must_get_query_param(&req, "database") {
Err(e) => return e.into_response(),
Ok(database) => database,
};
info!("serving /database_schema GET request with database: {database}",);
match get_database_schema(compute, &database).await {
Ok(res) => render_plain(Body::wrap_stream(res)),
Err(SchemaDumpError::DatabaseDoesNotExist) => {
render_json_error("database does not exist", StatusCode::NOT_FOUND)
}
Err(e) => {
error!("can't get schema dump: {}", e);
render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
(&Method::POST, "/grants") => {
info!("serving /grants POST request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for set_role_grants request: {:?}",
status
);
error!(msg);
return render_json_error(&msg, StatusCode::PRECONDITION_FAILED);
}
let request = hyper::body::to_bytes(req.into_body()).await.unwrap();
let request = serde_json::from_slice::<SetRoleGrantsRequest>(&request).unwrap();
let res = compute
.set_role_grants(
&request.database,
&request.schema,
&request.privileges,
&request.role,
)
.await;
match res {
Ok(()) => render_json(Body::from(
serde_json::to_string(&SetRoleGrantsResponse {
database: request.database,
schema: request.schema,
role: request.role,
privileges: request.privileges,
})
.unwrap(),
)),
Err(e) => render_json_error(
&format!("could not grant role privileges to the schema: {e}"),
// TODO: can we filter on role/schema not found errors
// and return appropriate error code?
StatusCode::INTERNAL_SERVER_ERROR,
),
}
}
// get the list of installed extensions
// currently only used in python tests
// TODO: call it from cplane
(&Method::GET, "/installed_extensions") => {
info!("serving /installed_extensions GET request");
let status = compute.get_status();
if status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for extensions request: {:?}",
status
);
error!(msg);
return Response::new(Body::from(msg));
}
let conf = compute.get_conn_conf(None);
let res =
task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
.await
.unwrap();
match res {
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
Err(e) => render_json_error(
&format!("could not get list of installed extensions: {}", e),
StatusCode::INTERNAL_SERVER_ERROR,
),
}
}
(&Method::POST, "/failpoints") if cfg!(feature = "testing") => {
match failpoints_handler(req, CancellationToken::new()).await {
Ok(r) => r,
Err(ApiError::BadRequest(e)) => {
render_json_error(&e.to_string(), StatusCode::BAD_REQUEST)
}
Err(_) => {
render_json_error("Internal server error", StatusCode::INTERNAL_SERVER_ERROR)
}
}
}
// download extension files from remote extension storage on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
info!("req.uri {:?}", req.uri());
// don't even try to download extensions
// if no remote storage is configured
if compute.ext_remote_storage.is_none() {
info!("no extensions remote storage configured");
let mut resp = Response::new(Body::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
let mut is_library = false;
if let Some(params) = req.uri().query() {
info!("serving {:?} POST request with params: {}", route, params);
if params == "is_library=true" {
is_library = true;
} else {
let mut resp = Response::new(Body::from("Wrong request parameters"));
*resp.status_mut() = StatusCode::BAD_REQUEST;
return resp;
}
}
let filename = route.split('/').last().unwrap().to_string();
info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
// get ext_name and path from spec
// don't lock compute_state for too long
let ext = {
let compute_state = compute.state.lock().unwrap();
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
// debug only
info!("spec: {:?}", spec);
let remote_extensions = match spec.remote_extensions.as_ref() {
Some(r) => r,
None => {
info!("no remote extensions spec was provided");
let mut resp = Response::new(Body::from("no remote storage configured"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
};
remote_extensions.get_ext(
&filename,
is_library,
&compute.build_tag,
&compute.pgversion,
)
};
match ext {
Ok((ext_name, ext_path)) => {
match compute.download_extension(ext_name, ext_path).await {
Ok(_) => Response::new(Body::from("OK")),
Err(e) => {
error!("extension download failed: {}", e);
let mut resp = Response::new(Body::from(e.to_string()));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
}
Err(e) => {
warn!("extension download failed to find extension: {}", e);
let mut resp = Response::new(Body::from("failed to find file"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
}
// Return `404 Not Found` for any other routes.
_ => {
let mut not_found = Response::new(Body::from("404 Not Found"));
*not_found.status_mut() = StatusCode::NOT_FOUND;
not_found
}
}
}
async fn handle_configure_request(
req: Request<Body>,
compute: &Arc<ComputeNode>,
) -> Result<String, (String, StatusCode)> {
if !compute.live_config_allowed {
return Err((
"live configuration is not allowed for this compute node".to_string(),
StatusCode::PRECONDITION_FAILED,
));
}
let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap();
let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap();
if let Ok(request) = serde_json::from_str::<ConfigurationRequest>(&spec_raw) {
let spec = request.spec;
let parsed_spec = match ParsedSpec::try_from(spec) {
Ok(ps) => ps,
Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
};
// XXX: wrap the state update under lock in a code block. Otherwise,
// we will try to `Send` `mut state` into the spawned thread
// below, which will cause this error:
// ```
// error: future cannot be sent between threads safely
// ```
{
let mut state = compute.state.lock().unwrap();
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for configuration request: {:?}",
state.status.clone()
);
return Err((msg, StatusCode::PRECONDITION_FAILED));
}
state.pspec = Some(parsed_spec);
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
drop(state);
info!("set new spec and notified waiters");
}
// Spawn a blocking thread to wait for compute to become Running.
// This is needed to avoid blocking the main pool of workers, so we can
// still serve other requests while some particular request
// is waiting for compute to finish configuration.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Running {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become Running, current status: {:?}",
state.status
);
if state.status == ComputeStatus::Failed {
let err = state.error.as_ref().map_or("unknown error", |x| x);
let msg = format!("compute configuration failed: {:?}", err);
return Err((msg, StatusCode::INTERNAL_SERVER_ERROR));
}
}
Ok(())
})
.await
.unwrap()?;
// Return current compute state if everything went well.
let state = compute.state.lock().unwrap().clone();
let status_response = status_response_from_state(&state);
Ok(serde_json::to_string(&status_response).unwrap())
} else {
Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST))
}
}
fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
let error = GenericAPIError {
error: e.to_string(),
};
Response::builder()
.status(status)
.header(CONTENT_TYPE, "application/json")
.body(Body::from(serde_json::to_string(&error).unwrap()))
.unwrap()
}
fn render_json(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "application/json")
.body(body)
.unwrap()
}
fn render_plain(body: Body) -> Response<Body> {
Response::builder()
.header(CONTENT_TYPE, "text/plain")
.body(body)
.unwrap()
}
async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
{
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::Terminated {
return Ok(());
}
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
let msg = format!(
"invalid compute status for termination request: {}",
state.status
);
return Err((msg, StatusCode::PRECONDITION_FAILED));
}
state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
drop(state);
}
forward_termination_signal();
info!("sent signal and notified waiters");
// Spawn a blocking thread to wait for compute to become Terminated.
// This is needed to avoid blocking the main pool of workers, so we can
// still serve other requests while some particular request
// is waiting for compute to finish termination.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Terminated {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become {}, current status: {:?}",
ComputeStatus::Terminated,
state.status
);
}
Ok(())
})
.await
.unwrap()?;
info!("terminated Postgres");
Ok(())
}
// Main Hyper HTTP server function: runs the server and blocks waiting on it forever.
#[tokio::main]
async fn serve(port: u16, state: Arc<ComputeNode>) {
// this usually binds to both IPv4 and IPv6 on linux
// see e.g. https://github.com/rust-lang/rust/pull/34440
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let make_service = make_service_fn(move |_conn| {
let state = state.clone();
async move {
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
let state = state.clone();
async move {
Ok::<_, Infallible>(
// NOTE: We include the URI path in the string. It
// doesn't contain any variable parts or sensitive
// information in this API.
tracing_utils::http::tracing_handler(
req,
|req| routes(req, &state),
OtelName::UriPath,
)
.await,
)
}
}))
}
});
info!("starting HTTP server on {}", addr);
let server = Server::bind(&addr).serve(make_service);
// Run this server forever
if let Err(e) = server.await {
error!("server error: {}", e);
}
}
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state);
Ok(thread::Builder::new()
.name("http-endpoint".into())
.spawn(move || serve(port, state))?)
}

View File

@@ -0,0 +1,44 @@
use std::ops::{Deref, DerefMut};
use axum::extract::{rejection::JsonRejection, FromRequest, Request};
use compute_api::responses::GenericAPIError;
use http::StatusCode;
/// Custom `Json` extractor, so that we can format errors into
/// `JsonResponse<GenericAPIError>`.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct Json<T>(pub T);
impl<S, T> FromRequest<S> for Json<T>
where
axum::Json<T>: FromRequest<S, Rejection = JsonRejection>,
S: Send + Sync,
{
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
match axum::Json::<T>::from_request(req, state).await {
Ok(value) => Ok(Self(value.0)),
Err(rejection) => Err((
rejection.status(),
axum::Json(GenericAPIError {
error: rejection.body_text().to_lowercase(),
}),
)),
}
}
}
impl<T> Deref for Json<T> {
type Target = T;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<T> DerefMut for Json<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
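A minimal usage sketch (not part of the diff above): handlers can take this crate-local `Json<T>` extractor directly, and a malformed body is then reported as a `GenericAPIError` JSON payload instead of axum's default plain-text rejection. The `ExampleBody` type and `/example` route are hypothetical and only illustrate the wiring.

use axum::{response::IntoResponse, routing::post, Router};
use http::StatusCode;
use serde::Deserialize;

#[derive(Deserialize)]
struct ExampleBody {
    name: String,
}

// If deserialization fails, the `Json` extractor above rejects with
// `(StatusCode, axum::Json<GenericAPIError>)`, which axum renders as a JSON body.
async fn example_handler(body: Json<ExampleBody>) -> impl IntoResponse {
    (StatusCode::OK, format!("hello, {}", body.name))
}

fn example_router() -> Router {
    Router::new().route("/example", post(example_handler))
}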

View File

@@ -0,0 +1,7 @@
pub(crate) mod json;
pub(crate) mod path;
pub(crate) mod query;
pub(crate) use json::Json;
pub(crate) use path::Path;
pub(crate) use query::Query;

View File

@@ -0,0 +1,44 @@
use std::ops::{Deref, DerefMut};
use axum::extract::{rejection::PathRejection, FromRequestParts};
use compute_api::responses::GenericAPIError;
use http::{request::Parts, StatusCode};
/// Custom `Path` extractor, so that we can format errors into
/// `JsonResponse<GenericAPIError>`.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct Path<T>(pub T);
impl<S, T> FromRequestParts<S> for Path<T>
where
axum::extract::Path<T>: FromRequestParts<S, Rejection = PathRejection>,
S: Send + Sync,
{
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
match axum::extract::Path::<T>::from_request_parts(parts, state).await {
Ok(value) => Ok(Self(value.0)),
Err(rejection) => Err((
rejection.status(),
axum::Json(GenericAPIError {
error: rejection.body_text().to_ascii_lowercase(),
}),
)),
}
}
}
impl<T> Deref for Path<T> {
type Target = T;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<T> DerefMut for Path<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}

View File

@@ -0,0 +1,44 @@
use std::ops::{Deref, DerefMut};
use axum::extract::{rejection::QueryRejection, FromRequestParts};
use compute_api::responses::GenericAPIError;
use http::{request::Parts, StatusCode};
/// Custom `Query` extractor, so that we can format errors into
/// `JsonResponse<GenericAPIError>`.
#[derive(Debug, Clone, Copy, Default)]
pub(crate) struct Query<T>(pub T);
impl<S, T> FromRequestParts<S> for Query<T>
where
axum::extract::Query<T>: FromRequestParts<S, Rejection = QueryRejection>,
S: Send + Sync,
{
type Rejection = (StatusCode, axum::Json<GenericAPIError>);
async fn from_request_parts(parts: &mut Parts, state: &S) -> Result<Self, Self::Rejection> {
match axum::extract::Query::<T>::from_request_parts(parts, state).await {
Ok(value) => Ok(Self(value.0)),
Err(rejection) => Err((
rejection.status(),
axum::Json(GenericAPIError {
error: rejection.body_text().to_ascii_lowercase(),
}),
)),
}
}
}
impl<T> Deref for Query<T> {
type Target = T;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl<T> DerefMut for Query<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}

View File

@@ -1 +1,56 @@
pub mod api;
use axum::{body::Body, response::Response};
use compute_api::responses::{ComputeStatus, GenericAPIError};
use http::{header::CONTENT_TYPE, StatusCode};
use serde::Serialize;
use tracing::error;
pub use server::launch_http_server;
mod extract;
mod routes;
mod server;
/// Convenience response builder for JSON responses
struct JsonResponse;
impl JsonResponse {
/// Helper for actually creating a response
fn create_response(code: StatusCode, body: impl Serialize) -> Response {
Response::builder()
.status(code)
.header(CONTENT_TYPE.as_str(), "application/json")
.body(Body::from(serde_json::to_string(&body).unwrap()))
.unwrap()
}
/// Create a successful (2xx) response
pub(self) fn success(code: StatusCode, body: impl Serialize) -> Response {
assert!({
let code = code.as_u16();
(200..300).contains(&code)
});
Self::create_response(code, body)
}
/// Create an error response
pub(self) fn error(code: StatusCode, error: impl ToString) -> Response {
assert!(code.as_u16() >= 400);
let message = error.to_string();
error!(message);
Self::create_response(code, &GenericAPIError { error: message })
}
/// Create an error response related to the compute being in an invalid state
pub(self) fn invalid_status(status: ComputeStatus) -> Response {
Self::create_response(
StatusCode::PRECONDITION_FAILED,
&GenericAPIError {
error: format!("invalid compute status: {status}"),
},
)
}
}
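A minimal sketch of how route handlers are expected to use this builder, assuming they live in a module where `JsonResponse` is visible; `do_work` is a hypothetical stand-in for real compute logic:

use axum::response::Response;
use http::StatusCode;

// Hypothetical fallible operation standing in for real compute logic.
async fn do_work() -> Result<&'static str, String> {
    Ok("done")
}

async fn example_handler() -> Response {
    match do_work().await {
        // Any `Serialize` body works for the success path.
        Ok(body) => JsonResponse::success(StatusCode::OK, body),
        // `error` logs the message and wraps it in `GenericAPIError`.
        Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
    }
}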

View File

@@ -37,7 +37,7 @@ paths:
schema:
$ref: "#/components/schemas/ComputeMetrics"
/metrics
/metrics:
get:
tags:
- Info

View File

@@ -0,0 +1,20 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use crate::{checker::check_writability, compute::ComputeNode, http::JsonResponse};
/// Check that the compute is running and that Postgres is writable.
pub(in crate::http) async fn is_writable(State(compute): State<Arc<ComputeNode>>) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
match check_writability(&compute).await {
Ok(_) => JsonResponse::success(StatusCode::OK, true),
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
}
}

View File

@@ -0,0 +1,91 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::{
requests::ConfigurationRequest,
responses::{ComputeStatus, ComputeStatusResponse},
};
use http::StatusCode;
use tokio::task;
use tracing::info;
use crate::{
compute::{ComputeNode, ParsedSpec},
http::{extract::Json, JsonResponse},
};
// Accept a spec in JSON format and request compute configuration. If anything
// goes wrong after we set the compute status to `ConfigurationPending` and
// update the compute state with the new spec, we basically leave the compute in a
// potentially wrong state. That said, it's the control plane's responsibility to
// watch the compute state after a reconfiguration request and to do a clean
// restart in case of errors.
pub(in crate::http) async fn configure(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigurationRequest>,
) -> Response {
if !compute.live_config_allowed {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"live configuration is not allowed for this compute node".to_string(),
);
}
let pspec = match ParsedSpec::try_from(request.spec.clone()) {
Ok(p) => p,
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
};
// XXX: wrap state update under lock in a code block. Otherwise, we will try
// to `Send` `mut state` into the spawned thread below, which will cause
// the following rustc error:
//
// error: future cannot be sent between threads safely
{
let mut state = compute.state.lock().unwrap();
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
return JsonResponse::invalid_status(state.status);
}
state.pspec = Some(pspec);
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
drop(state);
}
// Spawn a blocking thread to wait for compute to become Running. This is
// needed to avoid blocking the main pool of workers, so we can still serve
// other requests while some particular request is waiting for compute to
// finish configuration.
let c = compute.clone();
let completed = task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Running {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become {}, current status: {}",
ComputeStatus::Running,
state.status
);
if state.status == ComputeStatus::Failed {
let err = state.error.as_ref().map_or("unknown error", |x| x);
let msg = format!("compute configuration failed: {:?}", err);
return Err(msg);
}
}
Ok(())
})
.await
.unwrap();
if let Err(e) = completed {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
}
// Return current compute state if everything went well.
let state = compute.state.lock().unwrap().clone();
let body = ComputeStatusResponse::from(&state);
JsonResponse::success(StatusCode::OK, body)
}

View File

@@ -0,0 +1,34 @@
use std::sync::Arc;
use axum::{body::Body, extract::State, response::Response};
use http::{header::CONTENT_TYPE, StatusCode};
use serde::Deserialize;
use crate::{
catalog::{get_database_schema, SchemaDumpError},
compute::ComputeNode,
http::{extract::Query, JsonResponse},
};
#[derive(Debug, Clone, Deserialize)]
pub(in crate::http) struct DatabaseSchemaParams {
database: String,
}
/// Get a schema dump of the requested database.
pub(in crate::http) async fn get_schema_dump(
params: Query<DatabaseSchemaParams>,
State(compute): State<Arc<ComputeNode>>,
) -> Response {
match get_database_schema(&compute, &params.database).await {
Ok(schema) => Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE.as_str(), "application/json")
.body(Body::from_stream(schema))
.unwrap(),
Err(SchemaDumpError::DatabaseDoesNotExist) => {
JsonResponse::error(StatusCode::NOT_FOUND, SchemaDumpError::DatabaseDoesNotExist)
}
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
}
}

View File

@@ -0,0 +1,16 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use http::StatusCode;
use crate::{catalog::get_dbs_and_roles, compute::ComputeNode, http::JsonResponse};
/// Get the databases and roles from the compute.
pub(in crate::http) async fn get_catalog_objects(
State(compute): State<Arc<ComputeNode>>,
) -> Response {
match get_dbs_and_roles(&compute).await {
Ok(catalog_objects) => JsonResponse::success(StatusCode::OK, catalog_objects),
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
}
}

View File

@@ -0,0 +1,68 @@
use std::sync::Arc;
use axum::{
extract::State,
response::{IntoResponse, Response},
};
use http::StatusCode;
use serde::Deserialize;
use crate::{
compute::ComputeNode,
http::{
extract::{Path, Query},
JsonResponse,
},
};
#[derive(Debug, Clone, Deserialize)]
pub(in crate::http) struct ExtensionServerParams {
#[serde(default)]
is_library: bool,
}
/// Download a remote extension.
pub(in crate::http) async fn download_extension(
Path(filename): Path<String>,
params: Query<ExtensionServerParams>,
State(compute): State<Arc<ComputeNode>>,
) -> Response {
// Don't even try to download extensions if no remote storage is configured
if compute.ext_remote_storage.is_none() {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"remote storage is not configured",
);
}
let ext = {
let state = compute.state.lock().unwrap();
let pspec = state.pspec.as_ref().unwrap();
let spec = &pspec.spec;
let remote_extensions = match spec.remote_extensions.as_ref() {
Some(r) => r,
None => {
return JsonResponse::error(
StatusCode::CONFLICT,
"information about remote extensions is unavailable",
);
}
};
remote_extensions.get_ext(
&filename,
params.is_library,
&compute.build_tag,
&compute.pgversion,
)
};
match ext {
Ok((ext_name, ext_path)) => match compute.download_extension(ext_name, ext_path).await {
Ok(_) => StatusCode::OK.into_response(),
Err(e) => JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e),
},
Err(e) => JsonResponse::error(StatusCode::NOT_FOUND, e),
}
}
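A hedged client-side sketch of how this route is called; `reqwest`, the host, and the port value are assumptions for illustration only and are not used by this module:

use reqwest::Client;

// POST /extension_server/{filename}, optionally with ?is_library=true.
async fn example_download(http_port: u16, filename: &str) -> anyhow::Result<()> {
    let url =
        format!("http://localhost:{http_port}/extension_server/{filename}?is_library=true");
    let resp = Client::new().post(url).send().await?;
    anyhow::ensure!(resp.status().is_success(), "download failed: {}", resp.status());
    Ok(())
}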

View File

@@ -0,0 +1,45 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::{
requests::ExtensionInstallRequest,
responses::{ComputeStatus, ExtensionInstallResponse},
};
use http::StatusCode;
use crate::{
compute::ComputeNode,
http::{extract::Json, JsonResponse},
};
/// Install an extension.
pub(in crate::http) async fn install_extension(
State(compute): State<Arc<ComputeNode>>,
request: Json<ExtensionInstallRequest>,
) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
match compute
.install_extension(
&request.extension,
&request.database,
request.version.to_string(),
)
.await
{
Ok(version) => JsonResponse::success(
StatusCode::CREATED,
Some(ExtensionInstallResponse {
extension: request.extension.clone(),
version,
}),
),
Err(e) => JsonResponse::error(
StatusCode::INTERNAL_SERVER_ERROR,
format!("failed to install extension: {e}"),
),
}
}

View File

@@ -0,0 +1,35 @@
use axum::response::{IntoResponse, Response};
use http::StatusCode;
use tracing::info;
use utils::failpoint_support::{apply_failpoint, ConfigureFailpointsRequest};
use crate::http::{extract::Json, JsonResponse};
/// Configure failpoints for testing purposes.
pub(in crate::http) async fn configure_failpoints(
failpoints: Json<ConfigureFailpointsRequest>,
) -> Response {
if !fail::has_failpoints() {
return JsonResponse::error(
StatusCode::PRECONDITION_FAILED,
"Cannot manage failpoints because neon was compiled without failpoints support",
);
}
for fp in &*failpoints {
info!("cfg failpoint: {} {}", fp.name, fp.actions);
// We recognize one extra "action" that's not natively recognized
// by the failpoints crate: exit, to immediately kill the process
let cfg_result = apply_failpoint(&fp.name, &fp.actions);
if let Err(e) = cfg_result {
return JsonResponse::error(
StatusCode::BAD_REQUEST,
format!("failed to configure failpoints: {e}"),
);
}
}
StatusCode::OK.into_response()
}

View File

@@ -0,0 +1,48 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::{
requests::SetRoleGrantsRequest,
responses::{ComputeStatus, SetRoleGrantsResponse},
};
use http::StatusCode;
use crate::{
compute::ComputeNode,
http::{extract::Json, JsonResponse},
};
/// Add grants for a role.
pub(in crate::http) async fn add_grant(
State(compute): State<Arc<ComputeNode>>,
request: Json<SetRoleGrantsRequest>,
) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
match compute
.set_role_grants(
&request.database,
&request.schema,
&request.privileges,
&request.role,
)
.await
{
Ok(()) => JsonResponse::success(
StatusCode::CREATED,
Some(SetRoleGrantsResponse {
database: request.database.clone(),
schema: request.schema.clone(),
role: request.role.clone(),
privileges: request.privileges.clone(),
}),
),
Err(e) => JsonResponse::error(
StatusCode::INTERNAL_SERVER_ERROR,
format!("failed to grant role privileges to the schema: {e}"),
),
}
}

View File

@@ -0,0 +1,11 @@
use axum::response::Response;
use compute_api::responses::InfoResponse;
use http::StatusCode;
use crate::http::JsonResponse;
/// Get information about the physical characteristics of the compute.
pub(in crate::http) async fn get_info() -> Response {
let num_cpus = num_cpus::get_physical();
JsonResponse::success(StatusCode::OK, &InfoResponse { num_cpus })
}

View File

@@ -0,0 +1,18 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Collect current Postgres usage insights.
pub(in crate::http) async fn get_insights(State(compute): State<Arc<ComputeNode>>) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
let insights = compute.collect_insights().await;
JsonResponse::success(StatusCode::OK, insights)
}

View File

@@ -0,0 +1,33 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use tokio::task;
use crate::{compute::ComputeNode, http::JsonResponse, installed_extensions};
/// Get a list of installed extensions.
pub(in crate::http) async fn get_installed_extensions(
State(compute): State<Arc<ComputeNode>>,
) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
}
let conf = compute.get_conn_conf(None);
let res = task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
.await
.unwrap();
match res {
Ok(installed_extensions) => {
JsonResponse::success(StatusCode::OK, Some(installed_extensions))
}
Err(e) => JsonResponse::error(
StatusCode::INTERNAL_SERVER_ERROR,
format!("failed to get list of installed extensions: {e}"),
),
}
}

View File

@@ -0,0 +1,32 @@
use axum::{body::Body, response::Response};
use http::header::CONTENT_TYPE;
use http::StatusCode;
use metrics::proto::MetricFamily;
use metrics::Encoder;
use metrics::TextEncoder;
use crate::{http::JsonResponse, installed_extensions};
/// Expose Prometheus metrics.
pub(in crate::http) async fn get_metrics() -> Response {
// When we call TextEncoder::encode() below, it will immediately return an
// error if a metric family has no metrics, so we need to preemptively
// filter out metric families with no metrics.
let metrics = installed_extensions::collect()
.into_iter()
.filter(|m| !m.get_metric().is_empty())
.collect::<Vec<MetricFamily>>();
let encoder = TextEncoder::new();
let mut buffer = vec![];
if let Err(e) = encoder.encode(&metrics, &mut buffer) {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
}
Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, encoder.format_type())
.body(Body::from(buffer))
.unwrap()
}

View File

@@ -0,0 +1,12 @@
use std::sync::Arc;
use axum::{extract::State, response::Response};
use http::StatusCode;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Get startup metrics.
pub(in crate::http) async fn get_metrics(State(compute): State<Arc<ComputeNode>>) -> Response {
let metrics = compute.state.lock().unwrap().metrics.clone();
JsonResponse::success(StatusCode::OK, metrics)
}

View File

@@ -0,0 +1,38 @@
use compute_api::responses::ComputeStatusResponse;
use crate::compute::ComputeState;
pub(in crate::http) mod check_writability;
pub(in crate::http) mod configure;
pub(in crate::http) mod database_schema;
pub(in crate::http) mod dbs_and_roles;
pub(in crate::http) mod extension_server;
pub(in crate::http) mod extensions;
pub(in crate::http) mod failpoints;
pub(in crate::http) mod grants;
pub(in crate::http) mod info;
pub(in crate::http) mod insights;
pub(in crate::http) mod installed_extensions;
pub(in crate::http) mod metrics;
pub(in crate::http) mod metrics_json;
pub(in crate::http) mod status;
pub(in crate::http) mod terminate;
impl From<&ComputeState> for ComputeStatusResponse {
fn from(state: &ComputeState) -> Self {
ComputeStatusResponse {
start_time: state.start_time,
tenant: state
.pspec
.as_ref()
.map(|pspec| pspec.tenant_id.to_string()),
timeline: state
.pspec
.as_ref()
.map(|pspec| pspec.timeline_id.to_string()),
status: state.status,
last_active: state.last_active,
error: state.error.clone(),
}
}
}

View File

@@ -0,0 +1,14 @@
use std::{ops::Deref, sync::Arc};
use axum::{extract::State, http::StatusCode, response::Response};
use compute_api::responses::ComputeStatusResponse;
use crate::{compute::ComputeNode, http::JsonResponse};
/// Retrieve the state of the compute.
pub(in crate::http) async fn get_status(State(compute): State<Arc<ComputeNode>>) -> Response {
let state = compute.state.lock().unwrap();
let body = ComputeStatusResponse::from(state.deref());
JsonResponse::success(StatusCode::OK, body)
}

View File

@@ -0,0 +1,58 @@
use std::sync::Arc;
use axum::{
extract::State,
response::{IntoResponse, Response},
};
use compute_api::responses::ComputeStatus;
use http::StatusCode;
use tokio::task;
use tracing::info;
use crate::{
compute::{forward_termination_signal, ComputeNode},
http::JsonResponse,
};
/// Terminate the compute.
pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {
{
let mut state = compute.state.lock().unwrap();
if state.status == ComputeStatus::Terminated {
return StatusCode::CREATED.into_response();
}
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
return JsonResponse::invalid_status(state.status);
}
state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
drop(state);
}
forward_termination_signal();
info!("sent signal and notified waiters");
// Spawn a blocking thread to wait for compute to become Terminated.
// This is needed to avoid blocking the main pool of workers, so we can
// still serve other requests while some particular request
// is waiting for compute to finish termination.
let c = compute.clone();
task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
while state.status != ComputeStatus::Terminated {
state = c.state_changed.wait(state).unwrap();
info!(
"waiting for compute to become {}, current status: {:?}",
ComputeStatus::Terminated,
state.status
);
}
})
.await
.unwrap();
info!("terminated Postgres");
StatusCode::OK.into_response()
}

View File

@@ -0,0 +1,155 @@
use std::{
net::{IpAddr, Ipv6Addr, SocketAddr},
sync::Arc,
thread,
time::Duration,
};
use anyhow::Result;
use axum::{
extract::Request,
middleware::{self, Next},
response::{IntoResponse, Response},
routing::{get, post},
Router,
};
use http::StatusCode;
use tokio::net::TcpListener;
use tower::ServiceBuilder;
use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer};
use tracing::{debug, error, info, Span};
use uuid::Uuid;
use super::routes::{
check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
grants, info as info_route, insights, installed_extensions, metrics, metrics_json, status,
terminate,
};
use crate::compute::ComputeNode;
async fn handle_404() -> Response {
StatusCode::NOT_FOUND.into_response()
}
const X_REQUEST_ID: &str = "x-request-id";
/// This middleware function allows compute_ctl to generate its own request ID
/// if one isn't supplied. The control plane will always send one as a UUID. The
/// neon Postgres extension on the other hand does not send one.
async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response {
let headers = request.headers_mut();
if headers.get(X_REQUEST_ID).is_none() {
headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap());
}
next.run(request).await
}
/// Run the HTTP server and wait on it forever.
#[tokio::main]
async fn serve(port: u16, compute: Arc<ComputeNode>) {
let mut app = Router::new()
.route("/check_writability", post(check_writability::is_writable))
.route("/configure", post(configure::configure))
.route("/database_schema", get(database_schema::get_schema_dump))
.route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
.route(
"/extension_server/{*filename}",
post(extension_server::download_extension),
)
.route("/extensions", post(extensions::install_extension))
.route("/grants", post(grants::add_grant))
.route("/info", get(info_route::get_info))
.route("/insights", get(insights::get_insights))
.route(
"/installed_extensions",
get(installed_extensions::get_installed_extensions),
)
.route("/metrics", get(metrics::get_metrics))
.route("/metrics.json", get(metrics_json::get_metrics))
.route("/status", get(status::get_status))
.route("/terminate", post(terminate::terminate))
.fallback(handle_404)
.layer(
ServiceBuilder::new()
// Add this middleware first, since the trace layer below assumes the request ID header exists
.layer(middleware::from_fn(maybe_add_request_id_header))
.layer(
TraceLayer::new_for_http()
.on_request(|request: &http::Request<_>, _span: &Span| {
let request_id = request
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
match request.uri().path() {
"/metrics" => {
debug!(%request_id, "{} {}", request.method(), request.uri())
}
_ => info!(%request_id, "{} {}", request.method(), request.uri()),
};
})
.on_response(
|response: &http::Response<_>, latency: Duration, _span: &Span| {
let request_id = response
.headers()
.get(X_REQUEST_ID)
.unwrap()
.to_str()
.unwrap();
info!(
%request_id,
code = response.status().as_u16(),
latency = latency.as_millis()
)
},
),
)
.layer(PropagateRequestIdLayer::x_request_id()),
)
.with_state(compute);
// Add in any testing support
if cfg!(feature = "testing") {
use super::routes::failpoints;
app = app.route("/failpoints", post(failpoints::configure_failpoints))
}
// This usually binds to both IPv4 and IPv6 on Linux, see
// https://github.com/rust-lang/rust/pull/34440 for more information
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let listener = match TcpListener::bind(&addr).await {
Ok(listener) => listener,
Err(e) => {
error!(
"failed to bind the compute_ctl HTTP server to port {}: {}",
port, e
);
return;
}
};
if let Ok(local_addr) = listener.local_addr() {
info!("compute_ctl HTTP server listening on {}", local_addr);
} else {
info!("compute_ctl HTTP server listening on port {}", port);
}
if let Err(e) = axum::serve(listener, app).await {
error!("compute_ctl HTTP server error: {}", e);
}
}
/// Launch a separate HTTP server thread and return its `JoinHandle`.
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state);
Ok(thread::Builder::new()
.name("http-server".into())
.spawn(move || serve(port, state))?)
}
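A minimal caller-side sketch, assuming `ComputeNode` and `launch_http_server` are in scope and a `compute: Arc<ComputeNode>` already exists; the port is a placeholder (the real one comes from compute_ctl's configuration):

use std::sync::Arc;
use anyhow::Result;

fn example_start(compute: Arc<ComputeNode>) -> Result<()> {
    let handle = launch_http_server(3080, &compute)?;
    // Block until the HTTP server thread exits (it normally runs forever).
    handle.join().expect("http-server thread panicked");
    Ok(())
}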

View File

@@ -3,8 +3,6 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
extern crate hyper0 as hyper;
pub mod checker;
pub mod config;
pub mod configurator;

View File

@@ -1,6 +1,6 @@
use anyhow::{Context, Result};
use fail::fail_point;
use postgres::Client;
use postgres::{Client, Transaction};
use tracing::info;
/// Runs a series of migrations on a target database
@@ -20,11 +20,9 @@ impl<'m> MigrationRunner<'m> {
/// Get the current value of neon_migration.migration_id
fn get_migration_id(&mut self) -> Result<i64> {
let query = "SELECT id FROM neon_migration.migration_id";
let row = self
.client
.query_one(query, &[])
.context("run_migrations get migration_id")?;
.query_one("SELECT id FROM neon_migration.migration_id", &[])?;
Ok(row.get::<&str, i64>("id"))
}
@@ -34,7 +32,7 @@ impl<'m> MigrationRunner<'m> {
/// This function has a fail point called compute-migration, which can be
/// used if you would like to fail the application of a series of migrations
/// at some point.
fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
fn update_migration_id(txn: &mut Transaction, migration_id: i64) -> Result<()> {
// We use this fail point in order to check that failing in the
// middle of applying a series of migrations fails in an expected
// manner
@@ -55,12 +53,11 @@ impl<'m> MigrationRunner<'m> {
}
}
self.client
.query(
"UPDATE neon_migration.migration_id SET id = $1",
&[&migration_id],
)
.context("run_migrations update id")?;
txn.query(
"UPDATE neon_migration.migration_id SET id = $1",
&[&migration_id],
)
.with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?;
Ok(())
}
@@ -81,53 +78,50 @@ impl<'m> MigrationRunner<'m> {
Ok(())
}
/// Run the configrured set of migrations
/// Run an individual migration
fn run_migration(txn: &mut Transaction, migration_id: i64, migration: &str) -> Result<()> {
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", migration_id);
// Even though we are skipping the migration, updating the
// migration ID keeps the logic easy to follow when inspecting
// the state of a cluster.
Self::update_migration_id(txn, migration_id)?;
} else {
info!("Running migration id={}:\n{}\n", migration_id, migration);
txn.simple_query(migration)
.with_context(|| format!("apply migration {migration_id}"))?;
Self::update_migration_id(txn, migration_id)?;
}
Ok(())
}
/// Run the configured set of migrations
pub fn run_migrations(mut self) -> Result<()> {
self.prepare_database()?;
self.prepare_database()
.context("prepare database to handle migrations")?;
let mut current_migration = self.get_migration_id()? as usize;
while current_migration < self.migrations.len() {
macro_rules! migration_id {
($cm:expr) => {
($cm + 1) as i64
};
}
// The index lags the migration ID by 1, so the current migration
// ID is also the next index
let migration_id = (current_migration + 1) as i64;
let migration = self.migrations[current_migration];
let mut txn = self
.client
.transaction()
.with_context(|| format!("begin transaction for migration {migration_id}"))?;
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", migration_id!(current_migration));
Self::run_migration(&mut txn, migration_id, self.migrations[current_migration])
.with_context(|| format!("running migration {migration_id}"))?;
// Even though we are skipping the migration, updating the
// migration ID should help keep logic easy to understand when
// trying to understand the state of a cluster.
self.update_migration_id(migration_id!(current_migration))?;
} else {
info!(
"Running migration id={}:\n{}\n",
migration_id!(current_migration),
migration
);
txn.commit()
.with_context(|| format!("commit transaction for migration {migration_id}"))?;
self.client
.simple_query("BEGIN")
.context("begin migration")?;
self.client.simple_query(migration).with_context(|| {
format!(
"run_migrations migration id={}",
migration_id!(current_migration)
)
})?;
self.update_migration_id(migration_id!(current_migration))?;
self.client
.simple_query("COMMIT")
.context("commit migration")?;
info!("Finished migration id={}", migration_id!(current_migration));
}
info!("Finished migration id={}", migration_id);
current_migration += 1;
}
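Because the hunk above interleaves removed and added lines, here is a hedged sketch of the per-migration flow after this change; it mirrors the names in the hunk, but the exact context strings and helper boundaries are an assumption:

use anyhow::{Context, Result};
use postgres::Client;
use tracing::info;

// Sketch: apply one migration and bump the id inside the same transaction,
// so a failure rolls back both the migration and the id update.
fn apply_one_migration(client: &mut Client, migration_id: i64, migration: &str) -> Result<()> {
    let mut txn = client
        .transaction()
        .with_context(|| format!("begin transaction for migration {migration_id}"))?;
    if migration.starts_with("-- SKIP") {
        info!("Skipping migration id={}", migration_id);
    } else {
        info!("Running migration id={}:\n{}\n", migration_id, migration);
        txn.simple_query(migration)
            .with_context(|| format!("apply migration {migration_id}"))?;
    }
    txn.query(
        "UPDATE neon_migration.migration_id SET id = $1",
        &[&migration_id],
    )
    .with_context(|| format!("update neon_migration.migration_id to {migration_id}"))?;
    txn.commit()
        .with_context(|| format!("commit transaction for migration {migration_id}"))?;
    info!("Finished migration id={}", migration_id);
    Ok(())
}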

View File

@@ -47,6 +47,7 @@ pub enum PerDatabasePhase {
DeleteDBRoleReferences,
ChangeSchemaPerms,
HandleAnonExtension,
DropLogicalSubscriptions,
}
#[derive(Clone, Debug)]
@@ -57,11 +58,13 @@ pub enum ApplySpecPhase {
CreateAndAlterRoles,
RenameAndDeleteDatabases,
CreateAndAlterDatabases,
CreateSchemaNeon,
RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
HandleOtherExtensions,
HandleNeonExtension,
CreateAvailabilityCheck,
DropRoles,
FinalizeDropLogicalSubscriptions,
}
pub struct Operation {
@@ -74,7 +77,7 @@ pub struct MutableApplyContext {
pub dbs: HashMap<String, Database>,
}
/// Appply the operations that belong to the given spec apply phase.
/// Apply the operations that belong to the given spec apply phase.
///
/// Commands within a single phase are executed in order of Iterator yield.
/// Commands of ApplySpecPhase::RunInEachDatabase will execute in the database
@@ -326,13 +329,12 @@ async fn get_operations<'a>(
// Use FORCE to drop database even if there are active connections.
// We run this from `cloud_admin`, so it should have enough privileges.
//
// NB: there could be other db states, which prevent us from dropping
// the database. For example, if db is used by any active subscription
// or replication slot.
// TODO: deal with it once we allow logical replication. Proper fix should
// involve returning an error code to the control plane, so it could
// figure out that this is a non-retryable error, return it to the user
// and fail operation permanently.
// Such cases are handled in the DropLogicalSubscriptions
// phase. We do all the cleanup before actually dropping the database.
let drop_db_query: String = format!(
"DROP DATABASE IF EXISTS {} WITH (FORCE)",
&op.name.pg_quote()
@@ -442,8 +444,38 @@ async fn get_operations<'a>(
Ok(Box::new(operations))
}
ApplySpecPhase::CreateSchemaNeon => Ok(Box::new(once(Operation {
query: String::from("CREATE SCHEMA IF NOT EXISTS neon"),
comment: Some(String::from(
"create schema for neon extension and utils tables",
)),
}))),
ApplySpecPhase::RunInEachDatabase { db, subphase } => {
match subphase {
PerDatabasePhase::DropLogicalSubscriptions => {
match &db {
DB::UserDB(db) => {
let drop_subscription_query: String = format!(
include_str!("sql/drop_subscriptions.sql"),
datname_str = escape_literal(&db.name),
);
let operations = vec![Operation {
query: drop_subscription_query,
comment: Some(format!(
"optionally dropping subscriptions for DB {}",
db.name,
)),
}]
.into_iter();
Ok(Box::new(operations))
}
// skip this cleanup for the system databases
// because users can't drop them
DB::SystemDB => Ok(Box::new(empty())),
}
}
PerDatabasePhase::DeleteDBRoleReferences => {
let ctx = ctx.read().await;
@@ -474,7 +506,19 @@ async fn get_operations<'a>(
),
comment: None,
},
// Revoke some potentially blocking privileges (Neon-specific currently)
Operation {
query: format!(
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
role_name = quoted,
),
comment: None,
},
// This will now only drop privileges of the role.
// TODO: this is obviously not 100% true because of the above case;
// there could still be some privileges that are not revoked. Maybe this
// only drops privileges that were granted *by this* role, not *to this* role,
// but this has to be checked.
Operation {
query: format!("DROP OWNED BY {}", quoted),
comment: None,
@@ -630,10 +674,6 @@ async fn get_operations<'a>(
}
ApplySpecPhase::HandleNeonExtension => {
let operations = vec![
Operation {
query: String::from("CREATE SCHEMA IF NOT EXISTS neon"),
comment: Some(String::from("init: add schema for extension")),
},
Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon"),
comment: Some(String::from(
@@ -676,5 +716,9 @@ async fn get_operations<'a>(
Ok(Box::new(operations))
}
ApplySpecPhase::FinalizeDropLogicalSubscriptions => Ok(Box::new(once(Operation {
query: String::from(include_str!("sql/finalize_drop_subscriptions.sql")),
comment: None,
}))),
}
}

View File

@@ -0,0 +1,11 @@
DO $$
DECLARE
subname TEXT;
BEGIN
FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
$$;

View File

@@ -0,0 +1,21 @@
DO $$
BEGIN
IF NOT EXISTS(
SELECT 1
FROM pg_catalog.pg_tables
WHERE tablename = 'drop_subscriptions_done'
AND schemaname = 'neon'
)
THEN
CREATE TABLE neon.drop_subscriptions_done
(id serial primary key, timeline_id text);
END IF;
-- preserve the timeline_id of the last drop_subscriptions run
-- to ensure that the cleanup of a timeline is executed only once.
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch)
INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id'))
ON CONFLICT (id) DO UPDATE
SET timeline_id = current_setting('neon.timeline_id');
END
$$

View File

@@ -0,0 +1,28 @@
SET SESSION ROLE neon_superuser;
DO $$
DECLARE
schema TEXT;
revoke_query TEXT;
BEGIN
FOR schema IN
SELECT schema_name
FROM information_schema.schemata
-- So far, we only had issues with 'public' schema. Probably, because we do some additional grants,
-- e.g., make DB owner the owner of 'public' schema automatically (when created via API).
-- See https://github.com/neondatabase/cloud/issues/13582 for the context.
-- Still, keep the loop because i) it efficiently handles the case when there is no 'public' schema,
-- ii) it's easy to add more schemas to the list if needed.
WHERE schema_name IN ('public')
LOOP
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
schema
);
EXECUTE revoke_query;
END LOOP;
END;
$$;
RESET ROLE;

View File

@@ -1,6 +1,10 @@
# Control Plane and Neon Local
# Local Development Control Plane (`neon_local`)
This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command, a convenience wrapper that invokes
the `neon_local` binary.
**Note**: this is a dev/test tool -- a minimal control plane suitable for testing
code changes locally, but not suitable for running production systems.
## Example: Start with Postgres 16

View File

@@ -1357,6 +1357,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
args.pg_version,
mode,
!args.update_catalog,
false,
)?;
}
EndpointCmd::Start(args) => {

View File

@@ -62,7 +62,7 @@ use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;
use crate::storage_controller::StorageController;
use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
// contents of a endpoint.json file
@@ -76,6 +76,7 @@ pub struct EndpointConf {
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
drop_subscriptions_before_start: bool,
features: Vec<ComputeFeature>,
}
@@ -143,6 +144,7 @@ impl ComputeControlPlane {
pg_version: u32,
mode: ComputeMode,
skip_pg_catalog_updates: bool,
drop_subscriptions_before_start: bool,
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
@@ -162,6 +164,7 @@ impl ComputeControlPlane {
// with this we basically test a case of waking up an idle compute, where
// we also skip catalog updates in the cloud.
skip_pg_catalog_updates,
drop_subscriptions_before_start,
features: vec![],
});
@@ -177,6 +180,7 @@ impl ComputeControlPlane {
pg_port,
pg_version,
skip_pg_catalog_updates,
drop_subscriptions_before_start,
features: vec![],
})?,
)?;
@@ -240,6 +244,7 @@ pub struct Endpoint {
// Optimizations
skip_pg_catalog_updates: bool,
drop_subscriptions_before_start: bool,
// Feature flags
features: Vec<ComputeFeature>,
}
@@ -291,6 +296,7 @@ impl Endpoint {
tenant_id: conf.tenant_id,
pg_version: conf.pg_version,
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
features: conf.features,
})
}
@@ -625,6 +631,7 @@ impl Endpoint {
shard_stripe_size: Some(shard_stripe_size),
local_proxy_config: None,
reconfigure_concurrency: 1,
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -739,7 +746,7 @@ impl Endpoint {
}
// Call the /status HTTP API
pub async fn get_status(&self) -> Result<ComputeState> {
pub async fn get_status(&self) -> Result<ComputeStatusResponse> {
let client = reqwest::Client::new();
let response = client

View File

@@ -483,7 +483,6 @@ impl LocalEnv {
.iter()
.find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
.map(|&(_, timeline_id)| timeline_id)
.map(TimelineId::from)
}
pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {

View File

@@ -352,6 +352,21 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'compaction_algorithm' json")?,
l0_flush_delay_threshold: settings
.remove("l0_flush_delay_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'l0_flush_delay_threshold' as an integer")?,
l0_flush_wait_upload: settings
.remove("l0_flush_wait_upload")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'l0_flush_wait_upload' as a boolean")?,
l0_flush_stall_threshold: settings
.remove("l0_flush_stall_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'l0_flush_stall_threshold' as an integer")?,
gc_horizon: settings
.remove("gc_horizon")
.map(|x| x.parse::<u64>())
@@ -418,6 +433,26 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `wal_receiver_protocol_override` from json")?,
rel_size_v2_enabled: settings
.remove("rel_size_v2_enabled")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'rel_size_v2_enabled' as bool")?,
gc_compaction_enabled: settings
.remove("gc_compaction_enabled")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_compaction_enabled' as bool")?,
gc_compaction_initial_threshold_kb: settings
.remove("gc_compaction_initial_threshold_kb")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'gc_compaction_initial_threshold_kb' as integer")?,
gc_compaction_ratio_percent: settings
.remove("gc_compaction_ratio_percent")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'gc_compaction_ratio_percent' as integer")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")

View File

@@ -822,10 +822,7 @@ impl StorageController {
self.dispatch(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(TenantShardMigrateRequest {
tenant_shard_id,
node_id,
}),
Some(TenantShardMigrateRequest { node_id }),
)
.await
}

View File

@@ -1,12 +1,17 @@
use futures::StreamExt;
use std::{str::FromStr, time::Duration};
use std::{
collections::{HashMap, HashSet},
str::FromStr,
time::Duration,
};
use clap::{Parser, Subcommand};
use pageserver_api::{
controller_api::{
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
SafekeeperDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest,
TenantDescribeResponse, TenantPolicyRequest,
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse,
TenantPolicyRequest,
},
models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -112,6 +117,13 @@ enum Command {
#[arg(long)]
node: NodeId,
},
/// Migrate the secondary location for a tenant shard to a specific pageserver.
TenantShardMigrateSecondary {
#[arg(long)]
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
},
/// Cancel any ongoing reconciliation for this shard
TenantShardCancelReconcile {
#[arg(long)]
@@ -146,6 +158,12 @@ enum Command {
#[arg(long)]
tenant_id: TenantId,
},
TenantSetPreferredAz {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
preferred_az: Option<String>,
},
/// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
/// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
TenantDrop {
@@ -214,6 +232,13 @@ enum Command {
},
/// List safekeepers known to the storage controller
Safekeepers {},
/// Set the scheduling policy of the specified safekeeper
SafekeeperScheduling {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
scheduling_policy: SkSchedulingPolicyArg,
},
}
#[derive(Parser)]
@@ -266,6 +291,17 @@ impl FromStr for PlacementPolicyArg {
}
}
#[derive(Debug, Clone)]
struct SkSchedulingPolicyArg(SkSchedulingPolicy);
impl FromStr for SkSchedulingPolicyArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
SkSchedulingPolicy::from_str(s).map(Self)
}
}
#[derive(Debug, Clone)]
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
@@ -395,11 +431,12 @@ async fn main() -> anyhow::Result<()> {
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
let mut table = comfy_table::Table::new();
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
table.set_header(["Id", "Hostname", "AZ", "Scheduling", "Availability"]);
for node in resp {
table.add_row([
format!("{}", node.id),
node.listen_http_addr,
node.availability_zone_id,
format!("{:?}", node.scheduling),
format!("{:?}", node.availability),
]);
@@ -459,33 +496,65 @@ async fn main() -> anyhow::Result<()> {
println!("{table}");
}
Command::Tenants { node_id: None } => {
let mut resp = storcon_client
.dispatch::<(), Vec<TenantDescribeResponse>>(
Method::GET,
"control/v1/tenant".to_string(),
None,
)
.await?;
resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
// Set up output formatting
let mut table = comfy_table::Table::new();
table.set_header([
"TenantId",
"Preferred AZ",
"ShardCount",
"StripeSize",
"Placement",
"Scheduling",
]);
for tenant in resp {
let shard_zero = tenant.shards.into_iter().next().unwrap();
table.add_row([
format!("{}", tenant.tenant_id),
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
format!("{:?}", tenant.stripe_size),
format!("{:?}", tenant.policy),
format!("{:?}", shard_zero.scheduling_policy),
]);
// Pagination loop over listing API
let mut start_after = None;
const LIMIT: usize = 1000;
loop {
let path = match start_after {
None => format!("control/v1/tenant?limit={LIMIT}"),
Some(start_after) => {
format!("control/v1/tenant?limit={LIMIT}&start_after={start_after}")
}
};
let resp = storcon_client
.dispatch::<(), Vec<TenantDescribeResponse>>(Method::GET, path, None)
.await?;
if resp.is_empty() {
// End of data reached
break;
}
// Give some visual feedback while we're building up the table (comfy_table doesn't have
// streaming output)
if resp.len() >= LIMIT {
eprint!(".");
}
start_after = Some(resp.last().unwrap().tenant_id);
for tenant in resp {
let shard_zero = tenant.shards.into_iter().next().unwrap();
table.add_row([
format!("{}", tenant.tenant_id),
shard_zero
.preferred_az_id
.as_ref()
.cloned()
.unwrap_or("".to_string()),
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
format!("{:?}", tenant.stripe_size),
format!("{:?}", tenant.policy),
format!("{:?}", shard_zero.scheduling_policy),
]);
}
}
// Terminate progress dots
if table.row_count() > LIMIT {
eprint!("");
}
println!("{table}");
@@ -540,10 +609,7 @@ async fn main() -> anyhow::Result<()> {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest {
tenant_shard_id,
node_id: node,
};
let req = TenantShardMigrateRequest { node_id: node };
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -553,6 +619,20 @@ async fn main() -> anyhow::Result<()> {
)
.await?;
}
Command::TenantShardMigrateSecondary {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest { node_id: node };
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate_secondary"),
Some(req),
)
.await?;
}
Command::TenantShardCancelReconcile { tenant_shard_id } => {
storcon_client
.dispatch::<(), ()>(
@@ -596,6 +676,19 @@ async fn main() -> anyhow::Result<()> {
None,
)
.await?;
let nodes = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
let nodes = nodes
.into_iter()
.map(|n| (n.id, n))
.collect::<HashMap<_, _>>();
println!("Tenant {tenant_id}");
let mut table = comfy_table::Table::new();
table.add_row(["Policy", &format!("{:?}", policy)]);
@@ -604,7 +697,14 @@ async fn main() -> anyhow::Result<()> {
println!("{table}");
println!("Shards:");
let mut table = comfy_table::Table::new();
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
table.set_header([
"Shard",
"Attached",
"Attached AZ",
"Secondary",
"Last error",
"status",
]);
for shard in shards {
let secondary = shard
.node_secondary
@@ -627,11 +727,18 @@ async fn main() -> anyhow::Result<()> {
}
let status = status_parts.join(",");
let attached_node = shard
.node_attached
.as_ref()
.map(|id| nodes.get(id).expect("Shard references nonexistent node"));
table.add_row([
format!("{}", shard.tenant_shard_id),
shard
.node_attached
.map(|n| format!("{}", n))
attached_node
.map(|n| format!("{} ({})", n.listen_http_addr, n.id))
.unwrap_or(String::new()),
attached_node
.map(|n| n.availability_zone_id.clone())
.unwrap_or(String::new()),
secondary,
shard.last_error,
@@ -640,6 +747,66 @@ async fn main() -> anyhow::Result<()> {
}
println!("{table}");
}
Command::TenantSetPreferredAz {
tenant_id,
preferred_az,
} => {
// First learn about the tenant's shards
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
// Learn about nodes to validate the AZ ID
let nodes = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
if let Some(preferred_az) = &preferred_az {
let azs = nodes
.into_iter()
.map(|n| (n.availability_zone_id))
.collect::<HashSet<_>>();
if !azs.contains(preferred_az) {
anyhow::bail!(
"AZ {} not found on any node: known AZs are: {:?}",
preferred_az,
azs
);
}
} else {
// Make it obvious to the user that since they've omitted an AZ, we're clearing it
eprintln!("Clearing preferred AZ for tenant {}", tenant_id);
}
// Construct a request that modifies all the tenant's shards
let req = ShardsPreferredAzsRequest {
preferred_az_ids: describe_response
.shards
.into_iter()
.map(|s| {
(
s.tenant_shard_id,
preferred_az.clone().map(AvailabilityZone),
)
})
.collect(),
};
storcon_client
.dispatch::<ShardsPreferredAzsRequest, ()>(
Method::PUT,
"control/v1/preferred_azs".to_string(),
Some(req),
)
.await?;
}
Command::TenantWarmup { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
@@ -915,10 +1082,7 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest {
tenant_shard_id: mv.tenant_shard_id,
node_id: mv.to,
}),
Some(TenantShardMigrateRequest { node_id: mv.to }),
)
.await
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
@@ -1035,7 +1199,15 @@ async fn main() -> anyhow::Result<()> {
resp.sort_by(|a, b| a.id.cmp(&b.id));
let mut table = comfy_table::Table::new();
table.set_header(["Id", "Version", "Host", "Port", "Http Port", "AZ Id"]);
table.set_header([
"Id",
"Version",
"Host",
"Port",
"Http Port",
"AZ Id",
"Scheduling",
]);
for sk in resp {
table.add_row([
format!("{}", sk.id),
@@ -1043,11 +1215,29 @@ async fn main() -> anyhow::Result<()> {
sk.host,
format!("{}", sk.port),
format!("{}", sk.http_port),
sk.availability_zone_id.to_string(),
sk.availability_zone_id.clone(),
String::from(sk.scheduling_policy),
]);
}
println!("{table}");
}
Command::SafekeeperScheduling {
node_id,
scheduling_policy,
} => {
let scheduling_policy = scheduling_policy.0;
storcon_client
.dispatch::<SafekeeperSchedulingPolicyRequest, ()>(
Method::POST,
format!("control/v1/safekeeper/{node_id}/scheduling_policy"),
Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }),
)
.await?;
println!(
"Scheduling policy of {node_id} set to {}",
String::from(scheduling_policy)
);
}
}
Ok(())

View File

@@ -41,8 +41,8 @@ allow = [
"MIT",
"MPL-2.0",
"OpenSSL",
"Unicode-DFS-2016",
"Unicode-3.0",
"Zlib",
]
confidence-threshold = 0.8
exceptions = [

View File

@@ -10,10 +10,7 @@ USER root
RUN apt-get update && \
apt-get install -y curl \
jq \
python3-pip \
netcat-openbsd
#Faker is required for the pg_anon test
RUN case $COMPUTE_IMAGE in compute-node-v17) OPT="--break-system-packages";; *) OPT= ;; esac && pip3 install $OPT Faker
#This is required for the pg_hintplan test
RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src

View File

@@ -150,8 +150,8 @@ services:
- REPOSITORY=${REPOSITORY:-neondatabase}
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16}
- TAG=${TAG:-latest}
- http_proxy=$http_proxy
- https_proxy=$https_proxy
- http_proxy=${http_proxy:-}
- https_proxy=${https_proxy:-}
environment:
- PG_VERSION=${PG_VERSION:-16}
#- RUST_BACKTRACE=1
@@ -185,6 +185,8 @@ services:
neon-test-extensions:
profiles: ["test-extensions"]
image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
environment:
- PGPASSWORD=cloud_admin
entrypoint:
- "/bin/bash"
- "-c"

View File

@@ -18,14 +18,10 @@ cd $(dirname $0)
COMPUTE_CONTAINER_NAME=docker-compose-compute-1
TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
: ${http_proxy:=}
: ${https_proxy:=}
export http_proxy https_proxy
cleanup() {
echo "show container information"
docker ps
docker compose --profile test-extensions -f $COMPOSE_FILE logs
echo "stop containers..."
docker compose --profile test-extensions -f $COMPOSE_FILE down
}
@@ -35,12 +31,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
echo "clean up containers if exists"
cleanup
PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
# The support of pg_anon not yet added to PG17, so we have to add the corresponding option for other PG versions
if [ "${pg_version}" -ne 17 ]; then
SPEC_PATH="compute_wrapper/var/db/postgres/specs"
mv $SPEC_PATH/spec.json $SPEC_PATH/spec.bak
jq '.cluster.settings += [{"name": "session_preload_libraries","value": "anon","vartype": "string"}]' "${SPEC_PATH}/spec.bak" > "${SPEC_PATH}/spec.json"
fi
PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d
echo "wait until the compute is ready. timeout after 60s. "
@@ -50,7 +40,6 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
cnt=`expr $cnt + 3`
if [ $cnt -gt 60 ]; then
echo "timeout before the compute is ready."
cleanup
exit 1
fi
if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
@@ -62,36 +51,19 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
done
if [ $pg_version -ge 16 ]; then
echo Enabling trust connection
docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
echo Adding postgres role
docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN"
# This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
echo Adding dummy config
docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf
# This block is required for the pg_anon extension test.
# The test assumes that it is running on the same host with the postgres engine.
# In our case it's not true, that's why we are copying files to the compute node
# The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
TMPDIR=$(mktemp -d)
# Add support for pg_anon for pg_v16
if [ $pg_version -ne 17 ]; then
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
rm -rf $TMPDIR
fi
TMPDIR=$(mktemp -d)
# The following block does the same for the pg_hintplan test
docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
rm -rf $TMPDIR
# We are running tests now
if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
if ! docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
then
cleanup
else
FAILED=$(tail -1 testout.txt)
for d in $FAILED
do
@@ -101,13 +73,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
cat $d/regression.out $d/regression.diffs || true
done
rm -rf $FAILED
cleanup
exit 1
fi
fi
cleanup
# Restore the original spec.json
if [ "$pg_version" -ne 17 ]; then
mv "$SPEC_PATH/spec.bak" "$SPEC_PATH/spec.json"
fi
done

View File

@@ -7,7 +7,10 @@ LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
for d in ${LIST}
do
[ -d "${d}" ] || continue
psql -c "select 1" >/dev/null || break
if ! psql -w -c "select 1" >/dev/null; then
FAILED="${d} ${FAILED}"
break
fi
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
done
[ -z "${FAILED}" ] && exit 0

View File

@@ -7,15 +7,11 @@ Currently we build two main images:
- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile).
And additional intermediate image:
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
## Build pipeline
We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs
1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)
1. `neondatabase/compute-node-v17` (and -v16, -v15, -v14)
2. `neondatabase/neon`

View File

@@ -81,7 +81,7 @@ configuration generation in them is less than its current one. Namely, it
refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
response it sends its current configuration generation to let walproposer know.
Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
current one and ignores it otherwise. In any case it replies with
```
@@ -103,7 +103,7 @@ currently and tries to communicate with all of them. However, the list does not
define consensus members. Instead, on start walproposer tracks highest
configuration it receives from `AcceptorGreeting`s. Once it assembles greetings
from majority of `sk_set` and majority of `new_sk_set` (if it is present), it
establishes this configuration as its own and moves to voting.
It should stop talking to safekeepers not listed in the configuration at this
point, though it is not unsafe to continue doing so.
@@ -119,7 +119,7 @@ refusal to accept due to configuration change) it simply restarts.
The following algorithm can be executed anywhere having access to configuration
storage and safekeepers. It is safe to interrupt / restart it and run multiple
instances of it concurrently, though likely one of them won't make
progress then. It accepts `desired_set: Vec<NodeId>` as input.
Algorithm will refuse to make the change if it encounters previous interrupted
change attempt, but in this case it will try to finish it.
@@ -140,7 +140,7 @@ storage are reachable.
safe. Failed CAS aborts the procedure.
4) Call `PUT` `configuration` on safekeepers from the current set,
delivering them `joint_conf`. Collecting responses from majority is required
to proceed. If any response returned generation higher than
`joint_conf.generation`, abort (another switch raced us). Otherwise, choose
max `<last_log_term, flush_lsn>` among responses and establish it as
(in memory) `sync_position`. Also choose max `term` and establish it as (in
@@ -149,49 +149,49 @@ storage are reachable.
without ack from the new set. Similarly, we'll bump term on new majority
to `sync_term` so that two computes with the same term are never elected.
4) Initialize timeline on safekeeper(s) from `new_sk_set` where it
doesn't exist yet by doing `pull_timeline` from the majority of the
current set. Doing that on majority of `new_sk_set` is enough to
proceed, but it is reasonable to ensure that all `new_sk_set` members
are initialized -- if some of them are down why are we migrating there?
5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set.
Success on majority is enough.
6) Repeatedly call `PUT` `configuration` on safekeepers from the new set,
delivering them `joint_conf` and collecting their positions. This will
switch them to the `joint_conf` which generally won't be needed
because `pull_timeline` already includes it and plus additionally would be
broadcast by compute. More importantly, we may proceed to the next step
only when `<last_log_term, flush_lsn>` on the majority of the new set reached
`sync_position`. Similarly, on the happy path no waiting is needed because
`pull_timeline` already includes it. However, we should double
check to be safe. For example, timeline could have been created earlier e.g.
manually or after try-to-migrate, abort, try-to-migrate-again sequence.
7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
storage under one more CAS.
8) Call `PUT` `configuration` on safekeepers from the new set,
delivering them `new_conf`. It is enough to deliver it to the majority
of the new set; the rest can be updated by compute.
I haven't put huge effort into making the description above very precise, because
natural language is prone to interpretations anyway. Instead I'd like to make a TLA+
spec of it.
The description above focuses on safety. To make the flow practical and live, here are a few more
considerations.
1) It makes sense to ping the new set to ensure we are migrating to live node(s) before
step 3.
2) If e.g. a wrong new sk set has been accidentally specified, before CAS in step `6` is completed
it is safe to rollback to the old conf with one more CAS.
3) On step 4 the timeline might already be created on members of the new set for various reasons;
the simplest is the procedure restart. There are more complicated scenarios like the one mentioned
in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
generations, so it seems simpler to treat existing timeline as success. However, this also
has a disadvantage: you might imagine an exceedingly unlikely schedule where the condition in
step 5 is never reached until compute is (re)awakened to synchronize new member(s).
I don't think we'll observe this in practice, but can add waking up compute if needed.
4) In the end timeline should be locally deleted on the safekeeper(s) which are
in the old set but not in the new one, unless they are unreachable. To be
safe this also should be done under generation number (deletion proceeds only if
current configuration is <= the one in the request and safekeeper is not a member of it).
5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
jump to step 7, using it as `new_conf`.
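For concreteness, here is a minimal sketch of what such a configuration record could look like; the names and types are assumptions for illustration, not the final code.
```rust
/// Illustrative only: the configuration record the algorithm above reads and
/// CAS-updates in configuration storage. Names/types are assumptions.
pub struct NodeId(pub u64);

pub struct Configuration {
    /// Bumped on every change written to configuration storage under CAS.
    pub generation: u32,
    /// Current safekeeper member set.
    pub sk_set: Vec<NodeId>,
    /// Present only while a migration is in flight; Some(..) makes this a joint
    /// configuration requiring majorities of both sets.
    pub new_sk_set: Option<Vec<NodeId>>,
}

impl Configuration {
    /// Joint while the new set is present (roughly steps 3-6 above).
    pub fn is_joint(&self) -> bool {
        self.new_sk_set.is_some()
    }
}
```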
@@ -202,47 +202,87 @@ The procedure ought to be driven from somewhere. Obvious candidates are control
plane and storage_controller; and as each of them already has db we don't want
yet another storage. I propose to manage safekeepers in storage_controller
because 1) since it is in rust it simplifies simulation testing (more on this
below) 2) it already manages pageservers.
This assumes that migration will be fully usable only after we migrate all
tenants/timelines to storage_controller. It is discussible whether we want also
to manage pageserver attachments for all of these, but likely we do.
This requires us to define storcon <-> cplane interface.
This requires us to define storcon <-> cplane interface and changes.
### storage_controller <-> control plane interface
### storage_controller <-> control plane interface and changes
First of all, control plane should
[change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829)
storing safekeepers per timeline instead of per tenant because we can't migrate
tenants atomically.
The important question is how updated configuration is delivered from
storage_controller to control plane to provide it to computes. As always, there
are two options, pull and push. Let's do it the same push as with pageserver
`/notify-attach` because 1) it keeps storage_controller out of critical compute
start path 2) provides easier upgrade: there won't be such a thing as 'timeline
managed by control plane / storcon', cplane just takes the value out of its db
when needed 3) uniformity. It makes storage_controller responsible for retrying notifying
control plane until it succeeds.
start path 2) uniformity. It makes storage_controller responsible for retrying
notifying control plane until it succeeds.
So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and
updates it in the db if the provided conf generation is higher (the cplane db
should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
should update db which makes the call successful, and then try to schedule
`apply_config` if possible, it is ok if not. storage_controller
should rate limit calling the endpoint, but likely this won't be needed, as migration
It is not needed for the control plane to fully know the `Configuration`. It is
enough for it to only be aware of the list of safekeepers in the latest
configuration to supply it to compute, plus associated generation number to
protect from stale update requests and to also pass it to compute.
So, cplane `/notify-safekeepers` for the timeline can accept JSON like
```
{
tenant_id: String,
timeline_id: String,
generation: u32,
safekeepers: Vec<SafekeeperId>,
}
```
where `SafekeeperId` is
```
{
node_id: u64,
host: String
}
```
In principle `host` is redundant, but may be useful for observability.
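A minimal sketch of how this body could be modeled on the storage_controller side, assuming serde derives; the Rust type names are illustrative, while the field names follow the JSON above.
```rust
use serde::{Deserialize, Serialize};

/// Sketch of the `/notify-safekeepers` body shown above (illustrative types).
#[derive(Serialize, Deserialize)]
pub struct SafekeeperId {
    pub node_id: u64,
    /// Redundant given node_id, but useful for observability.
    pub host: String,
}

#[derive(Serialize, Deserialize)]
pub struct NotifySafekeepersRequest {
    pub tenant_id: String,
    pub timeline_id: String,
    /// Applied by cplane only if higher than the generation stored in its db.
    pub generation: u32,
    pub safekeepers: Vec<SafekeeperId>,
}
```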
The request updates list of safekeepers in the db if the provided conf
generation is higher (the cplane db should also store generations for this).
Similarly to
[`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365),
it should update db which makes the call successful, and then try to schedule
`apply_config` if possible, it is ok if not. storage_controller should rate
limit calling the endpoint, but likely this won't be needed, as migration
throughput is limited by `pull_timeline`.
Timeline (branch) creation in cplane should call storage_controller POST
`tenant/:tenant_id/timeline` like it currently does for sharded tenants.
Response should be augmented with `safekeeper_conf: Configuration`. The call
should be retried until succeeds.
Response should be augmented with `safekeepers_generation` and `safekeepers`
fields like described in `/notify-safekeepers` above. Initially (currently)
these fields may be absent; in this case cplane chooses safekeepers on its own
like it currently does. The call should be retried until succeeds.
Timeline deletion and tenant deletion in cplane should call appropriate
storage_controller endpoints like it currently does for sharded tenants. The
calls should be retried until they succeed.
When compute receives safekeepers list from control plane it needs to know the
generation to check whether it should be updated (note that compute may get
safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
GUC is just a comma-separated list of `host:port`. Let's prefix it with
`g#<generation>:` to this end, so it will look like
```
g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401
```
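A small sketch of how a consumer could split off this prefix (illustrative only; walproposer itself is C code). A missing prefix is treated as generation 0, i.e. generations disabled.
```rust
/// Parse `neon.safekeepers` of the form `g#<generation>:host:port,host:port,...`.
/// Without the `g#<generation>:` prefix, generation 0 (disabled) is assumed.
fn parse_safekeepers_guc(value: &str) -> Result<(u32, Vec<&str>), String> {
    let (generation, rest) = match value.strip_prefix("g#") {
        Some(rest) => {
            let (gen_str, hosts) = rest
                .split_once(':')
                .ok_or("expected ':' after g#<generation>")?;
            let generation = gen_str
                .parse::<u32>()
                .map_err(|e| format!("bad generation: {e}"))?;
            (generation, hosts)
        }
        None => (0, value),
    };
    let safekeepers = rest.split(',').filter(|s| !s.is_empty()).collect();
    Ok((generation, safekeepers))
}
```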
To summarize, list of cplane changes:
- per tenant -> per timeline safekeepers management and addition of int `safekeeper_generation` field.
- `/notify-safekeepers` endpoint.
- Branch creation call may return list of safekeepers and when it is
present cplane should adopt it instead of choosing on its own like it does currently.
- `neon.safekeepers` GUC should be prefixed with `g#<generation>:`.
### storage_controller implementation
Current 'load everything on startup and keep in memory' easy design is fine.
@@ -360,10 +400,10 @@ source safekeeper might fail, which is not a problem if we are going to
decommission the node but leaves garbage otherwise. I'd propose in the first version
1) Don't attempt deletion at all if node status is `offline`.
2) If it failed, just issue warning.
And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and
remove garbage timelines for manual use. It will 1) list all timelines on the
safekeeper 2) compare each one against configuration storage: if timeline
doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can
be deleted under generation number if node is not member of current generation.
Automating this is nontrivial; we'd need to register all potential missing
@@ -412,8 +452,8 @@ There should be following layers of tests:
3) Since simulation testing injects at relatively high level points (not
syscalls), it omits some code, in particular `pull_timeline`. Thus it is
better to have basic tests covering whole system as well. Extended version of
`test_restarts_under_load` would do: start background load and do migration
under it, then restart endpoint and check that no reported commits
had been lost. I'd also add one more creating classic network split scenario, with
one compute talking to AC and another to BD while migration from nodes ABC to ABD
happens.
@@ -422,35 +462,51 @@ There should be following layers of tests:
## Order of implementation and rollout
Note that
- Control plane parts and integration with it is fully independent from everything else
(tests would use simulation and neon_local).
- It is reasonable to make compute <-> safekeepers protocol change
independent of enabling generations.
- There is a lot of infra work making storage_controller aware of timelines and safekeepers
and its impl/rollout should be separate from migration itself.
- Initially walproposer can just stop working while it observers joint configuration.
- Initially walproposer can just stop working while it observes joint configuration.
Such window would be typically very short anyway.
- Obviously we want to test the whole thing thoroughly on staging and only then
gradually enable in prod.
To rollout smoothly, both walproposer and safekeeper should have flag
`configurations_enabled`; when set to false, they would work as currently, i.e.
walproposer is able to commit on whatever safekeeper set it is provided. Until
all timelines are managed by storcon we'd need to use current script to migrate
and update/drop entries in the storage_controller database if it has any.
Let's have the following implementation bits for gradual rollout:
- compute gets `neon.safekeepers_proto_version` flag.
Initially both compute and safekeepers will be able to talk both
versions so that we can delay force restart of them and for
simplicity of rollback in case it is needed.
- storcon gets `--set-safekeepers` config option disabled by
default. Timeline creation request chooses safekeepers
(and returns them in response to cplane) only when it is set to
true.
- control_plane [see above](storage_controller-<->-control-plane-interface-and-changes)
prefixes `neon.safekeepers` GUC with generation number. When it is 0
(or prefix not present at all), walproposer behaves as currently, committing on
the provided safekeeper list -- generations are disabled.
If it is non-zero it follows this RFC's rules.
- We provide a script for manual migration to storage controller.
It selects timeline(s) from control plane (specified or all of them) db
and calls special import endpoint on storage controller which is very
similar to timeline creation: it inserts into the db, sets
configuration to initial on the safekeepers, calls cplane
`notify-safekeepers`.
Safekeepers would need to be able to talk both current and new protocol version
with compute to reduce number of computes restarted in prod once v2 protocol is
deployed (though before completely switching we'd need to force this).
Let's have the following rollout order:
- storage_controller becomes aware of safekeepers;
- storage_controller gets timeline creation for new timelines and deletion requests, but
doesn't manage all timelines yet. Migration can be tested on these new timelines.
To keep control plane and storage_controller databases in sync while control
plane still chooses the safekeepers initially (until all timelines are imported
it can choose better), `TimelineCreateRequest` can get optional safekeepers
field with safekeepers chosen by cplane.
- Then we can import all existing timelines from control plane to
storage_controller and gradually enable configurations region by region.
Then the rollout for a region would be:
- Current situation: safekeepers are chosen by control_plane.
- We manually migrate some timelines, test moving them around.
- Then we enable `--set-safekeepers` so that all new timelines
are on storage controller.
- Finally migrate all existing timelines using the script (no
compute should be speaking old proto version at this point).
Until all timelines are managed by storcon we'd need to use current ad hoc
script to migrate if needed. To keep state clean, all storage controller managed
timelines must be migrated before that, or controller db and configurations
state of safekeepers dropped manually.
Very rough implementation order:
- Add concept of configurations to safekeepers (including control file),
@@ -458,10 +514,10 @@ Very rough implementation order:
- Implement walproposer changes, including protocol.
- Implement storcon part. Use it in neon_local (and pytest).
- Make cplane store safekeepers per timeline instead of per tenant.
- Implement cplane/storcon integration. Route branch creation/deletion
through storcon. Then we can test migration of new branches.
- Finally import existing branches. Then we can drop cplane
safekeeper selection code. Gradually enable configurations at
computes and safekeepers. Before that, all computes must talk only
v3 protocol version.

247
docs/rfcs/040-profiling.md Normal file
View File

@@ -0,0 +1,247 @@
# CPU and Memory Profiling
Created 2025-01-12 by Erik Grinaker.
See also [internal user guide](https://www.notion.so/neondatabase/Storage-CPU-Memory-Profiling-14bf189e004780228ec7d04442742324?pvs=4).
## Summary
This document proposes a standard cross-team pattern for CPU and memory profiling across
applications and languages, using the [pprof](https://github.com/google/pprof) profile format.
It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via
[Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/).
Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations.
## Motivation
CPU and memory profiles are crucial observability tools for understanding performance issues,
resource exhaustion, and resource costs. They allow answering questions like:
* Why is this process using 100% CPU?
* How do I make this go faster?
* Why did this process run out of memory?
* Why are we paying for all these CPU cores and memory chips?
Go has [first-class support](https://pkg.go.dev/net/http/pprof) for profiling included in its
standard library, using the [pprof](https://github.com/google/pprof) profile format and associated
tooling.
This is not the case for Rust and C, where obtaining profiles can be rather cumbersome. It requires
installing and running additional tools like `perf` as root on production nodes, with analysis tools
that can be hard to use and often don't give good results. This is not only annoying, but can also
significantly affect the resolution time of production incidents.
This proposal will:
* Provide CPU and heap profiles in pprof format via HTTP API.
* Record continuous profiles in Grafana for aggregate historical analysis.
* Make it easy for anyone to see a flamegraph in less than one minute.
* Be reasonably consistent across teams and services (Rust, Go, C).
## Non Goals (For Now)
* [Additional profile types](https://grafana.com/docs/pyroscope/next/configure-client/profile-types/)
like mutexes, locks, goroutines, etc.
* [Runtime trace integration](https://grafana.com/docs/pyroscope/next/configure-client/trace-span-profiles/).
* [Profile-guided optimization](https://en.wikipedia.org/wiki/Profile-guided_optimization).
## Using Profiles
Ready-to-use profiles can be obtained using e.g. `curl`. For Rust services:
```
$ curl localhost:9898/profile/cpu >profile.pb.gz
```
pprof profiles can be explored using the [`pprof`](https://github.com/google/pprof) web UI, which
provides flamegraphs, call graphs, plain text listings, and more:
```
$ pprof -http :6060 <profile>
```
Some endpoints (e.g. Rust-based ones) can also generate flamegraph SVGs directly:
```
$ curl localhost:9898/profile/cpu?format=svg >profile.svg
$ open profile.svg
```
Continuous profiles are available in Grafana under Explore → Profiles → Explore Profiles
(currently only in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)).
## API Requirements
* HTTP endpoints that return a profile in pprof format (with symbols).
* CPU: records a profile over the request time interval (`seconds` query parameter).
* Memory: returns the current in-use heap allocations.
* Unauthenticated, as it should not expose user data or pose a denial-of-service risk.
* Default sample frequency should not impact service (maximum 5% CPU overhead).
* Linux-compatibility.
Nice to have:
* Return flamegraph SVG directly from the HTTP endpoint if requested.
* Configurable sample frequency for CPU profiles.
* Historical heap allocations, by count and bytes.
* macOS-compatibility.
## Rust Profiling
[`libs/utils/src/http/endpoint.rs`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs)
contains ready-to-use HTTP endpoints for CPU and memory profiling:
[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338) and [`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
### CPU
CPU profiles are provided by [pprof-rs](https://github.com/tikv/pprof-rs) via
[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338).
Expose it unauthenticated at `/profile/cpu`.
Parameters:
* `format`: profile output format (`pprof` or `svg`; default `pprof`).
* `seconds`: duration to collect profile over, in seconds (default `5`).
* `frequency`: how often to sample thread stacks, in Hz (default `99`).
* `force`: if `true`, cancel a running profile and start a new one (default `false`).
Works on Linux and macOS.
### Memory
Use the jemalloc allocator via [`tikv-jemallocator`](https://github.com/tikv/jemallocator),
and enable profiling with samples every 2 MB allocated:
```rust
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
```
pprof profiles are generated by
[`jemalloc-pprof`](https://github.com/polarsignals/rust-jemalloc-pprof) via
[`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
Expose it unauthenticated at `/profile/heap`.
Parameters:
* `format`: profile output format (`pprof`, `svg`, or `jemalloc`; default `pprof`).
Works on Linux only, due to [jemalloc limitations](https://github.com/jemalloc/jemalloc/issues/26).
## Go Profiling
The Go standard library includes pprof profiling via HTTP API in
[`net/http/pprof`](https://pkg.go.dev/net/http/pprof). Expose it unauthenticated at
`/debug/pprof`.
Works on Linux and macOS.
### CPU
Via `/debug/pprof/profile`. Parameters:
* `debug`: profile output format (`0` is pprof, `1` or above is plaintext; default `0`).
* `seconds`: duration to collect profile over, in seconds (default `30`).
Does not support a frequency parameter (see [#57488](https://github.com/golang/go/issues/57488)),
and defaults to 100 Hz. A lower frequency can be hardcoded via `SetCPUProfileRate`, but the default
is likely ok (estimated 1% overhead).
### Memory
Via `/debug/pprof/heap`. Parameters:
* `seconds`: take a delta profile over the given duration, in seconds (default `0`).
* `gc`: if `1`, garbage collect before taking profile.
## C Profiling
[gperftools](https://github.com/gperftools/gperftools) provides in-process CPU and heap profiling
with pprof output.
However, continuous profiling of PostgreSQL is expensive (many computes), and has limited value
since we don't own the internals anyway.
Ad hoc profiling might still be useful, but the compute team considers existing tooling sufficient,
so this is not a priority at the moment.
## Grafana Continuous Profiling
[Grafana Alloy](https://grafana.com/docs/alloy/latest/) continually scrapes CPU and memory profiles
across the fleet, and archives them as time series. This can be used to analyze resource usage over
time, either in aggregate or zoomed in to specific events and nodes.
Profiles are retained for 30 days. Profile ingestion volume for CPU+heap at 60-second intervals
is about 0.5 GB/node/day, or about $0.25/node/day = $7.5/node/month ($0.50/GB).
It is currently enabled in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)
for Pageserver and Safekeeper.
### Scraping
* CPU profiling: 59 seconds at 19 Hz every 60 seconds.
* Heap profiling: heap snapshot with 2 MB frequency every 60 seconds.
There are two main approaches that can be taken for CPU profiles:
* Continuous low-frequency profiles (e.g. 19 Hz for 60 seconds every 60 seconds).
* Occasional high-frequency profiles (e.g. 99 Hz for 5 seconds every 60 seconds).
We choose continuous low-frequency profiles where possible. This has a fixed low overhead, instead
of a spiky high overhead. It likely also gives a more representative view of resource usage.
However, a 19 Hz rate gives a minimum resolution of 52.6 ms per sample, which may be larger than the
actual runtime of small functions. Note that Go does not support a frequency parameter, so we must
use a fixed frequency for all profiles via `SetCPUProfileRate()` (default 100 Hz).
Only one CPU profile can be taken at a time. With continuous profiling, one will always be running.
To allow also taking an ad hoc CPU profile, the Rust endpoint supports a `force` query parameter to
cancel a running profile and start a new one.
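A minimal sketch of such a single-slot guard, assuming `tokio_util::sync::CancellationToken`; names are illustrative and this is not the actual `endpoint.rs` code.
```rust
use std::sync::Mutex;
use tokio_util::sync::CancellationToken;

/// The profile currently holding the slot, if any.
static RUNNING_PROFILE: Mutex<Option<CancellationToken>> = Mutex::new(None);

/// Claim the profiling slot. With `force`, cancel whoever currently holds it.
fn claim_profile_slot(force: bool) -> Result<CancellationToken, &'static str> {
    let mut slot = RUNNING_PROFILE.lock().unwrap();
    if let Some(existing) = slot.as_ref() {
        if force {
            existing.cancel(); // ask the running profile to stop early
        } else {
            return Err("a CPU profile is already running; pass force=true to preempt it");
        }
    }
    let token = CancellationToken::new();
    *slot = Some(token.clone());
    Ok(token)
}
```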
### Overhead
With Rust:
* CPU profiles at 19 Hz frequency: 0.1% overhead.
* Heap profiles at 2 MB frequency: 3% allocation overhead.
* Profile call/encoding/symbolization: 20 ms every 60 seconds, or 0.03% of 1 CPU (for Pageserver).
* Profile symbolization caches: 125 MB memory, or 0.4% of 32 GB (for Pageserver).
Benchmarks with pprof-rs showed that the CPU time for taking a stack trace of a 40-frame stack was
11 µs using the `frame-pointer` feature, and 1.4 µs using `libunwind` with DWARF. `libunwind` saw
frequent seg faults, so we use `frame-pointer` and build binaries with frame pointers (negligible
overhead).
CPU profiles work by installing an `ITIMER_PROF` for the process, which triggers a `SIGPROF` signal
after a given amount of cumulative CPU time across all CPUs. The signal handler will run for one
of the currently executing threads and take a stack trace. Thus, a 19 Hz profile will take 1 stack
trace every 52.6 ms CPU time -- assuming 11 µs for a stack trace, this is 0.02% overhead, but
likely 0.1% in practice (given e.g. context switches).
Heap profiles work by probabilistically taking a stack trace on allocations, adjusted for the
allocation size. A 1 MB allocation takes about 15 µs in benchmarks, and a stack trace about 1 µs,
so we can estimate that a 2 MB sampling frequency has about 3% allocation overhead -- this is
consistent with benchmarks. This is significantly larger than CPU profiles, but mitigated by the
fact that performance-sensitive code will avoid allocations as far as possible.
Profile symbolization uses in-memory caches for symbol lookups. These take about 125 MB for
Pageserver.
## Alternatives Considered
* eBPF profiles.
* Don't require instrumenting the binary.
* Use less resources.
* Can profile in kernel space too.
* Supported by Grafana.
* Less information about stack frames and spans.
* Limited tooling for local analysis.
* Does not support heap profiles.
* Does not work on macOS.
* [Polar Signals](https://www.polarsignals.com) instead of Grafana.
* We already use Grafana for everything else. Appears good enough.

View File

@@ -0,0 +1,255 @@
#
Created on Aug 2024
Implemented on Jan 2025
## Summary
Data in large tenants is split up between multiple pageservers according to key hashes, as
introduced in the [sharding RFC](031-sharding-static.md) and [shard splitting RFC](032-shard-splitting.md).
Whereas currently we send all WAL to all pageserver shards, and each shard filters out the data that it needs,
in this RFC we add a mechanism to filter the WAL on the safekeeper, so that each shard receives
only the data it needs.
This will place some extra CPU load on the safekeepers, in exchange for reducing the network bandwidth
for ingesting WAL back to scaling as O(1) with shard count, rather than O(N_shards).
## Motivation
1. Large databases require higher shard counts. Whereas currently we run with up to 8 shards for tenants
with a few TB of storage, the next order of magnitude capacity increase will require tens of shards, such
that sending all WAL to all shards is impractical in terms of bandwidth.
2. For contemporary database sizes (~2TB), the pageserver is the bottleneck for ingest: since each
shard has to decode and process the whole WAL, sharding doesn't fully relieve this bottleneck. To achieve significantly higher ingest speeds, we need to filter the WAL earlier so that each pageserver
only has to process relevant parts.
## Non Goals (if relevant)
We do not seek to introduce multiple WALs per timeline, or to share the work of handling a timeline's
WAL across safekeepers (beyond simple 3x replication). This RFC may be thought of as an incremental
move of the ingestion bottleneck up the stack: instead of high write rates bottlenecking on the
pageserver, they will bottleneck on the safekeeper.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Safekeeper, pageserver.
There will be no control plane or storage controller coordination needed, as pageservers will directly
indicate their sharding parameters to the safekeeper when subscribing for WAL.
## Proposed implementation
Terminology:
- "Data pages" refers to postgres relation blocks, and SLRU blocks.
- "Metadata pages" refers to everything else the pageserver stores, such as relation sizes and
directories of relations.
### Phase 1: Refactor ingest
Currently, pageserver ingest code is structured approximately as follows:
1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network
socket
2. `WalIngest::ingest_record` to translate the record into a series of page-level modifications
3. `DatadirModification` accumulates page updates from several `ingest_record` calls, and when
its `commit()` method is called, flushes these into a Timeline's open `InMemoryLayer`.
This process currently assumes access to a pageserver `Timeline` throughout `ingest_record` and
from `DatadirModification`, which is used to do read-modify-write cycles on metadata pages
such as relation sizes and the master DBDIR page. It also assumes that records are ingested
strictly one after the other: they cannot be ingested in parallel because each record assumes
that earlier records' changes have already been applied to `Timeline`.
This code will be refactored to disentangle the simple, fast decode of relation page writes
from the more complex logic for updating internal metadata. An intermediate representation
called `InterpretedWalRecords` will be introduced. This is similar to the internal state of
a `DatadirModification`, but does not require access to a Timeline. Instead of storing
metadata updates as materialized writes to pages, it will accumulate these as abstract operations,
for example rather than including a write to a relation size key, this structure will include
an operation that indicates "Update relation _foo_'s size to the max of its current value and
_bar_", such that these may be applied later to a real Timeline.
The `DatadirModification` will be aware of the `EphemeralFile` format, so that as it accumulates
simple page writes of relation blocks, it can write them directly into a buffer in the serialized
format. This will avoid the need to later deserialize/reserialize this data when passing the
structure between safekeeper and pageserver.
The new pipeline will be:
1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network socket
2. An `InterpretedWalRecords` is generated from the incoming WAL records. This does not
require a reference to a Timeline.
3. The logic that is currently spread between `WalIngest` and `DatadirModification` for updating
metadata will be refactored to consume the metadata operations from the `InterpretedWalRecords`
and turn them into literal writes to metadata pages. This part must be done sequentially.
4. The resulting buffer of metadata page writes is combined with the buffer of relation block
writes, and written into the `InMemoryLayer`.
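A rough sketch of the shape of this intermediate representation, purely to illustrate the split between literal page writes and abstract metadata operations; the names and fields are assumptions, not the actual types in the repository.
```rust
/// Illustrative only: not the real types, just the split the refactor aims for.
struct RelTag {
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
    forknum: u8,
}

/// Metadata effects that can only be applied later, against a real Timeline.
enum MetadataOp {
    /// "Relation is at least this many blocks": applied as max(current, nblocks).
    RelationSizeAtLeast { rel: RelTag, nblocks: u32 },
    CreateRelation { rel: RelTag },
    DropRelation { rel: RelTag },
}

/// Timeline-independent output of the first ingest phase for one batch of WAL.
struct InterpretedWalRecords {
    /// Relation/SLRU block writes, already serialized in the ephemeral layer
    /// format so the pageserver can append them without re-encoding.
    page_writes: Vec<u8>,
    /// Abstract metadata updates, applied sequentially on the pageserver.
    metadata_ops: Vec<MetadataOp>,
    /// End LSN of the batch, for feedback and consistency tracking.
    end_lsn: u64,
}
```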
Implemented in:
1. https://github.com/neondatabase/neon/pull/9472
2. https://github.com/neondatabase/neon/pull/9504
3. https://github.com/neondatabase/neon/pull/9524
### Phase 2: Decode & filter on safekeeper
In the previous phase, the ingest code was modified to be able to do most of its work without access to
a Timeline: this first stage of ingest simply converts a series of binary wal records into
a buffer of relation/SLRU page writes, and a buffer of abstract metadata writes.
The modified ingest code may be transplanted from pageserver to safekeeper (probably via a
shared crate). The safekeeper->pageserver network protocol is modified to:
- in subscription requests, send the `ShardIdentity` from the pageserver to the safekeeper
- in responses, transmit an `InterpretedWalRecords` instead of a raw `WalRecord`.
- use the `ShardIdentity` to filter the `ProcessedWalIngest` to relevant content for
the subscribing shard before transmitting it.
The overall behavior of the pageserver->safekeeper interaction remains the same, in terms of
consistent LSN feedback, and connection management. Only the payload of the subscriptions
changes, to express an LSN range of WAL as a filtered `ProcessedWalIngest` instead of the
raw data.
The ingest code on the pageserver can now skip the part where it does the first phase of
processing, as it will receive pre-processed, compressed data off the wire.
Note that `InterpretedWalRecords` batches multiple `InterpretedWalRecord(s)` in the same network
message. Safekeeper reads WAL in chunks of 16 blocks and then decodes as many Postgres WAL records
as possible. Each Postgres WAL record maps to one `InterpretedWalRecord` for potentially multiple shards.
Hence, the size of the batch is given by the number of Postgres WAL records that fit in 16 blocks.
The protocol needs to support evolution. Protobuf was chosen here with the view that, in the future,
we may migrate it to gRPC altogether.
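As a sketch of the per-shard filtering step; every name below is a hypothetical stand-in for the real sharding helpers, and the hash is a placeholder for the actual key-to-shard mapping.
```rust
/// Illustrative only: keep the page writes that map to the subscribing shard.
struct Key([u8; 16]);

struct PageWrite {
    key: Key,
    lsn: u64,
    image: Vec<u8>,
}

struct ShardIdentity {
    shard_number: u32,
    shard_count: u32,
}

impl ShardIdentity {
    /// Placeholder for the real key-to-shard mapping (hash of the key modulo
    /// the shard count in the actual sharding scheme).
    fn owns(&self, key: &Key) -> bool {
        let h = key
            .0
            .iter()
            .fold(0u32, |acc, b| acc.wrapping_mul(31).wrapping_add(*b as u32));
        h % self.shard_count == self.shard_number
    }
}

/// Drop page writes the subscribing shard does not own; metadata operations
/// are handled separately.
fn filter_for_shard(writes: Vec<PageWrite>, shard: &ShardIdentity) -> Vec<PageWrite> {
    writes.into_iter().filter(|w| shard.owns(&w.key)).collect()
}
```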
Implemented in:
1. https://github.com/neondatabase/neon/pull/9746
2. https://github.com/neondatabase/neon/pull/9821
### Phase 3: Fan out interpreted WAL
In the previous phase, the initial processing of WAL was moved to the safekeeper, but it is still
done once for each shard: this will generate O(N_shards) CPU work on the safekeeper (especially
when considering converting to Protobuf format and compression).
To avoid this, we fan-out WAL from one (tenant, timeline, shard) to all other shards subscribed on
the same safekeeper. Under normal operation, the WAL will be read from disk, decoded and interpreted
_only_ once per (safekeeper, timeline).
When the first shard of a sharded timeline subscribes to a given safekeeper a task is spawned
for the WAL reader (`InterpretedWalReader`). This task reads WAL, decodes, interprets it and sends
it to the sender (`InterpretedWalSender`). The sender is a future that is polled from the connection
task. When further shards subscribe on the safekeeper they will attach themselves to the existing WAL reader.
There's two cases to consider:
1. The shard's requested `start_lsn` is ahead of the current position of the WAL reader. In this case, the shard
will start receiving data when the reader reaches that LSN. The intuition here is that there's little to gain
by letting shards "front-run" since compute backpressure is based on the laggard LSN.
2. The shard's requested `start_lsn` is below the current position of the WAL reader. In this case, the WAL reader
gets reset to this requested position (same intuition). Special care is taken such that advanced shards do not receive
interpreted WAL records below their current position.
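A sketch of the attach decision for the two cases above; the types are hypothetical, and the real reader additionally tracks per-shard senders so that already-advanced shards skip records below their own position after a reset.
```rust
/// Illustrative attach logic for a new shard subscription.
struct Lsn(u64);

enum AttachOutcome {
    /// Case 1: the reader is behind the shard's start position; the shard simply
    /// starts receiving once the reader reaches it (no front-running).
    WaitUntilReaderReaches(Lsn),
    /// Case 2: the reader is ahead; rewind it to the new shard's start position.
    ResetReaderTo(Lsn),
}

fn attach_shard(reader_pos: Lsn, requested_start: Lsn) -> AttachOutcome {
    if requested_start.0 >= reader_pos.0 {
        AttachOutcome::WaitUntilReaderReaches(requested_start)
    } else {
        AttachOutcome::ResetReaderTo(requested_start)
    }
}
```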
The approach above implies that there is at most one WAL reader per (tenant, timeline) on a given safekeeper at any point in time.
If this turns out to be operationally problematic, there's a trick we can deploy: `--max-delta-for-fanout` is an optional safekeeper
argument that controls the max absolute delta between a new shard and the current WAL position of the WAL reader. If the absolute
delta is above that value, a new reader is spawned. Note that there's currently no concurrency control on the number of WAL readers,
so it's recommended to use large values to avoid pushing CPU utilisation too high.
Unsharded tenants do not spawn a separate task for the interpreted WAL reader since there's no benefit to it. Instead they poll
the reader and sender concurrently from the connection task.
Shard splits are interesting here because it is the only case when the same shard might have two subscriptions at the same time.
This is handled by giving readers a unique identifier. Both shards will receive the same data while respecting their requested start
position.
Implemented in:
1. https://github.com/neondatabase/neon/pull/10190
## Deployment
Each phase shall be deployed independently. Special care should be taken around protocol changes.
## Observability Tips
* The safekeeper logs the protocol requested by the pageserver
along with the pageserver ID, tenant, timeline and shard: `starting streaming from`.
* There's metrics for the number of wal readers:
* `safekeeper_wal_readers{kind="task", target=~"pageserver.*"}` gives the number of wal reader tasks for each SK
* `safekeeper_wal_readers{kind="future", target=~"pageserver.*"}` gives the numer of wal readers polled inline by each SK
* `safekeeper_interpreted_wal_reader_tasks` gives the number of wal reader tasks per tenant, timeline
* Interesting log lines for the fan-out reader:
* `Spawning interpreted`: first shard creates the interpreted wal reader
* `Fanning out`: a subsequent shard attaches itself to an interpreted wal reader
* `Aborting interpreted`: all senders have finished and the reader task is being aborted
## Future Optimizations
This section describes some improvement areas which may be revisited in the future.
### Buffering of Interpreted WAL
The interpreted WAL reader may buffer interpreted WAL records in user space to help with serving
subscriptions that are lagging behind the current position of the reader.
Counterpoints:
* Safekeepers serve many thousands of timelines and allocating a buffer for each might be wasteful,
especially given that it would go unused on the happy path.
* WAL is buffered in the kernel page cache. Usually we'd only pay the CPU cost of decoding and interpreting.
### Tweaking the Pageserver Safekeeper Selection Algorithm
We could make the pageserver aware of which safekeepers already host shards for the timeline along
with their current WAL positions. The pageserver should then prefer safekeepers that are in the same
AZ _and_ already have a shard with a position close to the desired start position.
We currently run one safekeeper per AZ, so the point is moot until that changes.
### Pipelining first ingest phase
The first ingest phase is a stateless transformation of a binary WAL record into a pre-processed
output per shard. To put multiple CPUs to work, we may pipeline this processing up to some defined buffer
depth.
## Alternatives considered
### Give safekeepers enough state to fully decode WAL
In this RFC, we only do the first phase of ingest on the safekeeper, because this is
the phase that is stateless. Subsequent changes then happen on the pageserver, with
access to the `Timeline` state.
We could do more work on the safekeeper if we transmitted metadata state to the safekeeper
when subscribing to the WAL: for example, by telling the safekeeper all the relation sizes,
so that it could then generate all the metadata writes for relation sizes.
We avoid doing this for several reasons:
1. Complexity: it's a more invasive protocol change
2. Decoupling: having the safekeeper understand the `ProcessedWalIngest` already somewhat
infects it with knowledge of the pageserver, but this is mainly an abstract structure
that describes postgres writes. However, if we taught the safekeeper about the exact
way that pageserver deals with metadata keys, this would be a much tighter coupling.
3. Load: once the WAL has been processed to the point that it can be split between shards,
it is preferable to share out work on the remaining shards rather than adding extra CPU
load to the safekeeper.
### Do pre-processing on the compute instead of the safekeeper
Since our first stage of ingest is stateless, it could be done at any stage in the pipeline,
all the way up to the compute.
We choose not to do this, because it is useful for the safekeeper to store the raw WAL rather
than just the preprocessed WAL:
- The safekeeper still needs to be able to serve raw WAL back to postgres for e.g. physical replication
- It simplifies our paxos implementation to have the offset in the write log be literally
the same as the LSN
- Raw WAL must have a stable protocol since we might have to re-ingest it at arbitrary points in the future.
Storing raw WAL gives us more flexibility to evolve the pageserver/safekeeper protocol.
### Do wal pre-processing on shard 0 or a separate service, send it to other shards from there
If we wanted to keep the safekeepers as entirely pure stores of raw WAL bytes, then
we could do the initial decode and shard-splitting in some other location:
- Shard 0 could subscribe to the full WAL and then send writes to other shards
- A new intermediate service between the safekeeper and pageserver could do the splitting.
So why not?
- Extra network hop from shard 0 to the final destination shard
- Clearly there is more infrastructure involved here compared with doing it inline on the safekeeper.
- Safekeepers already have very light CPU load: typical cloud instance shapes with appropriate
disks for the safekeepers effectively have "free" CPU resources.
- Doing extra work on shard 0 would complicate scheduling of shards on pageservers, because
shard 0 would have significantly higher CPU load under write workloads than other shards.

View File

@@ -15,6 +15,17 @@ pub struct GenericAPIError {
pub error: String,
}
#[derive(Debug, Clone, Serialize)]
pub struct InfoResponse {
pub num_cpus: usize,
}
#[derive(Debug, Clone, Serialize)]
pub struct ExtensionInstallResponse {
pub extension: PgIdent,
pub version: ExtVersion,
}
/// Response of the /status API
#[derive(Serialize, Debug, Deserialize)]
#[serde(rename_all = "snake_case")]
@@ -28,16 +39,6 @@ pub struct ComputeStatusResponse {
pub error: Option<String>,
}
#[derive(Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeState {
pub status: ComputeStatus,
/// Timestamp of the last Postgres activity
#[serde(serialize_with = "rfc3339_serialize")]
pub last_active: Option<DateTime<Utc>>,
pub error: Option<String>,
}
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {
@@ -78,7 +79,7 @@ impl Display for ComputeStatus {
}
}
fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
pub fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{

View File

@@ -138,6 +138,13 @@ pub struct ComputeSpec {
/// enough spare connections for reconfiguration process to succeed.
#[serde(default = "default_reconfigure_concurrency")]
pub reconfigure_concurrency: usize,
/// If set to true, the compute_ctl will drop all subscriptions before starting the
/// compute. This is needed when we start an endpoint on a branch, so that child
/// would not compete with parent branch subscriptions
/// over the same replication content from publisher.
#[serde(default)] // Default false
pub drop_subscriptions_before_start: bool,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.

View File

@@ -120,6 +120,7 @@ pub struct ConfigToml {
pub no_sync: Option<bool>,
pub wal_receiver_protocol: PostgresClientProtocol,
pub page_service_pipelining: PageServicePipeliningConfig,
pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -158,6 +159,25 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
Tasks,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
#[serde(deny_unknown_fields)]
pub enum GetVectoredConcurrentIo {
/// The read path is fully sequential: layers are visited
/// one after the other and IOs are issued and waited upon
/// from the same task that traverses the layers.
Sequential,
/// The read path still traverses layers sequentially, and
/// index blocks will be read into the PS PageCache from
/// that task, with waiting.
/// But data IOs are dispatched and waited upon from a sidecar
/// task so that the traversing task can continue to traverse
/// layers while the IOs are in flight.
/// If the PS PageCache miss rate is low, this improves
/// throughput dramatically.
SidecarTask,
}
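// Editor's sketch (not part of the diff): how this internally tagged, kebab-case enum
// round-trips through serde, e.g. when parsing the pageserver config. Shown with
// serde_json for brevity; the real config file is TOML.
//
// let io: GetVectoredConcurrentIo =
//     serde_json::from_value(serde_json::json!({ "mode": "sidecar-task" }))?;
// assert!(matches!(io, GetVectoredConcurrentIo::SidecarTask));
// assert_eq!(
//     serde_json::to_value(GetVectoredConcurrentIo::Sequential)?,
//     serde_json::json!({ "mode": "sequential" })
// );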
pub mod statvfs {
pub mod mock {
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -234,9 +254,21 @@ pub struct TenantConfigToml {
// Duration::ZERO means automatic compaction is disabled.
#[serde(with = "humantime_serde")]
pub compaction_period: Duration,
// Level0 delta layer threshold for compaction.
/// Level0 delta layer threshold for compaction.
pub compaction_threshold: usize,
pub compaction_algorithm: crate::models::CompactionAlgorithmSettings,
/// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure,
/// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer
/// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification
/// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default.
pub l0_flush_delay_threshold: Option<usize>,
/// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold
/// to avoid deadlock. 0 to disable. Disabled by default.
pub l0_flush_stall_threshold: Option<usize>,
/// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next
/// layer. This is a temporary backpressure mechanism which should be removed once
/// l0_flush_{delay,stall}_threshold is fully enabled.
pub l0_flush_wait_upload: bool,
// Determines how much history is retained, to allow
// branching and read replicas at an older point in time.
// The unit is #of bytes of WAL.
@@ -301,6 +333,20 @@ pub struct TenantConfigToml {
pub timeline_offloading: bool,
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
/// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
/// `index_part.json`, and it cannot be reversed.
pub rel_size_v2_enabled: Option<bool>,
// gc-compaction related configs
/// Enable automatic gc-compaction trigger on this tenant.
pub gc_compaction_enabled: bool,
/// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
/// gc-compaction will be triggered.
pub gc_compaction_initial_threshold_kb: u64,
/// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN)
/// is above this ratio, gc-compaction will be triggered.
pub gc_compaction_ratio_percent: u64,
}
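// Editor's sketch (not part of the diff) of how the two L0 thresholds documented above
// could combine: stalling dominates delaying, and 0/None disables each mechanism.
// The enum and function names below are illustrative only.
enum L0Backpressure {
    None,
    Delay, // flushes take ~2x as long
    Stall, // flushes wait for compaction to catch up
}
fn l0_backpressure(
    l0_count: usize,
    delay_threshold: Option<usize>,
    stall_threshold: Option<usize>,
) -> L0Backpressure {
    let delay = delay_threshold.filter(|&t| t > 0);
    let stall = stall_threshold.filter(|&t| t > 0);
    match (delay, stall) {
        (_, Some(s)) if l0_count >= s => L0Backpressure::Stall,
        (Some(d), _) if l0_count >= d => L0Backpressure::Delay,
        _ => L0Backpressure::None,
    }
}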
pub mod defaults {
@@ -450,6 +496,11 @@ impl Default for ConfigToml {
execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
})
},
get_vectored_concurrent_io: if !cfg!(test) {
GetVectoredConcurrentIo::Sequential
} else {
GetVectoredConcurrentIo::SidecarTask
},
}
}
}
@@ -475,6 +526,8 @@ pub mod tenant_conf_defaults {
pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
crate::models::CompactionAlgorithm::Legacy;
pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true;
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
// Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
@@ -494,6 +547,9 @@ pub mod tenant_conf_defaults {
// By default ingest enough WAL for two new L0 layers before checking if new
// image layers should be created.
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 10240000;
pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
}
impl Default for TenantConfigToml {
@@ -510,6 +566,9 @@ impl Default for TenantConfigToml {
compaction_algorithm: crate::models::CompactionAlgorithmSettings {
kind: DEFAULT_COMPACTION_ALGORITHM,
},
l0_flush_delay_threshold: None,
l0_flush_stall_threshold: None,
l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD,
gc_horizon: DEFAULT_GC_HORIZON,
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
.expect("cannot parse default gc period"),
@@ -538,6 +597,10 @@ impl Default for TenantConfigToml {
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
timeline_offloading: false,
wal_receiver_protocol_override: None,
rel_size_v2_enabled: None,
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
}
}
}

View File

@@ -87,7 +87,7 @@ impl Display for AvailabilityZone {
#[derive(Serialize, Deserialize)]
pub struct ShardsPreferredAzsRequest {
#[serde(flatten)]
pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
pub preferred_az_ids: HashMap<TenantShardId, Option<AvailabilityZone>>,
}
#[derive(Serialize, Deserialize)]
@@ -144,6 +144,8 @@ pub struct NodeDescribeResponse {
pub availability: NodeAvailabilityWrapper,
pub scheduling: NodeSchedulingPolicy,
pub availability_zone_id: String,
pub listen_http_addr: String,
pub listen_http_port: u16,
@@ -179,7 +181,6 @@ pub struct TenantDescribeResponseShard {
/// specifies some constraints, e.g. asking it to get off particular node(s)
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: NodeId,
}
@@ -320,6 +321,42 @@ impl From<NodeSchedulingPolicy> for String {
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum SkSchedulingPolicy {
Active,
Pause,
Decomissioned,
}
impl FromStr for SkSchedulingPolicy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s {
"active" => Self::Active,
"pause" => Self::Pause,
"decomissioned" => Self::Decomissioned,
_ => {
return Err(anyhow::anyhow!(
"Unknown scheduling policy '{s}', try active,pause,decomissioned"
))
}
})
}
}
impl From<SkSchedulingPolicy> for String {
fn from(value: SkSchedulingPolicy) -> String {
use SkSchedulingPolicy::*;
match value {
Active => "active",
Pause => "pause",
Decomissioned => "decomissioned",
}
.to_string()
}
}
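// Editor's usage sketch (not part of the diff) for the conversions above:
// let p: SkSchedulingPolicy = "pause".parse()?;           // via FromStr
// assert_eq!(String::from(p), "pause");                   // via From<_> for String
// assert!("offline".parse::<SkSchedulingPolicy>().is_err());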
/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether
/// to create secondary locations.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
@@ -336,6 +373,16 @@ pub enum PlacementPolicy {
Detached,
}
impl PlacementPolicy {
pub fn want_secondaries(&self) -> usize {
match self {
PlacementPolicy::Attached(secondary_count) => *secondary_count,
PlacementPolicy::Secondary => 1,
PlacementPolicy::Detached => 0,
}
}
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
@@ -373,8 +420,6 @@ pub struct MetadataHealthListOutdatedResponse {
}
/// Publicly exposed safekeeper description
///
/// The `active` flag which we have in the DB is not included on purpose: it is deprecated.
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperDescribeResponse {
pub id: NodeId,
@@ -387,6 +432,12 @@ pub struct SafekeeperDescribeResponse {
pub port: i32,
pub http_port: i32,
pub availability_zone_id: String,
pub scheduling_policy: SkSchedulingPolicy,
}
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperSchedulingPolicyRequest {
pub scheduling_policy: SkSchedulingPolicy,
}
#[cfg(test)]

View File

@@ -24,7 +24,9 @@ pub struct Key {
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
/// a struct of fields.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
#[derive(
Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug,
)]
pub struct CompactKey(i128);
/// The storage key size.
@@ -462,6 +464,18 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
}
}
#[inline(always)]
pub fn rel_size_key_to_rel(key: Key) -> RelTag {
assert_eq!(key.field1, 0x00);
assert_eq!(key.field6, 0xffff_ffff);
RelTag {
forknum: key.field5,
spcnode: key.field2,
dbnode: key.field3,
relnode: key.field4,
}
}
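// Editor's sketch (hypothetical test, not in the diff): rel_size_key_to_rel inverts
// rel_size_to_key, so rel size keys can be mapped back to their relations.
// #[test]
// fn rel_size_key_roundtrip() {
//     let rel = RelTag { spcnode: 1663, dbnode: 16384, relnode: 16385, forknum: 0 };
//     assert_eq!(rel_size_key_to_rel(rel_size_to_key(rel)), rel);
// }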
impl Key {
#[inline(always)]
pub fn is_rel_size_key(&self) -> bool {
@@ -557,6 +571,15 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
}
}
#[inline(always)]
pub fn slru_segment_size_key_to_segno(key: Key) -> u32 {
assert_eq!(key.field1, 0x01);
assert_eq!(key.field3, 1);
assert_eq!(key.field5, 0);
assert_eq!(key.field6, 0xffff_ffff);
key.field4
}
impl Key {
pub fn is_slru_segment_size_key(&self) -> bool {
self.field1 == 0x01
@@ -706,7 +729,7 @@ pub fn repl_origin_key_range() -> Range<Key> {
/// Non inherited range for vectored get.
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
pub const SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
impl Key {
// AUX_FILES currently stores only data for logical replication (slots etc), and
@@ -714,7 +737,42 @@ impl Key {
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(self) -> bool {
!NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self)
if self.is_sparse() {
self.is_inherited_sparse_key()
} else {
!NON_INHERITED_RANGE.contains(&self)
}
}
#[inline(always)]
pub fn is_sparse(self) -> bool {
self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
}
/// Check if the key belongs to the inherited keyspace.
fn is_inherited_sparse_key(self) -> bool {
debug_assert!(self.is_sparse());
self.field1 == RELATION_SIZE_PREFIX
}
pub fn sparse_non_inherited_keyspace() -> Range<Key> {
// The two keys are adjacent; if we have non-adjacent keys in the future, we should return a keyspace
debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
Key {
field1: AUX_KEY_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: REPL_ORIGIN_KEY_PREFIX + 1,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
#[inline(always)]

View File

@@ -29,11 +29,10 @@ use utils::{
};
use crate::{
key::Key,
key::{CompactKey, Key},
reltag::RelTag,
shard::{ShardCount, ShardStripeSize, TenantShardId},
};
use anyhow::bail;
use bytes::{Buf, BufMut, Bytes, BytesMut};
/// The state of a tenant in this pageserver.
@@ -272,6 +271,8 @@ pub struct CompactInfoResponse {
pub compact_key_range: Option<CompactKeyRange>,
pub compact_lsn_range: Option<CompactLsnRange>,
pub sub_compaction: bool,
pub running: bool,
pub job_id: usize,
}
#[derive(Serialize, Deserialize, Clone)]
@@ -461,6 +462,12 @@ pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub l0_flush_delay_threshold: FieldPatch<usize>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub l0_flush_stall_threshold: FieldPatch<usize>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub l0_flush_wait_upload: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_horizon: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_period: FieldPatch<String>,
@@ -496,6 +503,14 @@ pub struct TenantConfigPatch {
pub timeline_offloading: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub rel_size_v2_enabled: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_enabled: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_ratio_percent: FieldPatch<u64>,
}
/// An alternative representation of `pageserver::tenant::TenantConf` with
@@ -509,6 +524,9 @@ pub struct TenantConfig {
pub compaction_threshold: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
pub compaction_algorithm: Option<CompactionAlgorithmSettings>,
pub l0_flush_delay_threshold: Option<usize>,
pub l0_flush_stall_threshold: Option<usize>,
pub l0_flush_wait_upload: Option<bool>,
pub gc_horizon: Option<u64>,
pub gc_period: Option<String>,
pub image_creation_threshold: Option<usize>,
@@ -527,6 +545,10 @@ pub struct TenantConfig {
pub lsn_lease_length_for_ts: Option<String>,
pub timeline_offloading: Option<bool>,
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
pub rel_size_v2_enabled: Option<bool>,
pub gc_compaction_enabled: Option<bool>,
pub gc_compaction_initial_threshold_kb: Option<u64>,
pub gc_compaction_ratio_percent: Option<u64>,
}
impl TenantConfig {
@@ -538,6 +560,9 @@ impl TenantConfig {
mut compaction_period,
mut compaction_threshold,
mut compaction_algorithm,
mut l0_flush_delay_threshold,
mut l0_flush_stall_threshold,
mut l0_flush_wait_upload,
mut gc_horizon,
mut gc_period,
mut image_creation_threshold,
@@ -556,6 +581,10 @@ impl TenantConfig {
mut lsn_lease_length_for_ts,
mut timeline_offloading,
mut wal_receiver_protocol_override,
mut rel_size_v2_enabled,
mut gc_compaction_enabled,
mut gc_compaction_initial_threshold_kb,
mut gc_compaction_ratio_percent,
} = self;
patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -566,6 +595,13 @@ impl TenantConfig {
patch.compaction_period.apply(&mut compaction_period);
patch.compaction_threshold.apply(&mut compaction_threshold);
patch.compaction_algorithm.apply(&mut compaction_algorithm);
patch
.l0_flush_delay_threshold
.apply(&mut l0_flush_delay_threshold);
patch
.l0_flush_stall_threshold
.apply(&mut l0_flush_stall_threshold);
patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload);
patch.gc_horizon.apply(&mut gc_horizon);
patch.gc_period.apply(&mut gc_period);
patch
@@ -600,6 +636,16 @@ impl TenantConfig {
patch
.wal_receiver_protocol_override
.apply(&mut wal_receiver_protocol_override);
patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
patch
.gc_compaction_enabled
.apply(&mut gc_compaction_enabled);
patch
.gc_compaction_initial_threshold_kb
.apply(&mut gc_compaction_initial_threshold_kb);
patch
.gc_compaction_ratio_percent
.apply(&mut gc_compaction_ratio_percent);
Self {
checkpoint_distance,
@@ -608,6 +654,9 @@ impl TenantConfig {
compaction_period,
compaction_threshold,
compaction_algorithm,
l0_flush_delay_threshold,
l0_flush_stall_threshold,
l0_flush_wait_upload,
gc_horizon,
gc_period,
image_creation_threshold,
@@ -626,6 +675,10 @@ impl TenantConfig {
lsn_lease_length_for_ts,
timeline_offloading,
wal_receiver_protocol_override,
rel_size_v2_enabled,
gc_compaction_enabled,
gc_compaction_initial_threshold_kb,
gc_compaction_ratio_percent,
}
}
}
@@ -968,6 +1021,13 @@ pub struct TenantConfigPatchRequest {
pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantWaitLsnRequest {
#[serde(flatten)]
pub timelines: HashMap<TimelineId, Lsn>,
pub timeout: Duration,
}
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
#[derive(Serialize, Deserialize, Clone)]
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
@@ -1398,6 +1458,8 @@ pub enum PagestreamFeMessage {
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
GetSlruSegment(PagestreamGetSlruSegmentRequest),
#[cfg(feature = "testing")]
Test(PagestreamTestRequest),
}
// Wrapped in libpq CopyData
@@ -1409,6 +1471,22 @@ pub enum PagestreamBeMessage {
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
GetSlruSegment(PagestreamGetSlruSegmentResponse),
#[cfg(feature = "testing")]
Test(PagestreamTestResponse),
}
// Keep in sync with `pagestore_client.h`
#[repr(u8)]
enum PagestreamFeMessageTag {
Exists = 0,
Nblocks = 1,
GetPage = 2,
DbSize = 3,
GetSlruSegment = 4,
/* future tags above this line */
/// For testing purposes, not available in production.
#[cfg(feature = "testing")]
Test = 99,
}
// Keep in sync with `pagestore_client.h`
@@ -1420,7 +1498,28 @@ enum PagestreamBeMessageTag {
Error = 103,
DbSize = 104,
GetSlruSegment = 105,
/* future tags above this line */
/// For testing purposes, not available in production.
#[cfg(feature = "testing")]
Test = 199,
}
impl TryFrom<u8> for PagestreamFeMessageTag {
type Error = u8;
fn try_from(value: u8) -> Result<Self, u8> {
match value {
0 => Ok(PagestreamFeMessageTag::Exists),
1 => Ok(PagestreamFeMessageTag::Nblocks),
2 => Ok(PagestreamFeMessageTag::GetPage),
3 => Ok(PagestreamFeMessageTag::DbSize),
4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
#[cfg(feature = "testing")]
99 => Ok(PagestreamFeMessageTag::Test),
_ => Err(value),
}
}
}
impl TryFrom<u8> for PagestreamBeMessageTag {
type Error = u8;
fn try_from(value: u8) -> Result<Self, u8> {
@@ -1431,6 +1530,8 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
103 => Ok(PagestreamBeMessageTag::Error),
104 => Ok(PagestreamBeMessageTag::DbSize),
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
#[cfg(feature = "testing")]
199 => Ok(PagestreamBeMessageTag::Test),
_ => Err(value),
}
}
@@ -1460,78 +1561,108 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
// difference in the responses between V1 and V2.
//
#[derive(Clone, Copy)]
// V3 of the protocol adds a request ID to all requests. This request ID is also included in the response,
// along with other fields from the request, which lets us verify that we received the response to our request.
// We copy the request fields into the response to make this check more reliable: the request ID is formed from the
// process ID and a local counter, so in principle request IDs can repeat if a PID is reused.
//
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum PagestreamProtocolVersion {
V2,
V3,
}
#[derive(Debug, PartialEq, Eq)]
pub type RequestId = u64;
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamRequest {
pub reqid: RequestId,
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
}
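// Editor's sketch (not part of the diff): the common V3 header that every Pagestream
// request begins with, matching the serialize()/parse() code in this file (all integers
// big-endian). The helper function name is illustrative.
fn put_v3_request_header(buf: &mut BytesMut, tag: PagestreamFeMessageTag, hdr: &PagestreamRequest) {
    buf.put_u8(tag as u8); // message tag, e.g. GetPage = 2
    buf.put_u64(hdr.reqid); // request id, new in V3 (0 under V2)
    buf.put_u64(hdr.request_lsn.0); // LSN to read at
    buf.put_u64(hdr.not_modified_since.0); // page not modified since this LSN
}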
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamExistsRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub hdr: PagestreamRequest,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamNblocksRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub hdr: PagestreamRequest,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamGetPageRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub hdr: PagestreamRequest,
pub rel: RelTag,
pub blkno: u32,
}
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamDbSizeRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub hdr: PagestreamRequest,
pub dbnode: u32,
}
#[derive(Debug, PartialEq, Eq)]
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub struct PagestreamGetSlruSegmentRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub hdr: PagestreamRequest,
pub kind: u8,
pub segno: u32,
}
#[derive(Debug)]
pub struct PagestreamExistsResponse {
pub req: PagestreamExistsRequest,
pub exists: bool,
}
#[derive(Debug)]
pub struct PagestreamNblocksResponse {
pub req: PagestreamNblocksRequest,
pub n_blocks: u32,
}
#[derive(Debug)]
pub struct PagestreamGetPageResponse {
pub req: PagestreamGetPageRequest,
pub page: Bytes,
}
#[derive(Debug)]
pub struct PagestreamGetSlruSegmentResponse {
pub req: PagestreamGetSlruSegmentRequest,
pub segment: Bytes,
}
#[derive(Debug)]
pub struct PagestreamErrorResponse {
pub req: PagestreamRequest,
pub message: String,
}
#[derive(Debug)]
pub struct PagestreamDbSizeResponse {
pub req: PagestreamDbSizeRequest,
pub db_size: i64,
}
#[cfg(feature = "testing")]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct PagestreamTestRequest {
pub hdr: PagestreamRequest,
pub batch_key: u64,
pub message: String,
}
#[cfg(feature = "testing")]
#[derive(Debug)]
pub struct PagestreamTestResponse {
pub req: PagestreamTestRequest,
}
// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
// that require pageserver-internal types. It is sufficient to get the total size.
#[derive(Serialize, Deserialize, Debug)]
@@ -1545,15 +1676,16 @@ pub struct TenantHistorySize {
impl PagestreamFeMessage {
/// Serialize a compute -> pageserver message. This is currently only used in testing
/// tools. Always uses protocol version 2.
/// tools. Always uses protocol version 3.
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(req) => {
bytes.put_u8(0);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(PagestreamFeMessageTag::Exists as u8);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1561,9 +1693,10 @@ impl PagestreamFeMessage {
}
Self::Nblocks(req) => {
bytes.put_u8(1);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1571,9 +1704,10 @@ impl PagestreamFeMessage {
}
Self::GetPage(req) => {
bytes.put_u8(2);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(PagestreamFeMessageTag::GetPage as u8);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1582,123 +1716,278 @@ impl PagestreamFeMessage {
}
Self::DbSize(req) => {
bytes.put_u8(3);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(PagestreamFeMessageTag::DbSize as u8);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u32(req.dbnode);
}
Self::GetSlruSegment(req) => {
bytes.put_u8(4);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
#[cfg(feature = "testing")]
Self::Test(req) => {
bytes.put_u8(PagestreamFeMessageTag::Test as u8);
bytes.put_u64(req.hdr.reqid);
bytes.put_u64(req.hdr.request_lsn.0);
bytes.put_u64(req.hdr.not_modified_since.0);
bytes.put_u64(req.batch_key);
let message = req.message.as_bytes();
bytes.put_u64(message.len() as u64);
bytes.put_slice(message);
}
}
bytes.into()
}
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
pub fn parse<R: std::io::Read>(
body: &mut R,
protocol_version: PagestreamProtocolVersion,
) -> anyhow::Result<PagestreamFeMessage> {
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.read_u8()?;
let (reqid, request_lsn, not_modified_since) = match protocol_version {
PagestreamProtocolVersion::V2 => (
0,
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
PagestreamProtocolVersion::V3 => (
body.read_u64::<BigEndian>()?,
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
};
// these two fields are the same for every request type
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn,
not_modified_since,
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
match PagestreamFeMessageTag::try_from(msg_tag)
.map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?
{
PagestreamFeMessageTag::Exists => {
Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
}))
}
PagestreamFeMessageTag::Nblocks => {
Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
}))
}
PagestreamFeMessageTag::GetPage => {
Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
blkno: body.read_u32::<BigEndian>()?,
}))
}
PagestreamFeMessageTag::DbSize => {
Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn,
not_modified_since,
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn,
not_modified_since,
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
blkno: body.read_u32::<BigEndian>()?,
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn,
not_modified_since,
dbnode: body.read_u32::<BigEndian>()?,
})),
4 => Ok(PagestreamFeMessage::GetSlruSegment(
}))
}
PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment(
PagestreamGetSlruSegmentRequest {
request_lsn,
not_modified_since,
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
kind: body.read_u8()?,
segno: body.read_u32::<BigEndian>()?,
},
)),
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
#[cfg(feature = "testing")]
PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
batch_key: body.read_u64::<BigEndian>()?,
message: {
let len = body.read_u64::<BigEndian>()?;
let mut buf = vec![0; len as usize];
body.read_exact(&mut buf)?;
String::from_utf8(buf)?
},
})),
}
}
}
impl PagestreamBeMessage {
pub fn serialize(&self) -> Bytes {
pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes {
let mut bytes = BytesMut::new();
use PagestreamBeMessageTag as Tag;
match self {
Self::Exists(resp) => {
bytes.put_u8(Tag::Exists as u8);
bytes.put_u8(resp.exists as u8);
}
match protocol_version {
PagestreamProtocolVersion::V2 => {
match self {
Self::Exists(resp) => {
bytes.put_u8(Tag::Exists as u8);
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(Tag::Nblocks as u8);
bytes.put_u32(resp.n_blocks);
}
Self::Nblocks(resp) => {
bytes.put_u8(Tag::Nblocks as u8);
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(Tag::GetPage as u8);
bytes.put(&resp.page[..]);
}
Self::GetPage(resp) => {
bytes.put_u8(Tag::GetPage as u8);
bytes.put(&resp.page[..])
}
Self::Error(resp) => {
bytes.put_u8(Tag::Error as u8);
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_i64(resp.db_size);
}
Self::Error(resp) => {
bytes.put_u8(Tag::Error as u8);
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_i64(resp.db_size);
}
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
#[cfg(feature = "testing")]
Self::Test(resp) => {
bytes.put_u8(Tag::Test as u8);
bytes.put_u64(resp.req.batch_key);
let message = resp.req.message.as_bytes();
bytes.put_u64(message.len() as u64);
bytes.put_slice(message);
}
}
}
PagestreamProtocolVersion::V3 => {
match self {
Self::Exists(resp) => {
bytes.put_u8(Tag::Exists as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.rel.spcnode);
bytes.put_u32(resp.req.rel.dbnode);
bytes.put_u32(resp.req.rel.relnode);
bytes.put_u8(resp.req.rel.forknum);
bytes.put_u8(resp.exists as u8);
}
Self::Nblocks(resp) => {
bytes.put_u8(Tag::Nblocks as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.rel.spcnode);
bytes.put_u32(resp.req.rel.dbnode);
bytes.put_u32(resp.req.rel.relnode);
bytes.put_u8(resp.req.rel.forknum);
bytes.put_u32(resp.n_blocks);
}
Self::GetPage(resp) => {
bytes.put_u8(Tag::GetPage as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.rel.spcnode);
bytes.put_u32(resp.req.rel.dbnode);
bytes.put_u32(resp.req.rel.relnode);
bytes.put_u8(resp.req.rel.forknum);
bytes.put_u32(resp.req.blkno);
bytes.put(&resp.page[..])
}
Self::Error(resp) => {
bytes.put_u8(Tag::Error as u8);
bytes.put_u64(resp.req.reqid);
bytes.put_u64(resp.req.request_lsn.0);
bytes.put_u64(resp.req.not_modified_since.0);
bytes.put(resp.message.as_bytes());
bytes.put_u8(0); // null terminator
}
Self::DbSize(resp) => {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u32(resp.req.dbnode);
bytes.put_i64(resp.db_size);
}
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u8(resp.req.kind);
bytes.put_u32(resp.req.segno);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
#[cfg(feature = "testing")]
Self::Test(resp) => {
bytes.put_u8(Tag::Test as u8);
bytes.put_u64(resp.req.hdr.reqid);
bytes.put_u64(resp.req.hdr.request_lsn.0);
bytes.put_u64(resp.req.hdr.not_modified_since.0);
bytes.put_u64(resp.req.batch_key);
let message = resp.req.message.as_bytes();
bytes.put_u64(message.len() as u64);
bytes.put_slice(message);
}
}
}
}
bytes.into()
}
@@ -1710,41 +1999,156 @@ impl PagestreamBeMessage {
let ok =
match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
Tag::Exists => {
let exists = buf.read_u8()?;
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let rel = RelTag {
spcnode: buf.read_u32::<BigEndian>()?,
dbnode: buf.read_u32::<BigEndian>()?,
relnode: buf.read_u32::<BigEndian>()?,
forknum: buf.read_u8()?,
};
let exists = buf.read_u8()? != 0;
Self::Exists(PagestreamExistsResponse {
exists: exists != 0,
req: PagestreamExistsRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel,
},
exists,
})
}
Tag::Nblocks => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let rel = RelTag {
spcnode: buf.read_u32::<BigEndian>()?,
dbnode: buf.read_u32::<BigEndian>()?,
relnode: buf.read_u32::<BigEndian>()?,
forknum: buf.read_u8()?,
};
let n_blocks = buf.read_u32::<BigEndian>()?;
Self::Nblocks(PagestreamNblocksResponse { n_blocks })
Self::Nblocks(PagestreamNblocksResponse {
req: PagestreamNblocksRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel,
},
n_blocks,
})
}
Tag::GetPage => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let rel = RelTag {
spcnode: buf.read_u32::<BigEndian>()?,
dbnode: buf.read_u32::<BigEndian>()?,
relnode: buf.read_u32::<BigEndian>()?,
forknum: buf.read_u8()?,
};
let blkno = buf.read_u32::<BigEndian>()?;
let mut page = vec![0; 8192]; // TODO: use MaybeUninit
buf.read_exact(&mut page)?;
PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
Self::GetPage(PagestreamGetPageResponse {
req: PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
rel,
blkno,
},
page: page.into(),
})
}
Tag::Error => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let mut msg = Vec::new();
buf.read_until(0, &mut msg)?;
let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
let rust_str = cstring.to_str()?;
PagestreamBeMessage::Error(PagestreamErrorResponse {
Self::Error(PagestreamErrorResponse {
req: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
message: rust_str.to_owned(),
})
}
Tag::DbSize => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let dbnode = buf.read_u32::<BigEndian>()?;
let db_size = buf.read_i64::<BigEndian>()?;
Self::DbSize(PagestreamDbSizeResponse { db_size })
Self::DbSize(PagestreamDbSizeResponse {
req: PagestreamDbSizeRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
dbnode,
},
db_size,
})
}
Tag::GetSlruSegment => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let kind = buf.read_u8()?;
let segno = buf.read_u32::<BigEndian>()?;
let n_blocks = buf.read_u32::<BigEndian>()?;
let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
buf.read_exact(&mut segment)?;
Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
req: PagestreamGetSlruSegmentRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
kind,
segno,
},
segment: segment.into(),
})
}
#[cfg(feature = "testing")]
Tag::Test => {
let reqid = buf.read_u64::<BigEndian>()?;
let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
let batch_key = buf.read_u64::<BigEndian>()?;
let len = buf.read_u64::<BigEndian>()?;
let mut msg = vec![0; len as usize];
buf.read_exact(&mut msg)?;
let message = String::from_utf8(msg)?;
Self::Test(PagestreamTestResponse {
req: PagestreamTestRequest {
hdr: PagestreamRequest {
reqid,
request_lsn,
not_modified_since,
},
batch_key,
message,
},
})
}
};
let remaining = buf.into_inner();
if !remaining.is_empty() {
@@ -1764,6 +2168,25 @@ impl PagestreamBeMessage {
Self::Error(_) => "Error",
Self::DbSize(_) => "DbSize",
Self::GetSlruSegment(_) => "GetSlruSegment",
#[cfg(feature = "testing")]
Self::Test(_) => "Test",
}
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct PageTraceEvent {
pub key: CompactKey,
pub effective_lsn: Lsn,
pub time: SystemTime,
}
impl Default for PageTraceEvent {
fn default() -> Self {
Self {
key: Default::default(),
effective_lsn: Default::default(),
time: std::time::UNIX_EPOCH,
}
}
}
@@ -1780,8 +2203,11 @@ mod tests {
// Test serialization/deserialization of PagestreamFeMessage
let messages = vec![
PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
},
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1790,8 +2216,11 @@ mod tests {
},
}),
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(4),
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(4),
},
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1800,8 +2229,11 @@ mod tests {
},
}),
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
},
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1811,14 +2243,19 @@ mod tests {
blkno: 7,
}),
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
hdr: PagestreamRequest {
reqid: 0,
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
},
dbnode: 7,
}),
];
for msg in messages {
let bytes = msg.serialize();
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
let reconstructed =
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3)
.unwrap();
assert!(msg == reconstructed);
}
}

View File

@@ -31,6 +31,8 @@
//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
//! and their slugs are 0004, 0104, 0204, and 0304.
use std::hash::{Hash, Hasher};
use crate::{key::Key, models::ShardParameters};
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
@@ -48,6 +50,23 @@ pub struct ShardIdentity {
layout: ShardLayout,
}
/// Hash implementation
///
/// The stripe size cannot change dynamically, so it can be ignored for efficiency reasons.
impl Hash for ShardIdentity {
fn hash<H: Hasher>(&self, state: &mut H) {
let ShardIdentity {
number,
count,
stripe_size: _,
layout: _,
} = self;
number.0.hash(state);
count.0.hash(state);
}
}
/// Stripe size in number of pages
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);
@@ -59,7 +78,7 @@ impl Default for ShardStripeSize {
}
/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
pub struct ShardLayout(u8);
const LAYOUT_V1: ShardLayout = ShardLayout(1);

View File

@@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError;
use utils::lsn::Lsn;
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlMultiXactCreate {
pub mid: MultiXactId,
/* new MultiXact's ID */
@@ -46,7 +46,7 @@ impl XlMultiXactCreate {
}
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlMultiXactTruncate {
pub oldest_multi_db: Oid,
/* to-be-truncated range of multixact offsets */
@@ -72,7 +72,7 @@ impl XlMultiXactTruncate {
}
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlRelmapUpdate {
pub dbid: Oid, /* database ID, or 0 for shared map */
pub tsid: Oid, /* database's tablespace, or pg_global */
@@ -90,7 +90,7 @@ impl XlRelmapUpdate {
}
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlReploriginDrop {
pub node_id: RepOriginId,
}
@@ -104,7 +104,7 @@ impl XlReploriginDrop {
}
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlReploriginSet {
pub remote_lsn: Lsn,
pub node_id: RepOriginId,
@@ -911,7 +911,7 @@ impl XlSmgrCreate {
}
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlSmgrTruncate {
pub blkno: BlockNumber,
pub rnode: RelFileNode,
@@ -984,7 +984,7 @@ impl XlDropDatabase {
/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
/// struct for commits and aborts.
///
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlXactParsedRecord {
pub xid: TransactionId,
pub info: u8,

View File

@@ -44,7 +44,7 @@ pub struct ProtocolVersion(u32);
impl ProtocolVersion {
pub const fn new(major: u16, minor: u16) -> Self {
Self((major as u32) << 16 | minor as u32)
Self(((major as u32) << 16) | minor as u32)
}
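// Editor's note: `major` occupies the high 16 bits and `minor` the low 16 bits, so
// ProtocolVersion::new(3, 1) packs to 0x0003_0001. The added parentheses only make the
// precedence explicit; `<<` already binds tighter than `|` in Rust, so behavior is unchanged.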
pub const fn minor(self) -> u16 {
self.0 as u16

View File

@@ -377,7 +377,8 @@ impl RemoteStorage for AzureBlobStorage {
let next_item = next_item?;
if timeout_try_cnt >= 2 {
// Log a warning if we saw two timeouts in a row before a successful request
if timeout_try_cnt > 2 {
tracing::warn!("Azure Blob Storage list timed out and succeeded after {} tries", timeout_try_cnt);
}
timeout_try_cnt = 1;

View File

@@ -43,6 +43,17 @@ impl RemoteStorageKind {
}
}
impl RemoteStorageConfig {
/// Helper to fetch the configured concurrency limit.
pub fn concurrency_limit(&self) -> Option<usize> {
match &self.storage {
RemoteStorageKind::LocalFs { .. } => None,
RemoteStorageKind::AwsS3(c) => Some(c.concurrency_limit.into()),
RemoteStorageKind::AzureContainer(c) => Some(c.concurrency_limit.into()),
}
}
}
fn default_timeout() -> Duration {
RemoteStorageConfig::DEFAULT_TIMEOUT
}
@@ -115,13 +126,15 @@ fn default_max_keys_per_list_response() -> Option<i32> {
}
fn default_azure_conn_pool_size() -> usize {
// Conservative default: no connection pooling. At time of writing this is the Azure
// SDK's default as well, due to historic reports of hard-to-reproduce issues
// By default, the Azure SDK does no connection pooling, due to historic reports of hard-to-reproduce issues
// (https://github.com/hyperium/hyper/issues/2312)
//
// However, using connection pooling is important to avoid exhausting client ports when
// doing huge numbers of requests (https://github.com/neondatabase/cloud/issues/20971)
0
//
// We therefore enable a modest pool size by default: this may be configured to zero if
// issues like the alleged upstream hyper issue appear.
8
}
impl Debug for S3Config {

View File

@@ -5,9 +5,12 @@ edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
const_format.workspace = true
serde.workspace = true
serde_json.workspace = true
postgres_ffi.workspace = true
pq_proto.workspace = true
tokio.workspace = true
utils.workspace = true
pageserver_api.workspace = true

View File

@@ -4,12 +4,15 @@ use const_format::formatcp;
use pq_proto::SystemId;
use serde::{Deserialize, Serialize};
pub mod membership;
/// Public API types
pub mod models;
/// Consensus logical timestamp. Note: it is a part of sk control file.
pub type Term = u64;
pub const INVALID_TERM: Term = 0;
/// Timelines are initially created with this term. It
/// is a normal term, except that the walproposer is never elected with it.
pub const INITIAL_TERM: Term = 0;
/// Information about Postgres. Safekeeper gets it once and then verifies all
/// further connections from computes match. Note: it is a part of sk control

View File

@@ -0,0 +1,166 @@
//! Types defining safekeeper membership, see
//! rfcs/035-safekeeper-dynamic-membership-change.md
//! for details.
use std::{collections::HashSet, fmt::Display};
use anyhow;
use anyhow::bail;
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
/// Number uniquely identifying safekeeper configuration.
/// Note: it is a part of sk control file.
pub type Generation = u32;
/// 1 is the first valid generation, 0 is used as
/// a placeholder before we fully migrate to generations.
pub const INVALID_GENERATION: Generation = 0;
pub const INITIAL_GENERATION: Generation = 1;
/// Membership is defined by ids, so e.g. the walproposer uses them to figure out
/// quorums, but we also carry host and port to give the walproposer an idea of where to connect.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SafekeeperId {
pub id: NodeId,
pub host: String,
/// We include here only the port used by computes -- that is, the tenant-only pg
/// protocol port, or the wide pg protocol port if the former is not configured.
pub pg_port: u16,
}
impl Display for SafekeeperId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port)
}
}
/// Set of safekeepers.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(transparent)]
pub struct MemberSet {
pub members: Vec<SafekeeperId>,
}
impl MemberSet {
pub fn empty() -> Self {
MemberSet {
members: Vec::new(),
}
}
pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {
let hs: HashSet<NodeId> = HashSet::from_iter(members.iter().map(|sk| sk.id));
if hs.len() != members.len() {
bail!("duplicate safekeeper id in the set {:?}", members);
}
Ok(MemberSet { members })
}
pub fn contains(&self, sk: &SafekeeperId) -> bool {
self.members.iter().any(|m| m.id == sk.id)
}
pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
if self.contains(&sk) {
bail!(format!(
"sk {} is already member of the set {}",
sk.id, self
));
}
self.members.push(sk);
Ok(())
}
}
impl Display for MemberSet {
/// Display as a comma separated list of members.
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let sks_str = self
.members
.iter()
.map(|m| m.to_string())
.collect::<Vec<_>>();
write!(f, "({})", sks_str.join(", "))
}
}
/// Safekeeper membership configuration.
/// Note: it is a part of both control file and http API.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Configuration {
/// Unique id.
pub generation: Generation,
/// Current members of the configuration.
pub members: MemberSet,
/// Some means it is a joint conf.
pub new_members: Option<MemberSet>,
}
impl Configuration {
/// Used for pre-generations timelines, will be removed eventually.
pub fn empty() -> Self {
Configuration {
generation: INVALID_GENERATION,
members: MemberSet::empty(),
new_members: None,
}
}
}
impl Display for Configuration {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"gen={}, members={}, new_members={}",
self.generation,
self.members,
self.new_members
.as_ref()
.map(ToString::to_string)
.unwrap_or(String::from("none"))
)
}
}
#[cfg(test)]
mod tests {
use super::{MemberSet, SafekeeperId};
use utils::id::NodeId;
#[test]
fn test_member_set() {
let mut members = MemberSet::empty();
members
.add(SafekeeperId {
id: NodeId(42),
host: String::from("lala.org"),
pg_port: 5432,
})
.unwrap();
members
.add(SafekeeperId {
id: NodeId(42),
host: String::from("lala.org"),
pg_port: 5432,
})
.expect_err("duplicate must not be allowed");
members
.add(SafekeeperId {
id: NodeId(43),
host: String::from("bubu.org"),
pg_port: 5432,
})
.unwrap();
println!("members: {}", members);
let j = serde_json::to_string(&members).expect("failed to serialize");
println!("members json: {}", j);
assert_eq!(
j,
r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"#
);
}
}

View File

@@ -1,5 +1,6 @@
//! Types used in safekeeper http API. Many of them are also reused internally.
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::TimestampTz;
use serde::{Deserialize, Serialize};
use std::net::SocketAddr;
@@ -11,7 +12,7 @@ use utils::{
pageserver_feedback::PageserverFeedback,
};
use crate::{ServerInfo, Term};
use crate::{membership::Configuration, ServerInfo, Term};
#[derive(Debug, Serialize)]
pub struct SafekeeperStatus {
@@ -22,13 +23,16 @@ pub struct SafekeeperStatus {
pub struct TimelineCreateRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub peer_ids: Option<Vec<NodeId>>,
pub mconf: Configuration,
pub pg_version: u32,
pub system_id: Option<u64>,
// By default WAL_SEGMENT_SIZE
pub wal_seg_size: Option<u32>,
pub commit_lsn: Lsn,
// If not passed, it is assigned to the beginning of commit_lsn segment.
pub local_start_lsn: Option<Lsn>,
pub start_lsn: Lsn,
// Normal creation should omit this field (start_lsn initializes all LSNs).
// However, we allow specifying a custom value higher than start_lsn for
// the manual recovery case; see test_s3_wal_replay.
pub commit_lsn: Option<Lsn>,
}
/// Same as TermLsn, but serializes LSN using display serializer
@@ -143,7 +147,13 @@ pub type ConnectionId = u32;
/// Serialize is used only for json'ing in API response. Also used internally.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalSenderState {
pub enum WalSenderState {
Vanilla(VanillaWalSenderState),
Interpreted(InterpretedWalSenderState),
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VanillaWalSenderState {
pub ttid: TenantTimelineId,
pub addr: SocketAddr,
pub conn_id: ConnectionId,
@@ -152,6 +162,17 @@ pub struct WalSenderState {
pub feedback: ReplicationFeedback,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterpretedWalSenderState {
pub ttid: TenantTimelineId,
pub shard: ShardIdentity,
pub addr: SocketAddr,
pub conn_id: ConnectionId,
// postgres application_name
pub appname: Option<String>,
pub feedback: ReplicationFeedback,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalReceiverState {
/// None means it is recovery initiated by us (this safekeeper).
@@ -172,6 +193,7 @@ pub enum WalReceiverStatus {
pub struct TimelineStatus {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub mconf: Configuration,
pub acceptor_state: AcceptorStateStatus,
pub pg_info: ServerInfo,
pub flush_lsn: Lsn,
@@ -186,6 +208,20 @@ pub struct TimelineStatus {
pub walreceivers: Vec<WalReceiverState>,
}
/// Request to switch membership configuration.
#[derive(Serialize, Deserialize)]
#[serde(transparent)]
pub struct TimelineMembershipSwitchRequest {
pub mconf: Configuration,
}
/// In response both previous and current configuration are sent.
#[derive(Serialize, Deserialize)]
pub struct TimelineMembershipSwitchResponse {
pub previous_conf: Configuration,
pub current_conf: Configuration,
}
fn lsn_invalid() -> Lsn {
Lsn::INVALID
}
@@ -241,3 +277,8 @@ pub struct TimelineTermBumpResponse {
pub previous_term: u64,
pub current_term: u64,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SafekeeperUtilization {
pub timeline_count: u64,
}

View File

@@ -38,7 +38,6 @@ pub mod http;
use opentelemetry::trace::TracerProvider;
use opentelemetry::KeyValue;
use opentelemetry_sdk::Resource;
use tracing::Subscriber;
use tracing_subscriber::registry::LookupSpan;
use tracing_subscriber::Layer;
@@ -121,7 +120,10 @@ where
S: Subscriber + for<'span> LookupSpan<'span>,
{
// Sets up exporter from the OTEL_EXPORTER_* environment variables.
let exporter = opentelemetry_otlp::new_exporter().http();
let exporter = opentelemetry_otlp::SpanExporter::builder()
.with_http()
.build()
.expect("could not initialize opentelemetry exporter");
// TODO: opentelemetry::global::set_error_handler() with custom handler that
// bypasses default tracing layers, but logs regular looking log
@@ -132,17 +134,13 @@ where
opentelemetry_sdk::propagation::TraceContextPropagator::new(),
);
let tracer = opentelemetry_otlp::new_pipeline()
.tracing()
.with_exporter(exporter)
.with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource(
Resource::new(vec![KeyValue::new(
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
service_name,
)]),
))
.install_batch(opentelemetry_sdk::runtime::Tokio)
.expect("could not initialize opentelemetry exporter")
let tracer = opentelemetry_sdk::trace::TracerProvider::builder()
.with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio)
.with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new(
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
service_name,
)]))
.build()
.tracer("global");
tracing_opentelemetry::layer().with_tracer(tracer)

View File

@@ -26,6 +26,7 @@ git-version.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper0 = { workspace = true, features = ["full"] }
inferno.workspace = true
itertools.workspace = true
fail.workspace = true
futures = { workspace = true }

View File

@@ -2,6 +2,7 @@
use std::{fmt::Display, str::FromStr};
/// For types `V` that implement [`FromStr`].
pub fn var<V, E>(varname: &str) -> Option<V>
where
V: FromStr<Err = E>,
@@ -10,7 +11,9 @@ where
match std::env::var(varname) {
Ok(s) => Some(
s.parse()
.map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
.map_err(|e| {
format!("failed to parse env var {varname} using FromStr::parse: {e:#}")
})
.unwrap(),
),
Err(std::env::VarError::NotPresent) => None,
@@ -19,3 +22,24 @@ where
}
}
}
/// For types `V` that implement [`serde::de::DeserializeOwned`].
pub fn var_serde_json_string<V>(varname: &str) -> Option<V>
where
V: serde::de::DeserializeOwned,
{
match std::env::var(varname) {
Ok(s) => Some({
let value = serde_json::Value::String(s);
serde_json::from_value(value)
.map_err(|e| {
format!("failed to parse env var {varname} as a serde_json json string: {e:#}")
})
.unwrap()
}),
Err(std::env::VarError::NotPresent) => None,
Err(std::env::VarError::NotUnicode(_)) => {
panic!("env var {varname} is not unicode")
}
}
}

View File

@@ -11,31 +11,55 @@ use tracing::*;
/// Declare a failpoint that can be used with the `pause` failpoint action.
/// We don't want to block the executor thread, hence, spawn_blocking + await.
///
/// Optionally pass a cancellation token, and this failpoint will drop out of
/// its pause when the cancellation token fires. This is useful for testing
/// cases where we would like to block something, but test its clean shutdown behavior.
/// The macro evaluates to a Result in that case, where Ok(()) is the case
/// where the failpoint was not paused, and Err() is the case where the cancellation
/// token fired while evaluating the failpoint.
///
/// Remember to unpause the failpoint in the test; until that happens, one of the
/// limited number of spawn_blocking thread pool threads is leaked.
#[macro_export]
macro_rules! pausable_failpoint {
($name:literal) => {
($name:literal) => {{
if cfg!(feature = "testing") {
tokio::task::spawn_blocking({
let current = tracing::Span::current();
let cancel = ::tokio_util::sync::CancellationToken::new();
let _ = $crate::pausable_failpoint!($name, &cancel);
}
}};
($name:literal, $cancel:expr) => {{
if cfg!(feature = "testing") {
let failpoint_fut = ::tokio::task::spawn_blocking({
let current = ::tracing::Span::current();
move || {
let _entered = current.entered();
tracing::info!("at failpoint {}", $name);
fail::fail_point!($name);
::tracing::info!("at failpoint {}", $name);
::fail::fail_point!($name);
}
});
let cancel_fut = async move {
$cancel.cancelled().await;
};
::tokio::select! {
res = failpoint_fut => {
res.expect("spawn_blocking");
// continue with execution
Ok(())
},
_ = cancel_fut => {
Err(())
}
})
.await
.expect("spawn_blocking");
}
};
($name:literal, $cond:expr) => {
if cfg!(feature = "testing") {
if $cond {
pausable_failpoint!($name)
}
} else {
Ok(())
}
};
}};
}
pub use pausable_failpoint;
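// Editor's usage sketch (not part of the diff) for the new cancellable form; the
// failpoint name and the error handling shown are hypothetical:
// if pausable_failpoint!("example-pause-failpoint", &cancel).is_err() {
//     // The cancellation token fired while we were paused at the failpoint.
//     return Err(anyhow::anyhow!("shutting down"));
// }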
/// use with fail::cfg("$name", "return(2000)")
///
/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the


@@ -112,9 +112,9 @@ impl Serialize for Generation {
// We should never be asked to serialize a None. Structures
// that include an optional generation should convert None to an
// Option<Generation>::None
Err(serde::ser::Error::custom(
"Tried to serialize invalid generation ({self})",
))
Err(serde::ser::Error::custom(format!(
"Tried to serialize invalid generation ({self:?})"
)))
}
}
}


@@ -0,0 +1,54 @@
//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes
//! don't block reads.
use arc_swap::ArcSwap;
use std::sync::Arc;
use tokio::sync::TryLockError;
pub struct GuardArcSwap<T> {
inner: ArcSwap<T>,
guard: tokio::sync::Mutex<()>,
}
pub struct Guard<'a, T> {
_guard: tokio::sync::MutexGuard<'a, ()>,
inner: &'a ArcSwap<T>,
}
impl<T> GuardArcSwap<T> {
pub fn new(inner: T) -> Self {
Self {
inner: ArcSwap::new(Arc::new(inner)),
guard: tokio::sync::Mutex::new(()),
}
}
pub fn read(&self) -> Arc<T> {
self.inner.load_full()
}
pub async fn write_guard(&self) -> Guard<'_, T> {
Guard {
_guard: self.guard.lock().await,
inner: &self.inner,
}
}
pub fn try_write_guard(&self) -> Result<Guard<'_, T>, TryLockError> {
let guard = self.guard.try_lock()?;
Ok(Guard {
_guard: guard,
inner: &self.inner,
})
}
}
impl<T> Guard<'_, T> {
pub fn read(&self) -> Arc<T> {
self.inner.load_full()
}
pub fn write(&mut self, value: T) {
self.inner.store(Arc::new(value));
}
}
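A minimal usage sketch (the `Config` type and its field are assumptions): readers take cheap snapshots with `read()`, while writers serialize through the guard without ever blocking those readers.
struct Config {
    flush_delay_threshold: usize,
}

async fn bump_threshold(shared: &GuardArcSwap<Config>) {
    let mut guard = shared.write_guard().await; // only one writer at a time
    let current = guard.read();                 // snapshot taken under the guard
    guard.write(Config {
        flush_delay_threshold: current.flush_delay_threshold + 1,
    });
}

fn current_threshold(shared: &GuardArcSwap<Config>) -> usize {
    shared.read().flush_delay_threshold // lock-free read path
}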


@@ -15,7 +15,7 @@ use once_cell::sync::Lazy;
use regex::Regex;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tokio::sync::{mpsc, Mutex};
use tokio::sync::{mpsc, Mutex, Notify};
use tokio_stream::wrappers::ReceiverStream;
use tokio_util::io::ReaderStream;
use tracing::{debug, info, info_span, warn, Instrument};
@@ -350,33 +350,53 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
};
let seconds = match parse_query_param(&req, "seconds")? {
None => 5,
Some(seconds @ 1..=30) => seconds,
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-30 secs"))),
Some(seconds @ 1..=60) => seconds,
Some(_) => return Err(ApiError::BadRequest(anyhow!("duration must be 1-60 secs"))),
};
let frequency_hz = match parse_query_param(&req, "frequency")? {
None => 99,
Some(1001..) => return Err(ApiError::BadRequest(anyhow!("frequency must be <=1000 Hz"))),
Some(frequency) => frequency,
};
// Only allow one profiler at a time.
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
let _lock = PROFILE_LOCK
.try_lock()
.map_err(|_| ApiError::Conflict("profiler already running".into()))?;
let force: bool = parse_query_param(&req, "force")?.unwrap_or_default();
// Take the profile.
let report = tokio::task::spawn_blocking(move || {
static PROFILE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
static PROFILE_CANCEL: Lazy<Notify> = Lazy::new(Notify::new);
let report = {
// Only allow one profiler at a time. If force is true, cancel a running profile (e.g. a
// Grafana continuous profile). We use a try_lock() loop when cancelling instead of waiting
// for a lock(), to avoid races where the notify isn't currently awaited.
let _lock = loop {
match PROFILE_LOCK.try_lock() {
Ok(lock) => break lock,
Err(_) if force => PROFILE_CANCEL.notify_waiters(),
Err(_) => {
return Err(ApiError::Conflict(
"profiler already running (use ?force=true to cancel it)".into(),
))
}
}
tokio::time::sleep(Duration::from_millis(1)).await; // don't busy-wait
};
let guard = ProfilerGuardBuilder::default()
.frequency(frequency_hz)
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
.build()?;
std::thread::sleep(Duration::from_secs(seconds));
guard.report().build()
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(|pprof_err| ApiError::InternalServerError(pprof_err.into()))?;
.build()
.map_err(|err| ApiError::InternalServerError(err.into()))?;
tokio::select! {
_ = tokio::time::sleep(Duration::from_secs(seconds)) => {},
_ = PROFILE_CANCEL.notified() => {},
};
guard
.report()
.build()
.map_err(|err| ApiError::InternalServerError(err.into()))?
};
// Return the report in the requested format.
match format {
@@ -417,6 +437,7 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
enum Format {
Jemalloc,
Pprof,
Svg,
}
// Parameters.
@@ -424,9 +445,24 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
None => Format::Pprof,
Some("jemalloc") => Format::Jemalloc,
Some("pprof") => Format::Pprof,
Some("svg") => Format::Svg,
Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
};
// Functions and mappings to strip when symbolizing pprof profiles. If true,
// also remove child frames.
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
vec![
(Regex::new("^__rust").unwrap(), false),
(Regex::new("^_start$").unwrap(), false),
(Regex::new("^irallocx_prof").unwrap(), true),
(Regex::new("^prof_alloc_prep").unwrap(), true),
(Regex::new("^std::rt::lang_start").unwrap(), false),
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
]
});
const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
// Obtain profiler handle.
let mut prof_ctl = jemalloc_pprof::PROF_CTL
.as_ref()
@@ -464,24 +500,9 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
// Symbolize the profile.
// TODO: consider moving this upstream to jemalloc_pprof and avoiding the
// serialization roundtrip.
static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
// Functions to strip from profiles. If true, also remove child frames.
vec![
(Regex::new("^__rust").unwrap(), false),
(Regex::new("^_start$").unwrap(), false),
(Regex::new("^irallocx_prof").unwrap(), true),
(Regex::new("^prof_alloc_prep").unwrap(), true),
(Regex::new("^std::rt::lang_start").unwrap(), false),
(Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
]
});
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(
profile,
&["libc", "libgcc", "pthread", "vdso"],
&STRIP_FUNCTIONS,
);
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
pprof::encode(&profile)
})
.await
@@ -494,6 +515,27 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
.body(Body::from(data))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
Format::Svg => {
let body = tokio::task::spawn_blocking(move || {
let bytes = prof_ctl.dump_pprof()?;
let profile = pprof::decode(&bytes)?;
let profile = pprof::symbolize(profile)?;
let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
let mut opts = inferno::flamegraph::Options::default();
opts.title = "Heap inuse".to_string();
opts.count_name = "bytes".to_string();
pprof::flamegraph(profile, &mut opts)
})
.await
.map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
.map_err(ApiError::InternalServerError)?;
Response::builder()
.status(200)
.header(CONTENT_TYPE, "image/svg+xml")
.body(Body::from(body))
.map_err(|err| ApiError::InternalServerError(err.into()))
}
}
}
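A hedged client-side sketch of the new outputs; the handler above only defines the query parameters, so the paths, port, and use of `reqwest` here are assumptions.
async fn fetch_heap_flamegraph() -> anyhow::Result<Vec<u8>> {
    // format=svg selects the new inferno flamegraph rendering of the heap profile.
    let url = "http://127.0.0.1:9898/profile/heap?format=svg";
    let resp = reqwest::get(url).await?.error_for_status()?;
    // The CPU endpoint (path assumed) similarly accepts ?seconds=1..=60, ?frequency=<hz>,
    // and ?force=true to cancel a profile that is already running.
    Ok(resp.bytes().await?.to_vec())
}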


@@ -98,6 +98,8 @@ pub mod try_rcu;
pub mod pprof;
pub mod guard_arc_swap;
// Re-export used in macro. Avoids adding git-version as dep in target crates.
#[doc(hidden)]
pub use git_version;


@@ -260,7 +260,7 @@ impl FromStr for Lsn {
{
let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?;
Ok(Lsn((left_num as u64) << 32 | right_num as u64))
Ok(Lsn(((left_num as u64) << 32) | right_num as u64))
} else {
Err(LsnParseError)
}
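The added parentheses only make the grouping explicit; `<<` already binds tighter than `|` in Rust, so behavior is unchanged. A test-style sketch of the parse (assuming, as elsewhere in the crate, that `Lsn` is a newtype over a public `u64` and implements `Debug`/`PartialEq`):
use std::str::FromStr;

#[test]
fn lsn_parse_example() {
    // "16/B374D848" parses as (0x16 << 32) | 0xB374D848.
    let lsn = Lsn::from_str("16/B374D848").unwrap();
    assert_eq!(lsn, Lsn(0x16_B374D848));
}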


@@ -1,8 +1,9 @@
use anyhow::bail;
use flate2::write::{GzDecoder, GzEncoder};
use flate2::Compression;
use itertools::Itertools as _;
use once_cell::sync::Lazy;
use pprof::protos::{Function, Line, Message as _, Profile};
use pprof::protos::{Function, Line, Location, Message as _, Profile};
use regex::Regex;
use std::borrow::Cow;
@@ -188,3 +189,59 @@ pub fn strip_locations(
profile
}
/// Generates an SVG flamegraph from a symbolized pprof profile.
pub fn flamegraph(
profile: Profile,
opts: &mut inferno::flamegraph::Options,
) -> anyhow::Result<Vec<u8>> {
if profile.mapping.iter().any(|m| !m.has_functions) {
bail!("profile not symbolized");
}
// Index locations, functions, and strings.
let locations: HashMap<u64, Location> =
profile.location.into_iter().map(|l| (l.id, l)).collect();
let functions: HashMap<u64, Function> =
profile.function.into_iter().map(|f| (f.id, f)).collect();
let strings = profile.string_table;
// Resolve stacks as function names, and sum sample values per stack. Also reverse the stack,
// since inferno expects it bottom-up.
let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
for sample in profile.sample {
let mut stack = Vec::with_capacity(sample.location_id.len());
for location in sample.location_id.into_iter().rev() {
let Some(location) = locations.get(&location) else {
bail!("missing location {location}");
};
for line in location.line.iter().rev() {
let Some(function) = functions.get(&line.function_id) else {
bail!("missing function {}", line.function_id);
};
let Some(name) = strings.get(function.name as usize) else {
bail!("missing string {}", function.name);
};
stack.push(name.as_str());
}
}
let Some(&value) = sample.value.first() else {
bail!("missing value");
};
*stacks.entry(stack).or_default() += value;
}
// Construct stack lines for inferno.
let lines = stacks
.into_iter()
.map(|(stack, value)| (stack.into_iter().join(";"), value))
.map(|(stack, value)| format!("{stack} {value}"))
.sorted()
.collect_vec();
// Construct the flamegraph.
let mut bytes = Vec::new();
let lines = lines.iter().map(|line| line.as_str());
inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
Ok(bytes)
}
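A short sketch of composing the module's helpers directly, outside the HTTP handler (the byte source and the exact `decode` signature are assumptions; any raw pprof protobuf, e.g. a jemalloc `dump_pprof()` output, would do):
fn render_svg(pprof_bytes: &[u8]) -> anyhow::Result<Vec<u8>> {
    let profile = decode(pprof_bytes)?;
    let profile = symbolize(profile)?;
    let mut opts = inferno::flamegraph::Options::default();
    opts.title = "Heap inuse".to_string();
    opts.count_name = "bytes".to_string();
    flamegraph(profile, &mut opts)
}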


@@ -64,6 +64,12 @@ pub struct GateGuard {
gate: Arc<GateInner>,
}
impl GateGuard {
pub fn try_clone(&self) -> Result<Self, GateError> {
Gate::enter_impl(self.gate.clone())
}
}
impl Drop for GateGuard {
fn drop(&mut self) {
if self.gate.closing.load(Ordering::Relaxed) {
@@ -107,11 +113,11 @@ impl Gate {
/// to avoid blocking close() indefinitely: typically types that contain a Gate will
/// also contain a CancellationToken.
pub fn enter(&self) -> Result<GateGuard, GateError> {
let permit = self
.inner
.sem
.try_acquire()
.map_err(|_| GateError::GateClosed)?;
Self::enter_impl(self.inner.clone())
}
fn enter_impl(gate: Arc<GateInner>) -> Result<GateGuard, GateError> {
let permit = gate.sem.try_acquire().map_err(|_| GateError::GateClosed)?;
// we now have the permit, let's disable the normal raii functionality and leave
// "returning" the permit to our GateGuard::drop.
@@ -122,7 +128,7 @@ impl Gate {
Ok(GateGuard {
span_at_enter: tracing::Span::current(),
gate: self.inner.clone(),
gate,
})
}
@@ -252,4 +258,39 @@ mod tests {
// Attempting to enter() is still forbidden
gate.enter().expect_err("enter should fail finishing close");
}
#[tokio::test(start_paused = true)]
async fn clone_gate_guard() {
let gate = Gate::default();
let forever = Duration::from_secs(24 * 7 * 365);
let guard1 = gate.enter().expect("gate isn't closed");
let guard2 = guard1.try_clone().expect("gate isn't closed");
let mut close_fut = std::pin::pin!(gate.close());
tokio::time::timeout(forever, &mut close_fut)
.await
.unwrap_err();
// we polled close_fut once, that should prevent all later enters and clones
gate.enter().unwrap_err();
guard1.try_clone().unwrap_err();
guard2.try_clone().unwrap_err();
// guard2 keeps gate open even if guard1 is closed
drop(guard1);
tokio::time::timeout(forever, &mut close_fut)
.await
.unwrap_err();
drop(guard2);
// now that the last guard is dropped, closing should complete
close_fut.await;
// entering is still forbidden
gate.enter().expect_err("enter should still fail");
}
}
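An illustrative sketch of why a cloneable guard helps (the function and its setup are assumptions): a caller hands an independent guard to a spawned task so `close()` waits for both of them.
fn spawn_background_work(gate: &Gate) -> Result<(), GateError> {
    let guard = gate.enter()?;
    // try_clone() fails once close() has started, so late spawns are refused.
    let task_guard = guard.try_clone()?;
    tokio::spawn(async move {
        let _held = task_guard; // keeps the gate open for the task's lifetime
        // ... background work ...
    });
    drop(guard); // the task's cloned guard still holds the gate open
    Ok(())
}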


@@ -96,7 +96,11 @@ impl<T: Send> Sender<T> {
}
}
State::SenderWaitsForReceiverToConsume(_data) => {
// Really, we shouldn't be polled until receiver has consumed and wakes us.
// SAFETY: send is single threaded due to `&mut self` requirement,
// therefore register is not concurrent.
unsafe {
self.state.wake_sender.register(cx.waker());
}
Poll::Pending
}
State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)),
@@ -449,4 +453,38 @@ mod tests {
let err = recv_task.await.unwrap().expect_err("should error");
assert!(matches!(err, RecvError::SenderGone));
}
#[tokio::test(start_paused = true)]
async fn test_receiver_drop_while_waiting_for_receiver_to_consume_unblocks_sender() {
let (mut sender, receiver) = channel();
let state = receiver.state.clone();
sender.send((), |_, _| unreachable!()).await.unwrap();
assert!(matches!(&*state.value.lock().unwrap(), &State::HasData(_)));
let unmergeable = sender.send((), |_, _| Err(()));
let mut unmergeable = std::pin::pin!(unmergeable);
tokio::select! {
_ = tokio::time::sleep(FOREVER) => {},
_ = &mut unmergeable => {
panic!("unmergeable should not complete");
},
}
assert!(matches!(
&*state.value.lock().unwrap(),
&State::SenderWaitsForReceiverToConsume(_)
));
drop(receiver);
assert!(matches!(
&*state.value.lock().unwrap(),
&State::ReceiverGone
));
unmergeable.await.unwrap_err();
}
}

Some files were not shown because too many files have changed in this diff.