Documentation and tweaks

Add stream pool
Add initial client pool
2026-01-22 21:02:56 +00:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00
431 changed files with 25659 additions and 5655 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -4,6 +4,7 @@
 !Cargo.lock
 !Cargo.toml
 !Makefile
+!postgres.mk
 !rust-toolchain.toml
 !scripts/ninstall.sh
 !docker-compose/run-tests.sh
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -38,6 +38,11 @@ on:
        required: false
        default: 1
        type: number
+      rerun-failed:
+        description: 'rerun failed tests to ignore flaky tests'
+        required: false
+        default: true
+        type: boolean

 defaults:
  run:
@@ -99,11 +104,10 @@ jobs:

      # Set some environment variables used by all the steps.
      #
-      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
-      #   It also includes --features, if any
+      # CARGO_FLAGS is extra options to pass to all "cargo" subcommands.
      #
-      # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
-      #   because "cargo metadata" doesn't accept --release or --debug options
+      # CARGO_PROFILE is passed to "cargo build", "cargo test" etc, but not to
+      #   "cargo metadata", because it doesn't accept --release or --debug options.
      #
      # We run tests with addtional features, that are turned off by default (e.g. in release builds), see
      # corresponding Cargo.toml files for their descriptions.
@@ -112,16 +116,16 @@ jobs:
          ARCH: ${{ inputs.arch }}
          SANITIZERS: ${{ inputs.sanitizers }}
        run: |
-          CARGO_FEATURES="--features testing"
+          CARGO_FLAGS="--locked --features testing"
          if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
-            CARGO_FLAGS="--locked"
+            CARGO_PROFILE=""
          elif [[ $BUILD_TYPE == "debug" ]]; then
            cov_prefix=""
-            CARGO_FLAGS="--locked"
+            CARGO_PROFILE=""
          elif [[ $BUILD_TYPE == "release" ]]; then
            cov_prefix=""
-            CARGO_FLAGS="--locked --release"
+            CARGO_PROFILE="--release"
          fi
          if [[ $SANITIZERS == 'enabled' ]]; then
            make_vars="WITH_SANITIZERS=yes"
@@ -131,8 +135,8 @@ jobs:
          {
            echo "cov_prefix=${cov_prefix}"
            echo "make_vars=${make_vars}"
-            echo "CARGO_FEATURES=${CARGO_FEATURES}"
            echo "CARGO_FLAGS=${CARGO_FLAGS}"
+            echo "CARGO_PROFILE=${CARGO_PROFILE}"
            echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
          } >> $GITHUB_ENV

@@ -184,34 +188,18 @@ jobs:
          path: pg_install/v17
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

-      - name: Build postgres v14
-        if: steps.cache_pg_14.outputs.cache-hit != 'true'
-        run: mold -run make ${make_vars} postgres-v14 -j$(nproc)
-
-      - name: Build postgres v15
-        if: steps.cache_pg_15.outputs.cache-hit != 'true'
-        run: mold -run make ${make_vars} postgres-v15 -j$(nproc)
-
-      - name: Build postgres v16
-        if: steps.cache_pg_16.outputs.cache-hit != 'true'
-        run: mold -run make ${make_vars} postgres-v16 -j$(nproc)
-
-      - name: Build postgres v17
-        if: steps.cache_pg_17.outputs.cache-hit != 'true'
-        run: mold -run make ${make_vars} postgres-v17 -j$(nproc)
-
-      - name: Build neon extensions
-        run: mold -run make ${make_vars} neon-pg-ext -j$(nproc)
+      - name: Build all
+        # Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables
+        run: mold -run make ${make_vars} all -j$(nproc) CARGO_BUILD_FLAGS="$CARGO_FLAGS"

      - name: Build walproposer-lib
        run: mold -run make ${make_vars} walproposer-lib -j$(nproc)

-      - name: Run cargo build
-        env:
-          WITH_TESTS: ${{ inputs.sanitizers != 'enabled' && '--tests' || '' }}
+      - name: Build unit tests
+        if: inputs.sanitizers != 'enabled'
        run: |
          export ASAN_OPTIONS=detect_leaks=0
-          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins ${WITH_TESTS}
+          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_PROFILE --tests

      # Do install *before* running rust tests because they might recompile the
      # binaries with different features/flags.
@@ -223,7 +211,7 @@ jobs:
          # Install target binaries
          mkdir -p /tmp/neon/bin/
          binaries=$(
-            ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
+            ${cov_prefix} cargo metadata $CARGO_FLAGS --format-version=1 --no-deps |
            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
          )
          for bin in $binaries; do
@@ -240,7 +228,7 @@ jobs:
            mkdir -p /tmp/neon/test_bin/

            test_exe_paths=$(
-              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
+              ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_PROFILE --message-format=json --no-run |
              jq -r '.executable | select(. != null)'
            )
            for bin in $test_exe_paths; do
@@ -274,10 +262,10 @@ jobs:
          export LD_LIBRARY_PATH

          #nextest does not yet support running doctests
-          ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
+          ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_PROFILE

          # run all non-pageserver tests
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E '!package(pageserver)'

          # run pageserver tests
          # (When developing new pageserver features gated by config fields, we commonly make the rust
@@ -286,13 +274,13 @@ jobs:
          # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.)
          NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring  \
          ${cov_prefix} \
-          cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
+          cargo nextest run $CARGO_FLAGS $CARGO_PROFILE  -E 'package(pageserver)'

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
          export REMOTE_STORAGE_S3_REGION=eu-central-1
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(remote_storage)' -E 'test(test_real_s3)'

          # Run separate tests for real Azure Blob Storage
          # XXX: replace region with `eu-central-1`-like region
@@ -301,17 +289,17 @@ jobs:
          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_PROFILE -E 'package(remote_storage)' -E 'test(test_real_azure)'

      - name: Install postgres binaries
        run: |
          # Use tar to copy files matching the pattern, preserving the paths in the destionation
          tar c \
            pg_install/v* \
-            pg_install/build/*/src/test/regress/*.so \
-            pg_install/build/*/src/test/regress/pg_regress \
-            pg_install/build/*/src/test/isolation/isolationtester \
-            pg_install/build/*/src/test/isolation/pg_isolation_regress \
+            build/*/src/test/regress/*.so \
+            build/*/src/test/regress/pg_regress \
+            build/*/src/test/isolation/isolationtester \
+            build/*/src/test/isolation/pg_isolation_regress \
            | tar  x -C /tmp/neon

      - name: Upload Neon artifact
@@ -379,7 +367,7 @@ jobs:
      - name: Pytest regression tests
        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
        uses: ./.github/actions/run-python-test-set
-        timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }}
+        timeout-minutes: ${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 75 || 180 }}
        with:
          build_type: ${{ inputs.build-type }}
          test_selection: regress
@@ -387,14 +375,14 @@ jobs:
          run_with_real_s3: true
          real_s3_bucket: neon-github-ci-tests
          real_s3_region: eu-central-1
-          rerun_failed: ${{ inputs.test-run-count == 1 }}
+          rerun_failed: ${{ inputs.rerun-failed }}
          pg_version: ${{ matrix.pg_version }}
          sanitizers: ${{ inputs.sanitizers }}
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
          # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
          # Attempt to stop tests gracefully to generate test reports
          # until they are forcibly stopped by the stricter `timeout-minutes` limit.
-          extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} --count=${{ inputs.test-run-count }}
+          extra_params: --session-timeout=${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 3000 || 10200 }} --count=${{ inputs.test-run-count }}
                        ${{ inputs.test-selection != '' && format('-k "{0}"', inputs.test-selection) || '' }}
        env:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
--- a/.github/workflows/build-macos.yml
+++ b/.github/workflows/build-macos.yml
@@ -94,11 +94,6 @@ jobs:
        run: |
          make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu)

-      - name: Get postgres headers ${{ matrix.postgres-version }}
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
-
      - name: Upload "pg_install/${{ matrix.postgres-version }}" artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
@@ -110,7 +105,7 @@ jobs:

  build-walproposer-lib:
    if: |
-      inputs.pg_versions != '[]' || inputs.rebuild_everything ||
+      contains(inputs.pg_versions, 'v17') || inputs.rebuild_everything ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
      github.ref_name == 'main'
@@ -140,11 +135,17 @@ jobs:
          name: pg_install--v17
          path: pg_install/v17

+      # `actions/download-artifact` doesn't preserve permissions:
+      # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
+      - name: Make pg_install/v*/bin/* executable
+        run: |
+          chmod +x pg_install/v*/bin/*
+
      - name: Cache walproposer-lib
        id: cache_walproposer_lib
        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
-          path: pg_install/build/walproposer-lib
+          path: build/walproposer-lib
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Checkout submodule vendor/postgres-v17
@@ -167,13 +168,13 @@ jobs:
      - name: Build walproposer-lib (only for v17)
        if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
        run:
-          make walproposer-lib -j$(sysctl -n hw.ncpu)
+          make walproposer-lib -j$(sysctl -n hw.ncpu) PG_INSTALL_CACHED=1

-      - name: Upload "pg_install/build/walproposer-lib" artifact
+      - name: Upload "build/walproposer-lib" artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
-          name: pg_install--build--walproposer-lib
-          path: pg_install/build/walproposer-lib
+          name: build--walproposer-lib
+          path: build/walproposer-lib
          # The artifact is supposed to be used by the next job in the same workflow,
          # so there’s no need to store it for too long.
          retention-days: 1
@@ -226,11 +227,11 @@ jobs:
          name: pg_install--v17
          path: pg_install/v17

-      - name: Download "pg_install/build/walproposer-lib" artifact
+      - name: Download "build/walproposer-lib" artifact
        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
-          name: pg_install--build--walproposer-lib
-          path: pg_install/build/walproposer-lib
+          name: build--walproposer-lib
+          path: build/walproposer-lib

      # `actions/download-artifact` doesn't preserve permissions:
      # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
--- a/.github/workflows/build_and_run_selected_test.yml
+++ b/.github/workflows/build_and_run_selected_test.yml
@@ -58,6 +58,7 @@ jobs:
      test-cfg: ${{ inputs.pg-versions }}
      test-selection: ${{ inputs.test-selection }}
      test-run-count: ${{ fromJson(inputs.run-count) }}
+      rerun-failed: false
    secrets: inherit

  create-test-report:
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -69,7 +69,7 @@ jobs:
          submodules: true

      - name: Check for file changes
-        uses: step-security/paths-filter@v3
+        uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36  # v3.0.2
        id: files-changed
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
@@ -199,6 +199,28 @@ jobs:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
    secrets: inherit

+  validate-compute-manifest:
+    runs-on: ubuntu-22.04
+    needs: [ meta, check-permissions ]
+    # We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Set up Node.js
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        with:
+          node-version: '24'
+
+      - name: Validate manifest against schema
+        run: |
+          make -C compute manifest-schema-validation
+
  build-and-test-locally:
    needs: [ meta, build-build-tools-image ]
    # We do need to run this in `.*-rc-pr` because of hotfixes.
@@ -648,7 +670,7 @@ jobs:
                                             ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64

  compute-node-image-arch:
-    needs: [ check-permissions, build-build-tools-image, meta ]
+    needs: [ check-permissions, meta ]
    if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    permissions:
      id-token: write # aws-actions/configure-aws-credentials
@@ -721,7 +743,6 @@ jobs:
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            PG_VERSION=${{ matrix.version.pg }}
            BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
            DEBIAN_VERSION=${{ matrix.version.debian }}
          provenance: false
          push: true
@@ -741,7 +762,6 @@ jobs:
            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
            PG_VERSION=${{ matrix.version.pg }}
            BUILD_TAG=${{ needs.meta.outputs.release-tag || needs.meta.outputs.build-tag }}
-            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }}
            DEBIAN_VERSION=${{ matrix.version.debian }}
          provenance: false
          push: true
--- a/.github/workflows/build_and_test_fully.yml
+++ b/.github/workflows/build_and_test_fully.yml
@@ -0,0 +1,151 @@
+name: Build and Test Fully
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:   '0 3 * * *' # run once a day, timezone is utc
+  workflow_dispatch:
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+  COPT: '-Werror'
+
+jobs:
+  tag:
+    runs-on: [ self-hosted, small ]
+    container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned
+    outputs:
+      build-tag: ${{steps.build-tag.outputs.tag}}
+
+    steps:
+      # Need `fetch-depth: 0` to count the number of commits in the branch
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+
+      - name: Get build tag
+        run: |
+          echo run:$GITHUB_RUN_ID
+          echo ref:$GITHUB_REF_NAME
+          echo rev:$(git rev-list --count HEAD)
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
+            echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
+            echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
+          fi
+        shell: bash
+        id: build-tag
+
+  build-build-tools-image:
+    uses: ./.github/workflows/build-build-tools-image.yml
+    secrets: inherit
+
+  build-and-test-locally:  # runs the reusable local build/test workflow for every arch x build-type pair
+    needs: [ tag, build-build-tools-image ]
+    strategy:
+      fail-fast: false  # allow other matrix combinations to continue even if one fails
+      matrix:
+        arch: [ x64, arm64 ]
+        build-type: [ debug, release ]
+    uses: ./.github/workflows/_build-and-test-locally.yml
+    with:  # test-cfg below lists each pg_version once per lfc_state (with-lfc / without-lfc)
+      arch: ${{ matrix.arch }}
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      build-tag: ${{ needs.tag.outputs.build-tag }}
+      build-type: ${{ matrix.build-type }}
+      rerun-failed: false  # do not rerun failed tests, so flaky tests are not ignored
+      test-cfg: '[{"pg_version":"v14", "lfc_state": "with-lfc"},
+                  {"pg_version":"v15", "lfc_state": "with-lfc"},
+                  {"pg_version":"v16", "lfc_state": "with-lfc"},
+                  {"pg_version":"v17", "lfc_state": "with-lfc"},
+                  {"pg_version":"v14", "lfc_state": "without-lfc"},
+                  {"pg_version":"v15", "lfc_state": "without-lfc"},
+                  {"pg_version":"v16", "lfc_state": "without-lfc"},
+                  {"pg_version":"v17", "lfc_state": "without-lfc"}]'
+    secrets: inherit
+
+
+  create-test-report:
+    needs: [ build-and-test-locally, build-build-tools-image ]
+    if: ${{ !cancelled() }}
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
+    outputs:
+      report-url: ${{ steps.create-allure-report.outputs.report-url }}
+
+    runs-on: [ self-hosted, small ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }}
+        id: create-allure-report
+        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
+      - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        if: ${{ !cancelled() }}
+        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
+          script: |
+            const report = {
+              reportUrl:     "${{ steps.create-allure-report.outputs.report-url }}",
+              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
+            }
+
+            const coverage = {}
+
+            const script = require("./scripts/comment-test-report.js")
+            await script({
+              github,
+              context,
+              fetch,
+              report,
+              coverage,
+            })
--- a/.github/workflows/build_and_test_with_sanitizers.yml
+++ b/.github/workflows/build_and_test_with_sanitizers.yml
@@ -79,6 +79,7 @@ jobs:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
+      rerun-failed: false
      test-cfg: '[{"pg_version":"v17"}]'
      sanitizers: enabled
    secrets: inherit
--- a/.github/workflows/large_oltp_benchmark.yml
+++ b/.github/workflows/large_oltp_benchmark.yml
@@ -33,11 +33,19 @@ jobs:
      fail-fast: false # allow other variants to continue even if one fails
      matrix:
        include:
+          # test only read-only custom scripts in new branch without database maintenance
+          - target: new_branch
+            custom_scripts: select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3
+            test_maintenance: false
+          # test all custom scripts in new branch with database maintenance
          - target: new_branch
            custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
+            test_maintenance: true
+          # test all custom scripts in reuse branch with database maintenance
          - target: reuse_branch
            custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
-      max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
+            test_maintenance: true
+      max-parallel: 1 # we want to run each benchmark sequentially to not have noisy neighbors on shared storage (PS, SK)
    permissions:
      contents: write
      statuses: write
@@ -145,6 +153,7 @@ jobs:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

    - name: Benchmark database maintenance
+      if: ${{ matrix.test_maintenance }}
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
--- a/.github/workflows/large_oltp_growth.yml
+++ b/.github/workflows/large_oltp_growth.yml
@@ -0,0 +1,175 @@
+name: large oltp growth
+# workflow to grow the reuse branch of large oltp benchmark continuously (about 16 GB per run)
+
+on:
+  # uncomment to run on push for debugging your PR
+  # push:
+  #  branches: [ bodobolero/increase_large_oltp_workload ]
+
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #        ┌───────────── minute (0 - 59)
+    #        │ ┌───────────── hour (0 - 23)
+    #        │ │  ┌───────────── day of the month (1 - 31)
+    #        │ │  │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #        │ │  │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron: '0 6 * * *'   # 06:00 UTC
+    - cron: '0 8 * * *'   # 08:00 UTC
+    - cron: '0 10 * * *'  # 10:00 UTC
+    - cron: '0 12 * * *'  # 12:00 UTC
+    - cron: '0 14 * * *'  # 14:00 UTC
+    - cron: '0 16 * * *'  # 16:00 UTC
+  workflow_dispatch: # adds ability to run this manually
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow globally because we need dedicated resources which only exist once
+  group: large-oltp-growth
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  oltp:
+    strategy:
+      fail-fast: false # allow other variants to continue even if one fails
+      matrix:
+        include:
+          # for now only grow the reuse branch, not the other branches.
+          - target: reuse_branch
+            custom_scripts:
+            - grow_action_blocks.sql
+            - grow_action_kwargs.sql
+            - grow_device_fingerprint_event.sql
+            - grow_edges.sql
+            - grow_hotel_rate_mapping.sql
+            - grow_ocr_pipeline_results_version.sql
+            - grow_priceline_raw_response.sql
+            - grow_relabled_transactions.sql
+            - grow_state_values.sql
+            - grow_values.sql
+            - grow_vertices.sql
+            - update_accounting_coding_body_tracking_category_selection.sql
+            - update_action_blocks.sql
+            - update_action_kwargs.sql
+            - update_denormalized_approval_workflow.sql
+            - update_device_fingerprint_event.sql
+            - update_edges.sql
+            - update_heron_transaction_enriched_log.sql
+            - update_heron_transaction_enrichment_requests.sql
+            - update_hotel_rate_mapping.sql
+            - update_incoming_webhooks.sql
+            - update_manual_transaction.sql
+            - update_ml_receipt_matching_log.sql
+            - update_ocr_pipeine_results_version.sql
+            - update_orc_pipeline_step_results.sql
+            - update_orc_pipeline_step_results_version.sql
+            - update_priceline_raw_response.sql
+            - update_quickbooks_transactions.sql
+            - update_raw_finicity_transaction.sql
+            - update_relabeled_transactions.sql
+            - update_state_values.sql
+            - update_stripe_authorization_event_log.sql
+            - update_transaction.sql
+            - update_values.sql
+            - update_vertices.sql
+      max-parallel: 1 # we want to run each growth workload sequentially (for now there is just one)
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
+    env:
+      TEST_PG_BENCH_DURATIONS_MATRIX: "1h"
+      TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ join(matrix.custom_scripts, ' ') }}
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      PG_VERSION: 16 # pre-determined by pre-determined project
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      PLATFORM: ${{ matrix.target }}
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+
+    steps:
+    - name: Harden the runner (Audit all outbound calls)
+      uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+      with:
+        egress-policy: audit
+
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+    - name: Configure AWS credentials # necessary to download artefacts
+      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+    - name: Set up Connection String
+      id: set-up-connstr
+      run: |
+        case "${{ matrix.target }}" in
+          reuse_branch)
+          CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
+          ;;
+          *)
+          echo >&2 "Unknown target=${{ matrix.target }}"
+          exit 1
+          ;;
+        esac
+
+        CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}"
+
+        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+        echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT
+
+    - name: pgbench with custom-scripts
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: true
+        extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_growth
+        pg_version: ${{ env.PG_VERSION }}
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Create Allure report
+      id: create-allure-report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+      with:
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1
+      with:
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
+        slack-message: |
+          Periodic large oltp tenant growth increase: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -53,7 +53,7 @@ jobs:
          submodules: true

      - name: Check for Postgres changes
-        uses: step-security/paths-filter@v3
+        uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242  #v3
        id: files_changed
        with:
          token: ${{ github.token }}
--- a/.github/workflows/pre-merge-checks.yml
+++ b/.github/workflows/pre-merge-checks.yml
@@ -34,7 +34,7 @@ jobs:

      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
+      - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        id: python-src
        with:
          files: |
@@ -45,7 +45,7 @@ jobs:
            poetry.lock
            pyproject.toml

-      - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
+      - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        id: rust-src
        with:
          files: |
--- a/.github/workflows/proxy-benchmark.yml
+++ b/.github/workflows/proxy-benchmark.yml
@@ -0,0 +1,84 @@
+name: Periodic proxy performance test on unit-perf hetzner runner
+
+on:
+  push: # TODO: remove after testing
+    branches:
+      - test-proxy-bench # Exact branch name only; matching a prefix would need 'test-proxy-bench*'
+  # schedule:
+    # * is a special character in YAML so you have to quote this string
+    #        ┌───────────── minute (0 - 59)
+    #        │ ┌───────────── hour (0 - 23)
+    #        │ │ ┌───────────── day of the month (1 - 31)
+    #        │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #        │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    # - cron: '0 5 * * *' # Runs at 5 UTC once a day
+  workflow_dispatch: # adds an ability to run this manually
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+concurrency:
+  group: ${{ github.workflow }}
+  cancel-in-progress: false
+
+permissions:
+  contents: read
+
+jobs:
+  run_periodic_proxybench_test:
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
+    runs-on: [self-hosted, unit-perf]
+    timeout-minutes: 60  # 1h timeout
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+    steps:
+    - name: Checkout proxy-bench Repo
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      with:
+        repository: neondatabase/proxy-bench
+        path: proxy-bench
+
+    - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive
+      id: set-env
+      shell: bash -euxo pipefail {0}
+      run: |
+        PROXY_BENCH_PATH=$(realpath ./proxy-bench)
+        {
+          echo "PROXY_BENCH_PATH=$PROXY_BENCH_PATH"
+          echo "NEON_DIR=${RUNNER_TEMP}/neon"
+          echo "TEST_OUTPUT=${PROXY_BENCH_PATH}/test_output"
+          echo ""
+        } >> "$GITHUB_ENV"
+
+    - name: Run proxy-bench
+      run: ${PROXY_BENCH_PATH}/run.sh
+
+    - name: Ingest Bench Results # neon repo script; NOTE(review): no step in this workflow checks out neon into $NEON_DIR - confirm run.sh provides it
+      if: always()
+      run: |
+        mkdir -p "$TEST_OUTPUT"
+        python "$NEON_DIR/scripts/proxy_bench_results_ingest.py" --out "$TEST_OUTPUT"
+
+    - name: Push Metrics to Proxy perf database
+      if: always()
+      env:
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}"
+        REPORT_FROM: "${{ env.TEST_OUTPUT }}" # env values are not shell-expanded, so read TEST_OUTPUT via the env context
+      run: $NEON_DIR/scripts/generate_and_push_perf_report.sh
+
+    - name: Docker cleanup # NOTE(review): runs in the default working directory, but the only checkout is under proxy-bench/ - confirm a compose file is discoverable here
+      if: always()
+      run: docker compose down
+
+    - name: Notify Failure
+      if: failure()
+      run: echo "Proxy bench job failed" && exit 1
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 /artifact_cache
+/build
 /pg_install
 /target
 /tmp_check
@@ -13,6 +14,7 @@ neon.iml
 /.neon
 /integration_tests/.neon
 compaction-suite-results.*
+pgxn/neon/communicator/communicator_bindings.h

 # Coverage
 *.profraw
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,6 +8,7 @@ members = [
    "pageserver/compaction",
    "pageserver/ctl",
    "pageserver/client",
+    "pageserver/client_grpc",
    "pageserver/pagebench",
    "pageserver/page_api",
    "proxy",
@@ -22,6 +23,8 @@ members = [
    "libs/http-utils",
    "libs/pageserver_api",
    "libs/postgres_ffi",
+    "libs/postgres_ffi_types",
+    "libs/postgres_versioninfo",
    "libs/safekeeper_api",
    "libs/desim",
    "libs/neon-shmem",
@@ -32,6 +35,7 @@ members = [
    "libs/pq_proto",
    "libs/tenant_size_model",
    "libs/metrics",
+    "libs/neonart",
    "libs/postgres_connection",
    "libs/remote_storage",
    "libs/tracing-utils",
@@ -44,6 +48,7 @@ members = [
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
    "endpoint_storage",
+    "pgxn/neon/communicator",
 ]

 [workspace.package]
@@ -87,6 +92,7 @@ clap = { version = "4.0", features = ["derive", "env"] }
 clashmap = { version = "1.0", features = ["raw-api"] }
 comfy-table = "7.1"
 const_format = "0.2"
+crossbeam-utils = "0.8.21"
 crc32c = "0.6"
 diatomic-waker = { version = "0.2.3" }
 either = "1.8"
@@ -145,11 +151,13 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pem = "3.0.3"
+peekable = "0.3.0"
 pin-project-lite = "0.2"
 pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
 procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.13.5"
+prost-types = "0.13.5"
 rand = "0.8"
 redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
@@ -173,12 +181,14 @@ serde_json = "1"
 serde_path_to_error = "0.1"
 serde_with = { version = "3", features = [ "base64" ] }
 serde_assert = "0.5.0"
+serde_repr = "0.1.20"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 spki = "0.7.3"
+spin = "0.9.8"
 strum = "0.26"
 strum_macros = "0.26"
 "subtle"  = "2.5.0"
@@ -190,16 +200,15 @@ thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
 tokio = { version = "1.43.1", features = ["macros"] }
-tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
 tokio-stream = "0.1"
 tokio-tar = "0.3"
-tokio-util = { version = "0.7.10", features = ["io", "rt"] }
+tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
 toml = "0.8"
 toml_edit = "0.22"
-tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "prost", "router", "server", "tls-ring", "tls-native-roots"] }
+tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots", "zstd"] }
 tonic-reflection = { version = "0.13.1", features = ["server"] }
 tower = { version = "0.5.2", default-features = false }
 tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
@@ -232,6 +241,9 @@ x509-cert = { version = "0.2.5" }
 env_logger = "0.11"
 log = "0.4"

+tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
@@ -251,14 +263,19 @@ desim = { version = "0.1", path = "./libs/desim" }
 endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
 http-utils = { version = "0.1", path = "./libs/http-utils/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
+neonart = { version = "0.1", path = "./libs/neonart/" }
+neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
+pageserver_client_grpc = { path = "./pageserver/client_grpc" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
+postgres_ffi_types = { version = "0.1", path = "./libs/postgres_ffi_types/" }
+postgres_versioninfo = { version = "0.1", path = "./libs/postgres_versioninfo/" }
 postgres_initdb = { path = "./libs/postgres_initdb" }
 posthog_client_lite = { version = "0.1", path = "./libs/posthog_client_lite" }
 pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
@@ -278,6 +295,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
+cbindgen = "0.28.0"
 criterion = "0.5.1"
 rcgen = "0.13"
 rstest = "0.18"
--- a/10
+++ b/10
@@ -5,8 +5,6 @@
 ARG REPOSITORY=ghcr.io/neondatabase
 ARG IMAGE=build-tools
 ARG TAG=pinned
-ARG DEFAULT_PG_VERSION=17
-ARG STABLE_PG_VERSION=16
 ARG DEBIAN_VERSION=bookworm
 ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim

@@ -42,12 +40,12 @@ COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
 COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
 COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
+COPY --chown=nonroot postgres.mk postgres.mk
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh

 ENV BUILD_TYPE=release
 RUN set -e \
    && mold -run make -j $(nproc) -s neon-pg-ext \
-    && rm -rf pg_install/build \
    && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .

 # Prepare cargo-chef recipe
@@ -63,14 +61,11 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
 ARG BUILD_TAG
-ARG STABLE_PG_VERSION

 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
-COPY --from=pg-build /home/nonroot/pg_install/v16/lib                       pg_install/v16/lib
-COPY --from=pg-build /home/nonroot/pg_install/v17/lib                       pg_install/v17/lib
 COPY --from=plan     /home/nonroot/recipe.json                              recipe.json

 ARG ADDITIONAL_RUSTFLAGS=""
@@ -97,7 +92,6 @@ RUN set -e \
 # Build final image
 #
 FROM $BASE_IMAGE_SHA
-ARG DEFAULT_PG_VERSION
 WORKDIR /data

 RUN set -e \
@@ -107,8 +101,6 @@ RUN set -e \
        libreadline-dev \
        libseccomp-dev \
        ca-certificates \
-	# System postgres for use with client libraries (e.g. in storage controller)
-        postgresql-15 \
        openssl \
        unzip \
        curl \
--- a/256
+++ b/256
@@ -1,8 +1,21 @@
 ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

-# Where to install Postgres, default is ./pg_install, maybe useful for package managers
+# Where to install Postgres, default is ./pg_install, maybe useful for package
+# managers.
 POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/

+# Supported PostgreSQL versions
+POSTGRES_VERSIONS = v17 v16 v15 v14
+
+# CARGO_BUILD_FLAGS: Extra flags to pass to `cargo build`. `--locked`
+# and `--features testing` are popular examples.
+#
+# CARGO_PROFILE: Set to override the cargo profile to use. By default,
+# it is derived from BUILD_TYPE.
+
+# All intermediate build artifacts are stored here.
+BUILD_DIR := build
+
 ICU_PREFIX_DIR := /usr/local/icu

 #
@@ -16,12 +29,19 @@ ifeq ($(BUILD_TYPE),release)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl
 	PG_CFLAGS += -O2 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
-	# Unfortunately, `--profile=...` is a nightly feature
-	CARGO_BUILD_FLAGS += --release
+	CARGO_PROFILE ?= --profile=release
+	# NEON_CARGO_ARTIFACT_TARGET_DIR is the directory where `cargo build` places
+	# the final build artifacts. There is unfortunately no easy way of changing
+	# it to a fully predictable path, nor to extract the path with a simple
+	# command. See https://github.com/rust-lang/cargo/issues/9661 and
+	# https://github.com/rust-lang/cargo/issues/6790.
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
 	PG_CFLAGS += -O0 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
+	CARGO_PROFILE ?= --profile=dev
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif
@@ -85,141 +105,32 @@ CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
 # Top level Makefile to build Neon and PostgreSQL
 #
 .PHONY: all
-all: neon postgres neon-pg-ext
+all: neon postgres-install neon-pg-ext

 ### Neon Rust bits
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-headers walproposer-lib cargo-target-dir
+neon: postgres-headers-install walproposer-lib cargo-target-dir
 	+@echo "Compiling Neon"
-	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
+	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE)
+
 .PHONY: cargo-target-dir
 cargo-target-dir:
 	# https://github.com/rust-lang/cargo/issues/14281
 	mkdir -p target
 	test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG

-### PostgreSQL parts
-# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
-# to avoid the duplication in the future, but it's tolerable for now.
-#
-$(POSTGRES_INSTALL_DIR)/build/%/config.status:
-
-	mkdir -p $(POSTGRES_INSTALL_DIR)
-	test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG
-
-	+@echo "Configuring Postgres $* build"
-	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
-		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
-		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
-		exit 1; }
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
-
-	VERSION=$*; \
-	EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
-	(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
-	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
-		CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \
-		$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
-		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
-
-# nicer alias to run 'configure'
-# Note: I've been unable to use templates for this part of our configuration.
-# I'm not sure why it wouldn't work, but this is the only place (apart from
-# the "build-all-versions" entry points) where direct mention of PostgreSQL
-# versions is used.
-.PHONY: postgres-configure-v17
-postgres-configure-v17: $(POSTGRES_INSTALL_DIR)/build/v17/config.status
-.PHONY: postgres-configure-v16
-postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
-.PHONY: postgres-configure-v15
-postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
-.PHONY: postgres-configure-v14
-postgres-configure-v14: $(POSTGRES_INSTALL_DIR)/build/v14/config.status
-
-# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
-.PHONY: postgres-headers-%
-postgres-headers-%: postgres-configure-%
-	+@echo "Installing PostgreSQL $* headers"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/include MAKELEVEL=0 install
-
-# Compile and install PostgreSQL
-.PHONY: postgres-%
-postgres-%: postgres-configure-% \
-		  postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers`
-	+@echo "Compiling PostgreSQL $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 install
-	+@echo "Compiling libpq $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq install
-	+@echo "Compiling pg_prewarm $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install
-	+@echo "Compiling pg_buffercache $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
-	+@echo "Compiling pg_visibility $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install
-	+@echo "Compiling pageinspect $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
-	+@echo "Compiling pg_trgm $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install
-	+@echo "Compiling amcheck $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
-	+@echo "Compiling test_decoding $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install
-
-.PHONY: postgres-clean-%
-postgres-clean-%:
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/interfaces/libpq clean
-
-.PHONY: postgres-check-%
-postgres-check-%: postgres-%
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$* MAKELEVEL=0 check
-
 .PHONY: neon-pg-ext-%
-neon-pg-ext-%: postgres-%
-	+@echo "Compiling neon $*"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
-	+@echo "Compiling neon_walredo $*"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
-	+@echo "Compiling neon_rmgr $*"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install
-	+@echo "Compiling neon_test_utils $*"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
-	+@echo "Compiling neon_utils $*"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-utils-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
-
-.PHONY: neon-pg-clean-ext-%
-neon-pg-clean-ext-%:
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-	-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
-	-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-	-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
-	-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-	-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
-	-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
+neon-pg-ext-%: postgres-install-%
+	+@echo "Compiling neon-specific Postgres extensions for $*"
+	mkdir -p $(BUILD_DIR)/pgxn-$*
+	$(MAKE) PG_CONFIG="$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config" COPT='$(COPT)' \
+		NEON_CARGO_ARTIFACT_TARGET_DIR="$(NEON_CARGO_ARTIFACT_TARGET_DIR)" \
+		CARGO_BUILD_FLAGS="$(CARGO_BUILD_FLAGS)" \
+		CARGO_PROFILE="$(CARGO_PROFILE)" \
+		-C $(BUILD_DIR)/pgxn-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/Makefile install

 # Build walproposer as a static library. walproposer source code is located
 # in the pgxn/neon directory.
@@ -233,15 +144,15 @@ neon-pg-clean-ext-%:
 .PHONY: walproposer-lib
 walproposer-lib: neon-pg-ext-v17
 	+@echo "Compiling walproposer-lib"
-	mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
+	mkdir -p $(BUILD_DIR)/walproposer-lib
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \
-		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
+		-C $(BUILD_DIR)/walproposer-lib \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
-	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
+	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(BUILD_DIR)/walproposer-lib
+	cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(BUILD_DIR)/walproposer-lib
+	$(AR) d $(BUILD_DIR)/walproposer-lib/libpgport.a \
 		pg_strong_random.o
-	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
+	$(AR) d $(BUILD_DIR)/walproposer-lib/libpgcommon.a \
 		checksum_helper.o \
 		cryptohash_openssl.o \
 		hmac_openssl.o \
@@ -249,69 +160,18 @@ walproposer-lib: neon-pg-ext-v17
 		parse_manifest.o \
 		scram-common.o
 ifeq ($(UNAME_S),Linux)
-	$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
+	$(AR) d $(BUILD_DIR)/walproposer-lib/libpgcommon.a \
 		pg_crc32c.o
 endif

-.PHONY: walproposer-lib-clean
-walproposer-lib-clean:
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config \
-		-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
-		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
-
+# Shorthand to call neon-pg-ext-% target for all Postgres versions
 .PHONY: neon-pg-ext
-neon-pg-ext: \
-	neon-pg-ext-v14 \
-	neon-pg-ext-v15 \
-	neon-pg-ext-v16 \
-	neon-pg-ext-v17
-
-.PHONY: neon-pg-clean-ext
-neon-pg-clean-ext: \
-	neon-pg-clean-ext-v14 \
-	neon-pg-clean-ext-v15 \
-	neon-pg-clean-ext-v16 \
-	neon-pg-clean-ext-v17
-
-# shorthand to build all Postgres versions
-.PHONY: postgres
-postgres: \
-	postgres-v14 \
-	postgres-v15 \
-	postgres-v16 \
-	postgres-v17
-
-.PHONY: postgres-headers
-postgres-headers: \
-	postgres-headers-v14 \
-	postgres-headers-v15 \
-	postgres-headers-v16 \
-	postgres-headers-v17
-
-.PHONY: postgres-clean
-postgres-clean: \
-	postgres-clean-v14 \
-	postgres-clean-v15 \
-	postgres-clean-v16 \
-	postgres-clean-v17
-
-.PHONY: postgres-check
-postgres-check: \
-	postgres-check-v14 \
-	postgres-check-v15 \
-	postgres-check-v16 \
-	postgres-check-v17
-
-# This doesn't remove the effects of 'configure'.
-.PHONY: clean
-clean: postgres-clean neon-pg-clean-ext
-	$(MAKE) -C compute clean
-	$(CARGO_CMD_PREFIX) cargo clean
+neon-pg-ext: $(foreach pg_version,$(POSTGRES_VERSIONS),neon-pg-ext-$(pg_version))

 # This removes everything
 .PHONY: distclean
 distclean:
-	$(RM) -r $(POSTGRES_INSTALL_DIR)
+	$(RM) -r $(POSTGRES_INSTALL_DIR) $(BUILD_DIR)
 	$(CARGO_CMD_PREFIX) cargo clean

 .PHONY: fmt
@@ -320,7 +180,7 @@ fmt:

 postgres-%-pg-bsd-indent: postgres-%
 	+@echo "Compiling pg_bsd_indent"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
+	$(MAKE) -C $(BUILD_DIR)/$*/src/tools/pg_bsd_indent/

 # Create typedef list for the core. Note that generally it should be combined with
 # buildfarm one to cover platform specific stuff.
@@ -339,7 +199,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
 	cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
 		cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
 	+@echo note: you might want to run it on selected files/dirs instead.
-	INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
+	INDENT=$(BUILD_DIR)/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
 		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
 		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
 		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
@@ -350,12 +210,28 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
 neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \
 		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
-		INDENT=$(POSTGRES_INSTALL_DIR)/build/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
+		INDENT=$(BUILD_DIR)/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
 		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
-		-C $(POSTGRES_INSTALL_DIR)/build/neon-v17 \
+		-C $(BUILD_DIR)/neon-v17 \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent


 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
+
+# Targets for building PostgreSQL are defined in postgres.mk.
+#
+# But if the caller has indicated that PostgreSQL is already
+# installed, by setting the PG_INSTALL_CACHED variable, skip it.
+ifdef PG_INSTALL_CACHED
+postgres-install: skip-install
+$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-install-$(pg_version)): skip-install
+postgres-headers-install:
+	+@echo "Skipping installation of PostgreSQL headers because PG_INSTALL_CACHED is set"
+skip-install:
+	+@echo "Skipping PostgreSQL installation because PG_INSTALL_CACHED is set"
+
+else
+include postgres.mk
+endif
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -165,6 +165,7 @@ RUN curl -fsSL \
    && rm sql_exporter.tar.gz

 # protobuf-compiler (protoc)
+# Keep the version the same as in compute/compute-node.Dockerfile
 ENV PROTOC_VERSION=25.1
 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
    && unzip -q protoc.zip -d protoc \
@@ -179,7 +180,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
    && mv s5cmd /usr/local/bin/s5cmd

 # LLVM
-ENV LLVM_VERSION=19
+ENV LLVM_VERSION=20
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
    && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
    && apt update \
@@ -292,7 +293,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.87.0
+ENV RUSTC_VERSION=1.88.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
--- a/compute/.gitignore
+++ b/compute/.gitignore
@@ -3,3 +3,6 @@ etc/neon_collector.yml
 etc/neon_collector_autoscaling.yml
 etc/sql_exporter.yml
 etc/sql_exporter_autoscaling.yml
+
+# Node.js dependencies
+node_modules/
--- a/compute/Makefile
+++ b/compute/Makefile
@@ -22,7 +22,7 @@ sql_exporter.yml: $(jsonnet_files)
 		--output-file etc/$@ \
 		--tla-str collector_name=neon_collector \
 		--tla-str collector_file=neon_collector.yml \
-		--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' \
+		--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter&pgaudit.log=none' \
 		etc/sql_exporter.jsonnet

 sql_exporter_autoscaling.yml: $(jsonnet_files)
@@ -30,7 +30,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files)
 		--output-file etc/$@ \
 		--tla-str collector_name=neon_collector_autoscaling \
 		--tla-str collector_file=neon_collector_autoscaling.yml \
-		--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' \
+		--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling&pgaudit.log=none' \
 		etc/sql_exporter.jsonnet

 .PHONY: clean
@@ -48,3 +48,11 @@ jsonnetfmt-test:
 .PHONY: jsonnetfmt-format
 jsonnetfmt-format:
 	jsonnetfmt --in-place $(jsonnet_files)
+
+.PHONY: manifest-schema-validation
+manifest-schema-validation: node_modules
+	node_modules/.bin/jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml
+
+node_modules: package.json
+	npm install
+	touch node_modules
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -77,9 +77,6 @@
 # build_and_test.yml github workflow for how that's done.

 ARG PG_VERSION
-ARG REPOSITORY=ghcr.io/neondatabase
-ARG IMAGE=build-tools
-ARG TAG=pinned
 ARG BUILD_TAG
 ARG DEBIAN_VERSION=bookworm
 ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
@@ -118,6 +115,9 @@ ARG EXTENSIONS=all
 FROM $BASE_IMAGE_SHA AS build-deps
 ARG DEBIAN_VERSION

+# Keep in sync with build-tools.Dockerfile
+ENV PROTOC_VERSION=25.1
+
 # Use strict mode for bash to catch errors early
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

@@ -149,8 +149,17 @@ RUN case $DEBIAN_VERSION in \
    ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \
+    libclang-dev \
+    jsonnet \
    $VERSION_INSTALLS \
-    && apt clean && rm -rf /var/lib/apt/lists/*
+    && apt clean && rm -rf /var/lib/apt/lists/* \
+    && useradd -ms /bin/bash nonroot -b /home \
+    # Install protoc from binary release, since Debian's versions are too old.
+    && curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
+    && unzip -q protoc.zip -d protoc \
+    && mv protoc/bin/protoc /usr/local/bin/protoc \
+    && mv protoc/include/google /usr/local/include/google \
+    && rm -rf protoc.zip protoc

 #########################################################################################
 #
@@ -171,9 +180,6 @@ RUN cd postgres && \
    eval $CONFIGURE_CMD && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
-    # Install headers
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
    # Enable some of contrib extensions
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
@@ -1057,17 +1063,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \

 #########################################################################################
 #
-# Layer "pg build with nonroot user and cargo installed"
-# This layer is base and common for layers with `pgrx`
+# Layer "build-deps with Rust toolchain installed"
 #
 #########################################################################################
-FROM pg-build AS pg-build-nonroot-with-cargo
-ARG PG_VERSION
-
-RUN apt update && \
-    apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
-    apt clean && rm -rf /var/lib/apt/lists/* && \
-    useradd -ms /bin/bash nonroot -b /home
+FROM build-deps AS build-deps-with-cargo

 ENV HOME=/home/nonroot
 ENV PATH="/home/nonroot/.cargo/bin:$PATH"
@@ -1082,13 +1081,29 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init

+#########################################################################################
+#
+# Layer "pg-build with Rust toolchain installed"
+# This layer is base and common for layers with `pgrx`
+#
+#########################################################################################
+FROM pg-build AS pg-build-with-cargo
+ARG PG_VERSION
+
+ENV HOME=/home/nonroot
+ENV PATH="/home/nonroot/.cargo/bin:$PATH"
+USER nonroot
+WORKDIR /home/nonroot
+
+COPY --from=build-deps-with-cargo /home/nonroot /home/nonroot
+
 #########################################################################################
 #
 # Layer "rust extensions"
 # This layer is used to build `pgrx` deps
 #
 #########################################################################################
-FROM pg-build-nonroot-with-cargo AS rust-extensions-build
+FROM pg-build-with-cargo AS rust-extensions-build
 ARG PG_VERSION

 RUN case "${PG_VERSION:?}" in \
@@ -1110,7 +1125,7 @@ USER root
 # and eventually get merged with `rust-extensions-build`
 #
 #########################################################################################
-FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx12
+FROM pg-build-with-cargo AS rust-extensions-build-pgrx12
 ARG PG_VERSION

 RUN cargo install --locked --version 0.12.9 cargo-pgrx && \
@@ -1127,7 +1142,7 @@ USER root
 # and eventually get merged with `rust-extensions-build`
 #
 #########################################################################################
-FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14
+FROM pg-build-with-cargo AS rust-extensions-build-pgrx14
 ARG PG_VERSION

 RUN cargo install --locked --version 0.14.1 cargo-pgrx && \
@@ -1144,10 +1159,12 @@ USER root

 FROM build-deps AS pgrag-src
 ARG PG_VERSION
-
 WORKDIR /ext-src
+COPY compute/patches/onnxruntime.patch .
+
 RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \
    mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
+    patch -p1 < /ext-src/onnxruntime.patch && \
    echo "#nothing to test here" > neon-test.sh

 RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz &&  \
@@ -1162,7 +1179,7 @@ COPY --from=pgrag-src /ext-src/ /ext-src/
 # Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise
 WORKDIR /ext-src/onnxruntime-src
 RUN apt update && apt install --no-install-recommends --no-install-suggests -y \
-    python3 python3-pip python3-venv protobuf-compiler && \
+    python3 python3-pip python3-venv && \
    apt clean && rm -rf /var/lib/apt/lists/* && \
    python3 -m venv venv && \
    . venv/bin/activate && \
@@ -1557,20 +1574,20 @@ ARG PG_VERSION
 WORKDIR /ext-src
 RUN case "${PG_VERSION}" in \
    "v14") \
-    export PGAUDIT_VERSION=1.6.2 \
-    export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \
+    export PGAUDIT_VERSION=1.6.3 \
+    export PGAUDIT_CHECKSUM=37a8f5a7cc8d9188e536d15cf0fdc457fcdab2547caedb54442c37f124110919 \
    ;; \
    "v15") \
-    export PGAUDIT_VERSION=1.7.0 \
-    export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \
+    export PGAUDIT_VERSION=1.7.1 \
+    export PGAUDIT_CHECKSUM=e9c8e6e092d82b2f901d72555ce0fe7780552f35f8985573796cd7e64b09d4ec \
    ;; \
    "v16") \
-    export PGAUDIT_VERSION=16.0 \
-    export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \
+    export PGAUDIT_VERSION=16.1 \
+    export PGAUDIT_CHECKSUM=3bae908ab70ba0c6f51224009dbcfff1a97bd6104c6273297a64292e1b921fee \
    ;; \
    "v17") \
-    export PGAUDIT_VERSION=17.0 \
-    export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \
+    export PGAUDIT_VERSION=17.1 \
+    export PGAUDIT_CHECKSUM=9c5f37504d393486cc75d2ced83f75f5899be64fa85f689d6babb833b4361e6c \
    ;; \
    *) \
    echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \
@@ -1621,18 +1638,7 @@ FROM pg-build AS neon-ext-build
 ARG PG_VERSION

 COPY pgxn/ pgxn/
-RUN make -j $(getconf _NPROCESSORS_ONLN) \
-        -C pgxn/neon \
-        -s install && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
-        -C pgxn/neon_utils \
-        -s install && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
-        -C pgxn/neon_test_utils \
-        -s install && \
-    make -j $(getconf _NPROCESSORS_ONLN) \
-        -C pgxn/neon_rmgr \
-        -s install
+RUN make -j $(getconf _NPROCESSORS_ONLN) -C pgxn -s install-compute

 #########################################################################################
 #
@@ -1722,7 +1728,7 @@ FROM extensions-${EXTENSIONS} AS neon-pg-ext-build
 # Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries
 #
 #########################################################################################
-FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
+FROM build-deps-with-cargo AS compute-tools
 ARG BUILD_TAG
 ENV BUILD_TAG=$BUILD_TAG

@@ -1732,7 +1738,7 @@ COPY --chown=nonroot . .
 RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \
    --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \
    --mount=type=cache,uid=1000,target=/home/nonroot/target \
-    mold -run cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \
+    cargo build --locked --profile release-line-debug-size-lto --bin compute_ctl --bin fast_import --bin local_proxy && \
    mkdir target-bin && \
    cp target/release-line-debug-size-lto/compute_ctl \
       target/release-line-debug-size-lto/fast_import \
@@ -1826,10 +1832,11 @@ RUN rm /usr/local/pgsql/lib/lib*.a
 # Preprocess the sql_exporter configuration files
 #
 #########################################################################################
-FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor
+FROM build-deps AS sql_exporter_preprocessor
 ARG PG_VERSION

 USER nonroot
+WORKDIR /home/nonroot

 COPY --chown=nonroot compute compute

--- a/compute/etc/pgbouncer.ini
+++ b/compute/etc/pgbouncer.ini
@@ -21,6 +21,8 @@ unix_socket_dir=/tmp/
 unix_socket_mode=0777
 ; required for pgbouncer_exporter
 ignore_startup_parameters=extra_float_digits
+; pidfile for graceful termination
+pidfile=/tmp/pgbouncer.pid

 ;; Disable connection logging. It produces a lot of logs that no one looks at,
 ;; and we can get similar log entries from the proxy too. We had incidents in
--- a/compute/manifest.schema.json
+++ b/compute/manifest.schema.json
@@ -0,0 +1,209 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "title": "Neon Compute Manifest Schema",
+  "description": "Schema for Neon compute node configuration manifest",
+  "type": "object",
+  "properties": {
+    "pg_settings": {
+      "type": "object",
+      "properties": {
+        "common": {
+          "type": "object",
+          "properties": {
+            "client_connection_check_interval": {
+              "type": "string",
+              "description": "Interval between checks for client disconnection, in milliseconds"
+            },
+            "effective_io_concurrency": {
+              "type": "string",
+              "description": "Effective IO concurrency setting"
+            },
+            "fsync": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether to force fsync to disk"
+            },
+            "hot_standby": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether hot standby is enabled"
+            },
+            "idle_in_transaction_session_timeout": {
+              "type": "string",
+              "description": "Timeout for idle transactions in milliseconds"
+            },
+            "listen_addresses": {
+              "type": "string",
+              "description": "Addresses to listen on"
+            },
+            "log_connections": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether to log connections"
+            },
+            "log_disconnections": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether to log disconnections"
+            },
+            "log_temp_files": {
+              "type": "string",
+              "description": "Size threshold for logging temporary files in KB"
+            },
+            "log_error_verbosity": {
+              "type": "string",
+              "enum": ["terse", "verbose", "default"],
+              "description": "Error logging verbosity level"
+            },
+            "log_min_error_statement": {
+              "type": "string",
+              "description": "Minimum error level for statement logging"
+            },
+            "maintenance_io_concurrency": {
+              "type": "string",
+              "description": "Maintenance IO concurrency setting"
+            },
+            "max_connections": {
+              "type": "string",
+              "description": "Maximum number of connections"
+            },
+            "max_replication_flush_lag": {
+              "type": "string",
+              "description": "Maximum replication flush lag"
+            },
+            "max_replication_slots": {
+              "type": "string",
+              "description": "Maximum number of replication slots"
+            },
+            "max_replication_write_lag": {
+              "type": "string",
+              "description": "Maximum replication write lag"
+            },
+            "max_wal_senders": {
+              "type": "string",
+              "description": "Maximum number of WAL senders"
+            },
+            "max_wal_size": {
+              "type": "string",
+              "description": "Maximum WAL size"
+            },
+            "neon.unstable_extensions": {
+              "type": "string",
+              "description": "List of unstable extensions"
+            },
+            "neon.protocol_version": {
+              "type": "string",
+              "description": "Neon protocol version"
+            },
+            "password_encryption": {
+              "type": "string",
+              "description": "Password encryption method"
+            },
+            "restart_after_crash": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether to restart after crash"
+            },
+            "superuser_reserved_connections": {
+              "type": "string",
+              "description": "Number of reserved connections for superuser"
+            },
+            "synchronous_standby_names": {
+              "type": "string",
+              "description": "Names of synchronous standby servers"
+            },
+            "wal_keep_size": {
+              "type": "string",
+              "description": "WAL keep size"
+            },
+            "wal_level": {
+              "type": "string",
+              "description": "WAL level"
+            },
+            "wal_log_hints": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether to log hints in WAL"
+            },
+            "wal_sender_timeout": {
+              "type": "string",
+              "description": "WAL sender timeout in milliseconds"
+            }
+          },
+          "required": [
+            "client_connection_check_interval",
+            "effective_io_concurrency",
+            "fsync",
+            "hot_standby",
+            "idle_in_transaction_session_timeout",
+            "listen_addresses",
+            "log_connections",
+            "log_disconnections",
+            "log_temp_files",
+            "log_error_verbosity",
+            "log_min_error_statement",
+            "maintenance_io_concurrency",
+            "max_connections",
+            "max_replication_flush_lag",
+            "max_replication_slots",
+            "max_replication_write_lag",
+            "max_wal_senders",
+            "max_wal_size",
+            "neon.unstable_extensions",
+            "neon.protocol_version",
+            "password_encryption",
+            "restart_after_crash",
+            "superuser_reserved_connections",
+            "synchronous_standby_names",
+            "wal_keep_size",
+            "wal_level",
+            "wal_log_hints",
+            "wal_sender_timeout"
+          ]
+        },
+        "replica": {
+          "type": "object",
+          "properties": {
+            "hot_standby": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether hot standby is enabled for replicas"
+            }
+          },
+          "required": ["hot_standby"]
+        },
+        "per_version": {
+          "type": "object",
+          "patternProperties": {
+            "^1[4-7]$": {
+              "type": "object",
+              "properties": {
+                "common": {
+                  "type": "object",
+                  "properties": {
+                    "io_combine_limit": {
+                      "type": "string",
+                      "description": "IO combine limit"
+                    }
+                  }
+                },
+                "replica": {
+                  "type": "object",
+                  "properties": {
+                    "recovery_prefetch": {
+                      "type": "string",
+                      "enum": ["on", "off"],
+                      "description": "Whether to enable recovery prefetch for PostgreSQL replicas"
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      },
+      "required": ["common", "replica", "per_version"]
+    }
+  },
+  "required": ["pg_settings"]
+} 
--- a/compute/manifest.yaml
+++ b/compute/manifest.yaml
@@ -105,17 +105,17 @@ pg_settings:
        # Neon hot standby ignores pages that are not in the shared_buffers
        recovery_prefetch: "off"
    16:
-      common:
+      common: {}
      replica:
        # prefetching of blocks referenced in WAL doesn't make sense for us
        # Neon hot standby ignores pages that are not in the shared_buffers
        recovery_prefetch: "off"
    15:
-      common:
+      common: {}
      replica:
        # prefetching of blocks referenced in WAL doesn't make sense for us
        # Neon hot standby ignores pages that are not in the shared_buffers
        recovery_prefetch: "off"
    14:
-      common:
-      replica:
+      common: {}
+      replica: {}
--- a/compute/package-lock.json
+++ b/compute/package-lock.json
@@ -0,0 +1,37 @@
+{
+  "name": "neon-compute",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "name": "neon-compute",
+      "dependencies": {
+        "@sourcemeta/jsonschema": "9.3.4"
+      }
+    },
+    "node_modules/@sourcemeta/jsonschema": {
+      "version": "9.3.4",
+      "resolved": "https://registry.npmjs.org/@sourcemeta/jsonschema/-/jsonschema-9.3.4.tgz",
+      "integrity": "sha512-hkujfkZAIGXUs4U//We9faZW8LZ4/H9LqagRYsFSulH/VLcKPNhZyCTGg7AhORuzm27zqENvKpnX4g2FzudYFw==",
+      "cpu": [
+        "x64",
+        "arm64"
+      ],
+      "license": "AGPL-3.0",
+      "os": [
+        "darwin",
+        "linux",
+        "win32"
+      ],
+      "bin": {
+        "jsonschema": "cli.js"
+      },
+      "engines": {
+        "node": ">=16"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sourcemeta"
+      }
+    }
+  }
+}
--- a/compute/package.json
+++ b/compute/package.json
@@ -0,0 +1,7 @@
+{
+  "name": "neon-compute",
+  "private": true,
+  "dependencies": {
+    "@sourcemeta/jsonschema": "9.3.4"
+  }
+} 
--- a/compute/patches/onnxruntime.patch
+++ b/compute/patches/onnxruntime.patch
@@ -0,0 +1,15 @@
+diff --git a/cmake/deps.txt b/cmake/deps.txt
+index d213b09034..229de2ebf0 100644
+--- a/cmake/deps.txt
+++ b/cmake/deps.txt
+@@ -22,7 +22,9 @@ dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b3132
+ # it contains changes on top of 3.4.0 which are required to fix build issues.
+ # Until the 3.4.1 release this is the best option we have.
+ # Issue link: https://gitlab.com/libeigen/eigen/-/issues/2744
+-eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;be8be39fdbc6e60e94fa7870b280707069b5b81a
+# Moved to github mirror to avoid gitlab issues.
+# Issue link: https://github.com/bazelbuild/bazel-central-registry/issues/4355
+eigen;https://github.com/eigen-mirror/eigen/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;61418a349000ba7744a3ad03cf5071f22ebf860a
+ flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip;59422c3b5e573dd192fead2834d25951f1c1670c
+ fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494
+ fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -26,7 +26,7 @@ commands:
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
-    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
+    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
  - name: pgbouncer-exporter
    user: postgres
    sysvInitAction: respawn
@@ -59,7 +59,7 @@ files:
      # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to
      # resolve host" log messages that they generate.
      Defaults !fqdn
-      
+
      # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
      # regardless of hostname (ALL)
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -26,7 +26,7 @@ commands:
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
-    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
+    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
  - name: pgbouncer-exporter
    user: postgres
    sysvInitAction: respawn
@@ -59,7 +59,7 @@ files:
      # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to
      # resolve host" log messages that they generate.
      Defaults !fqdn
-      
+
      # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
      # regardless of hostname (ALL)
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -38,6 +38,7 @@ once_cell.workspace = true
 opentelemetry.workspace = true
 opentelemetry_sdk.workspace = true
 p256 = { version = "0.13", features = ["pem"] }
+pageserver_page_api.workspace = true
 postgres.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["json"] }
@@ -53,6 +54,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-util.workspace = true
 tokio-stream.workspace = true
+tonic.workspace = true
 tower-otel.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
@@ -64,6 +66,7 @@ uuid.workspace = true
 walkdir.workspace = true
 x509-cert.workspace = true

+postgres_versioninfo.workspace = true
 postgres_initdb.workspace = true
 compute_api.workspace = true
 utils.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -124,6 +124,10 @@ struct Cli {
    /// Interval in seconds for collecting installed extensions statistics
    #[arg(long, default_value = "3600")]
    pub installed_extensions_collection_interval: u64,
+
+    /// Run in development mode, skipping VM-specific operations like process termination
+    #[arg(long, action = clap::ArgAction::SetTrue)]
+    pub dev: bool,
 }

 impl Cli {
@@ -159,7 +163,7 @@ fn main() -> Result<()> {
        .build()?;
    let _rt_guard = runtime.enter();

-    runtime.block_on(init())?;
+    runtime.block_on(init(cli.dev))?;

    // enable core dumping for all child processes
    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
@@ -198,13 +202,13 @@ fn main() -> Result<()> {
    deinit_and_exit(exit_code);
 }

-async fn init() -> Result<()> {
+async fn init(dev_mode: bool) -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
    thread::spawn(move || {
        for sig in signals.forever() {
-            handle_exit_signal(sig);
+            handle_exit_signal(sig, dev_mode);
        }
    });

@@ -263,9 +267,9 @@ fn deinit_and_exit(exit_code: Option<i32>) -> ! {
 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
 /// to prevent leakage. TODO: it is better to convert compute_ctl to async and
 /// wait for termination which would be easy then.
-fn handle_exit_signal(sig: i32) {
+fn handle_exit_signal(sig: i32, dev_mode: bool) {
    info!("received {sig} termination signal");
-    forward_termination_signal();
+    forward_termination_signal(dev_mode);
    exit(1);
 }

--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -29,7 +29,7 @@ use anyhow::{Context, bail};
 use aws_config::BehaviorVersion;
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
-use compute_tools::extension_server::{PostgresMajorVersion, get_pg_version};
+use compute_tools::extension_server::get_pg_version;
 use nix::unistd::Pid;
 use std::ops::Not;
 use tracing::{Instrument, error, info, info_span, warn};
@@ -179,12 +179,8 @@ impl PostgresProcess {
            .await
            .context("create pgdata directory")?;

-        let pg_version = match get_pg_version(self.pgbin.as_ref()) {
-            PostgresMajorVersion::V14 => 14,
-            PostgresMajorVersion::V15 => 15,
-            PostgresMajorVersion::V16 => 16,
-            PostgresMajorVersion::V17 => 17,
-        };
+        let pg_version = get_pg_version(self.pgbin.as_ref());
+
        postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
            superuser: initdb_user,
            locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
@@ -486,10 +482,8 @@ async fn cmd_pgdata(
    };

    let superuser = "cloud_admin";
-    let destination_connstring = format!(
-        "host=localhost port={} user={} dbname=neondb",
-        pg_port, superuser
-    );
+    let destination_connstring =
+        format!("host=localhost port={pg_port} user={superuser} dbname=neondb");

    let pgdata_dir = workdir.join("pgdata");
    let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone());
--- a/compute_tools/src/bin/fast_import/s3_uri.rs
+++ b/compute_tools/src/bin/fast_import/s3_uri.rs
@@ -69,7 +69,7 @@ impl clap::builder::TypedValueParser for S3Uri {
        S3Uri::from_str(value_str).map_err(|e| {
            clap::Error::raw(
                clap::error::ErrorKind::InvalidValue,
-                format!("Failed to parse S3 URI: {}", e),
+                format!("Failed to parse S3 URI: {e}"),
            )
        })
    }
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -22,7 +22,7 @@ pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<Cat

    spawn(async move {
        if let Err(e) = connection.await {
-            eprintln!("connection error: {}", e);
+            eprintln!("connection error: {e}");
        }
    });

@@ -119,7 +119,7 @@ pub async fn get_database_schema(
        _ => {
            let mut lines = stderr_reader.lines();
            if let Some(line) = lines.next_line().await? {
-                if line.contains(&format!("FATAL:  database \"{}\" does not exist", dbname)) {
+                if line.contains(&format!("FATAL:  database \"{dbname}\" does not exist")) {
                    return Err(SchemaDumpError::DatabaseDoesNotExist);
                }
                warn!("pg_dump stderr: {}", line)
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,4 +1,4 @@
-use anyhow::{Context, Result};
+use anyhow::{Context, Result, anyhow};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
@@ -6,7 +6,8 @@ use compute_api::responses::{
    LfcPrewarmState, TlsConfig,
 };
 use compute_api::spec::{
-    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
+    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverConnectionInfo,
+    PageserverShardConnectionInfo, PgIdent,
 };
 use futures::StreamExt;
 use futures::future::join_all;
@@ -15,12 +16,12 @@ use itertools::Itertools;
 use nix::sys::signal::{Signal, kill};
 use nix::unistd::Pid;
 use once_cell::sync::Lazy;
+use pageserver_page_api::{self as page_api, BaseBackupCompression};
 use postgres;
 use postgres::NoTls;
 use postgres::error::SqlState;
 use remote_storage::{DownloadError, RemotePath};
 use std::collections::{HashMap, HashSet};
-use std::net::SocketAddr;
 use std::os::unix::fs::{PermissionsExt, symlink};
 use std::path::Path;
 use std::process::{Command, Stdio};
@@ -35,6 +36,8 @@ use url::Url;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 use utils::measured_stream::MeasuredReader;
+use utils::pid_file;
+use utils::shard::{ShardCount, ShardIndex, ShardNumber};

 use crate::configurator::launch_configurator;
 use crate::disk_quota::set_disk_quota;
@@ -44,6 +47,7 @@ use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
 use crate::metrics::COMPUTE_CTL_UP;
 use crate::monitor::launch_monitor;
 use crate::pg_helpers::*;
+use crate::pgbouncer::*;
 use crate::rsyslog::{
    PostgresLogsRsyslogConfig, configure_audit_rsyslog, configure_postgres_logs_export,
    launch_pgaudit_gc,
@@ -161,6 +165,10 @@ pub struct ComputeState {
    pub lfc_prewarm_state: LfcPrewarmState,
    pub lfc_offload_state: LfcOffloadState,

+    /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
+    /// mode == ComputeMode::Primary. None otherwise
+    pub terminate_flush_lsn: Option<Lsn>,
+
    pub metrics: ComputeMetrics,
 }

@@ -176,6 +184,7 @@ impl ComputeState {
            metrics: ComputeMetrics::default(),
            lfc_prewarm_state: LfcPrewarmState::default(),
            lfc_offload_state: LfcOffloadState::default(),
+            terminate_flush_lsn: None,
        }
    }

@@ -208,13 +217,74 @@ pub struct ParsedSpec {
    pub spec: ComputeSpec,
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
-    pub pageserver_connstr: String,
+    pub pageserver_conninfo: PageserverConnectionInfo,
    pub safekeeper_connstrings: Vec<String>,
    pub storage_auth_token: Option<String>,
-    pub endpoint_storage_addr: Option<SocketAddr>,
+    /// Kubernetes DNS name and port
+    pub endpoint_storage_addr: Option<String>,
    pub endpoint_storage_token: Option<String>,
 }

+impl ParsedSpec {
+    pub fn validate(&self) -> Result<(), String> {
+        // Only Primary nodes are using safekeeper_connstrings, and at the moment
+        // this method only validates that part of the specs.
+        if self.spec.mode != ComputeMode::Primary {
+            return Ok(());
+        }
+
+        // While it seems like a good idea to check for an odd number of entries in
+        // the safekeepers connection string, changes to the list of safekeepers might
+        // involve appending a new server to a list of 3, in which case a list of 4
+        // entries is okay in production.
+        //
+        // Still, we want unique entries and at least one entry in the vector.
+        if self.safekeeper_connstrings.is_empty() {
+            return Err(String::from("safekeeper_connstrings is empty"));
+        }
+
+        // check for uniqueness of the connection strings in the set
+        let mut connstrings = self.safekeeper_connstrings.clone();
+
+        connstrings.sort();
+        let mut previous = &connstrings[0];
+
+        for current in connstrings.iter().skip(1) {
+            // duplicate entry?
+            if current == previous {
+                return Err(format!(
+                    "duplicate entry in safekeeper_connstrings: {current}!",
+                ));
+            }
+
+            previous = current;
+        }
+
+        Ok(())
+    }
+}
+
+fn extract_pageserver_conninfo_from_guc(
+    pageserver_connstring_guc: &str,
+) -> PageserverConnectionInfo {
+    PageserverConnectionInfo {
+        shards: pageserver_connstring_guc
+            .split(',')
+            .enumerate()
+            .map(|(i, connstr)| {
+                (
+                    i as u32,
+                    PageserverShardConnectionInfo {
+                        libpq_url: Some(connstr.to_string()),
+                        grpc_url: None,
+                    },
+                )
+            })
+            .collect(),
+        prefer_grpc: false,
+    }
+}
+
 impl TryFrom<ComputeSpec> for ParsedSpec {
    type Error = String;
    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
@@ -224,11 +294,17 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        // For backwards-compatibility, the top-level fields in the spec file
        // may be empty. In that case, we need to dig them from the GUCs in the
        // cluster.settings field.
-        let pageserver_connstr = spec
-            .pageserver_connstring
-            .clone()
-            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
-            .ok_or("pageserver connstr should be provided")?;
+        let pageserver_conninfo = match &spec.pageserver_connection_info {
+            Some(x) => x.clone(),
+            None => {
+                if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
+                    extract_pageserver_conninfo_from_guc(&guc)
+                } else {
+                    return Err("pageserver connstr should be provided".to_string());
+                }
+            }
+        };
+
        let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
            if matches!(spec.mode, ComputeMode::Primary) {
                spec.cluster
@@ -244,6 +320,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        } else {
            spec.safekeeper_connstrings.clone()
        };
+
        let storage_auth_token = spec.storage_auth_token.clone();
        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
            tenant_id
@@ -266,28 +343,29 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
                .or(Err("invalid timeline id"))?
        };

-        let endpoint_storage_addr: Option<SocketAddr> = spec
+        let endpoint_storage_addr: Option<String> = spec
            .endpoint_storage_addr
            .clone()
-            .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr"))
-            .unwrap_or_default()
-            .parse()
-            .ok();
+            .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr"));
        let endpoint_storage_token = spec
            .endpoint_storage_token
            .clone()
            .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_token"));

-        Ok(ParsedSpec {
+        let res = ParsedSpec {
            spec,
-            pageserver_connstr,
+            pageserver_conninfo,
            safekeeper_connstrings,
            storage_auth_token,
            tenant_id,
            timeline_id,
            endpoint_storage_addr,
            endpoint_storage_token,
-        })
+        };
+
+        // Now check validity of the parsed specification
+        res.validate()?;
+        Ok(res)
    }
 }

@@ -354,14 +432,11 @@ impl ComputeNode {
        // that can affect `compute_ctl` and prevent it from properly configuring the database schema.
        // Unset them via connection string options before connecting to the database.
        // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
-        //
-        // TODO(ololobus): we currently pass `-c default_transaction_read_only=off` from control plane
-        // as well. After rolling out this code, we can remove this parameter from control plane.
-        // In the meantime, double-passing is fine, the last value is applied.
-        // See: <https://github.com/neondatabase/cloud/blob/133dd8c4dbbba40edfbad475bf6a45073ca63faf/goapp/controlplane/internal/pkg/compute/provisioner/provisioner_common.go#L70>
-        const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
+        const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none";
        let options = match conn_conf.get_options() {
-            Some(options) => format!("{} {}", options, EXTRA_OPTIONS),
+            // Allow the control plane to override any options set by the
+            // compute
+            Some(options) => format!("{EXTRA_OPTIONS} {options}"),
            None => EXTRA_OPTIONS.to_string(),
        };
        conn_conf.options(&options);
@@ -369,7 +444,7 @@ impl ComputeNode {

        let mut new_state = ComputeState::new();
        if let Some(spec) = config.spec {
-            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?;
            new_state.pspec = Some(pspec);
        }

@@ -489,12 +564,21 @@ impl ComputeNode {
        // Reap the postgres process
        delay_exit |= this.cleanup_after_postgres_exit()?;

+        // The /terminate endpoint returns the LSN. If we don't sleep at all, the connection
+        // will break and the caller won't get the result. If we sleep too long, tests will
+        // take significantly longer and the GitHub Actions run will error out.
+        let sleep_duration = if delay_exit {
+            Duration::from_secs(30)
+        } else {
+            Duration::from_millis(300)
+        };
+
        // If launch failed, keep serving HTTP requests for a while, so the cloud
        // control plane can get the actual error.
        if delay_exit {
            info!("giving control plane 30s to collect the error before shutdown");
-            std::thread::sleep(Duration::from_secs(30));
        }
+        std::thread::sleep(sleep_duration);
        Ok(exit_code)
    }

@@ -866,20 +950,25 @@ impl ComputeNode {
        // Maybe sync safekeepers again, to speed up next startup
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-        if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
+        let lsn = if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
            info!("syncing safekeepers on shutdown");
            let storage_auth_token = pspec.storage_auth_token.clone();
            let lsn = self.sync_safekeepers(storage_auth_token)?;
-            info!("synced safekeepers at lsn {lsn}");
-        }
+            info!(%lsn, "synced safekeepers");
+            Some(lsn)
+        } else {
+            info!("not primary, not syncing safekeepers");
+            None
+        };

        let mut delay_exit = false;
        let mut state = self.state.lock().unwrap();
-        if state.status == ComputeStatus::TerminationPending {
+        state.terminate_flush_lsn = lsn;
+        if let ComputeStatus::TerminationPending { mode } = state.status {
            state.status = ComputeStatus::Terminated;
            self.state_changed.notify_all();
            // we were asked to terminate gracefully, don't exit to avoid restart
-            delay_exit = true
+            delay_exit = mode == compute_api::responses::TerminateMode::Fast
        }
        drop(state);

@@ -936,15 +1025,87 @@ impl ComputeNode {
        Ok(())
    }

-    // Get basebackup from the libpq connection to pageserver using `connstr` and
-    // unarchive it to `pgdata` directory overriding all its previous content.
+    /// Fetches a basebackup from the Pageserver using the compute state's Pageserver connstring and
+    /// unarchives it to `pgdata` directory, replacing any existing contents.
    #[instrument(skip_all, fields(%lsn))]
    fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
-        let start_time = Instant::now();

-        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
-        let mut config = postgres::Config::from_str(shard0_connstr)?;
+        let started = Instant::now();
+        let (connected, size) = if spec.pageserver_conninfo.prefer_grpc {
+            self.try_get_basebackup_grpc(spec, lsn)?
+        } else {
+            self.try_get_basebackup_libpq(spec, lsn)?
+        };
+
+        let mut state = self.state.lock().unwrap();
+        state.metrics.pageserver_connect_micros =
+            connected.duration_since(started).as_micros() as u64;
+        state.metrics.basebackup_bytes = size as u64;
+        state.metrics.basebackup_ms = started.elapsed().as_millis() as u64;
+
+        Ok(())
+    }
+
+    /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
+    /// the connection was established, and the (compressed) size of the basebackup.
+    ///
+    /// Returns an error if the spec lists more shards than a `u8` shard count can represent, if
+    /// connecting to the pageserver or requesting the basebackup fails, or if unpacking the
+    /// archive into `pgdata` fails.
+    ///
+    /// Panics if shard 0, or its gRPC URL, is missing from the connection info.
+    fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
+        let shard0 = spec
+            .pageserver_conninfo
+            .shards
+            .get(&0)
+            .expect("shard 0 connection info missing");
+        let shard0_url = shard0.grpc_url.clone().expect("no grpc_url for shard 0");
+
+        // Convert the shard count with a checked conversion rather than `as u8`: a truncating
+        // cast would silently wrap for 256 or more entries (256 -> 0, 257 -> 1) and select the
+        // unsharded index for a sharded tenant.
+        let num_shards = spec.pageserver_conninfo.shards.len();
+        let shard_count = u8::try_from(num_shards)
+            .map_err(|_| anyhow!("too many pageserver shards in spec: {num_shards}"))?;
+        let shard_index = match shard_count {
+            0 | 1 => ShardIndex::unsharded(),
+            count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
+        };
+
+        // Connect and issue the request on the current Tokio runtime; `connected` is taken right
+        // after the client connects so the caller can derive the connect latency.
+        let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
+            let mut client = page_api::Client::connect(
+                shard0_url,
+                spec.tenant_id,
+                spec.timeline_id,
+                shard_index,
+                spec.storage_auth_token.clone(),
+                None, // NB: base backups use payload compression
+            )
+            .await?;
+            let connected = Instant::now();
+            let reader = client
+                .get_base_backup(page_api::GetBaseBackupRequest {
+                    // Lsn(0) means "no specific LSN requested".
+                    lsn: (lsn != Lsn(0)).then_some(lsn),
+                    compression: BaseBackupCompression::Gzip,
+                    replica: spec.spec.mode != ComputeMode::Primary,
+                    full: false,
+                })
+                .await?;
+            anyhow::Ok((reader, connected))
+        })?;
+
+        // Bridge the async stream to a blocking reader and count the (compressed) bytes read.
+        let mut reader = MeasuredReader::new(tokio_util::io::SyncIoBridge::new(reader));
+
+        // Set `ignore_zeros` so that unpack() reads the entire stream and doesn't just stop at the
+        // end-of-archive marker. If the server errors, the tar::Builder drop handler will write an
+        // end-of-archive marker before the error is emitted, and we would not see the error.
+        let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut reader));
+        ar.set_ignore_zeros(true);
+        ar.unpack(&self.params.pgdata)?;
+
+        Ok((connected, reader.get_byte_count()))
+    }
+
+    /// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
+    /// when the connection was established, and the (compressed) size of the basebackup.
+    fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
+        let shard0 = spec
+            .pageserver_conninfo
+            .shards
+            .get(&0)
+            .expect("shard 0 connection info missing");
+        let shard0_connstr = shard0.libpq_url.clone().expect("no libpq_url for shard 0");
+        let mut config = postgres::Config::from_str(&shard0_connstr)?;

        // Use the storage auth token from the config file, if given.
        // Note: this overrides any password set in the connection string.
@@ -956,16 +1117,14 @@ impl ComputeNode {
        }

        config.application_name("compute_ctl");
-        if let Some(spec) = &compute_state.pspec {
-            config.options(&format!(
-                "-c neon.compute_mode={}",
-                spec.spec.mode.to_type_str()
-            ));
-        }
+        config.options(&format!(
+            "-c neon.compute_mode={}",
+            spec.spec.mode.to_type_str()
+        ));

        // Connect to pageserver
        let mut client = config.connect(NoTls)?;
-        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
+        let connected = Instant::now();

        let basebackup_cmd = match lsn {
            Lsn(0) => {
@@ -1002,16 +1161,13 @@ impl ComputeNode {
        // Set `ignore_zeros` so that unpack() reads all the Copy data and
        // doesn't stop at the end-of-archive marker. Otherwise, if the server
        // sends an Error after finishing the tarball, we will not notice it.
+        // The tar::Builder drop handler will write an end-of-archive marker
+        // before emitting the error, and we would not see it otherwise.
        let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
        ar.set_ignore_zeros(true);
        ar.unpack(&self.params.pgdata)?;

-        // Report metrics
-        let mut state = self.state.lock().unwrap();
-        state.metrics.pageserver_connect_micros = pageserver_connect_micros;
-        state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
-        state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
-        Ok(())
+        Ok((connected, measured_reader.get_byte_count()))
    }

    // Gets the basebackup in a retry loop
@@ -1035,10 +1191,7 @@ impl ComputeNode {
                    return result;
                }
                Err(ref e) if attempts < max_attempts => {
-                    warn!(
-                        "Failed to get basebackup: {} (attempt {}/{})",
-                        e, attempts, max_attempts
-                    );
+                    warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
                    std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
                    retry_period_ms *= 1.5;
                }
@@ -1064,7 +1217,7 @@ impl ComputeNode {
        let sk_configs = sk_connstrs.into_iter().map(|connstr| {
            // Format connstr
            let id = connstr.clone();
-            let connstr = format!("postgresql://no_user@{}", connstr);
+            let connstr = format!("postgresql://no_user@{connstr}");
            let options = format!(
                "-c timeline_id={} tenant_id={}",
                pspec.timeline_id, pspec.tenant_id
@@ -1247,16 +1400,8 @@ impl ComputeNode {
            }
        };

-        info!(
-            "getting basebackup@{} from pageserver {}",
-            lsn, &pspec.pageserver_connstr
-        );
-        self.get_basebackup(compute_state, lsn).with_context(|| {
-            format!(
-                "failed to get basebackup@{} from pageserver {}",
-                lsn, &pspec.pageserver_connstr
-            )
-        })?;
+        self.get_basebackup(compute_state, lsn)
+            .with_context(|| format!("failed to get basebackup@{lsn}"))?;

        // Update pg_hba.conf received with basebackup.
        update_pg_hba(pgdata_path)?;
@@ -1427,7 +1572,7 @@ impl ComputeNode {
                let (mut client, connection) = conf.connect(NoTls).await?;
                tokio::spawn(async move {
                    if let Err(e) = connection.await {
-                        eprintln!("connection error: {}", e);
+                        eprintln!("connection error: {e}");
                    }
                });

@@ -1570,7 +1715,7 @@ impl ComputeNode {
                Ok((mut client, connection)) => {
                    tokio::spawn(async move {
                        if let Err(e) = connection.await {
-                            eprintln!("connection error: {}", e);
+                            eprintln!("connection error: {e}");
                        }
                    });
                    if let Err(e) = handle_migrations(&mut client).await {
@@ -1750,7 +1895,7 @@ impl ComputeNode {

                            // exit loop
                            ComputeStatus::Failed
-                            | ComputeStatus::TerminationPending
+                            | ComputeStatus::TerminationPending { .. }
                            | ComputeStatus::Terminated => break 'cert_update,

                            // wait
@@ -1874,7 +2019,7 @@ impl ComputeNode {
        let (client, connection) = connect_result.unwrap();
        tokio::spawn(async move {
            if let Err(e) = connection.await {
-                eprintln!("connection error: {}", e);
+                eprintln!("connection error: {e}");
            }
        });
        let result = client
@@ -1916,7 +2061,7 @@ LIMIT 100",
            self.params
                .remote_ext_base_url
                .as_ref()
-                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                .ok_or(DownloadError::BadInput(anyhow!(
                    "Remote extensions storage is not configured",
                )))?;

@@ -2043,7 +2188,7 @@ LIMIT 100",
            db_client
                .simple_query(&query)
                .await
-                .with_context(|| format!("Failed to execute query: {}", query))?;
+                .with_context(|| format!("Failed to execute query: {query}"))?;
        }

        Ok(())
@@ -2070,7 +2215,7 @@ LIMIT 100",
        let version: Option<ExtVersion> = db_client
            .query_opt(version_query, &[&ext_name])
            .await
-            .with_context(|| format!("Failed to execute query: {}", version_query))?
+            .with_context(|| format!("Failed to execute query: {version_query}"))?
            .map(|row| row.get(0));

        // sanitize the inputs as postgres idents.
@@ -2085,14 +2230,14 @@ LIMIT 100",
            db_client
                .simple_query(&query)
                .await
-                .with_context(|| format!("Failed to execute query: {}", query))?;
+                .with_context(|| format!("Failed to execute query: {query}"))?;
        } else {
            let query =
                format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}");
            db_client
                .simple_query(&query)
                .await
-                .with_context(|| format!("Failed to execute query: {}", query))?;
+                .with_context(|| format!("Failed to execute query: {query}"))?;
        }

        Ok(ext_version)
@@ -2112,7 +2257,7 @@ LIMIT 100",
        let remote_extensions = spec
            .remote_extensions
            .as_ref()
-            .ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
+            .ok_or(anyhow!("Remote extensions are not configured"))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
@@ -2191,22 +2336,22 @@ LIMIT 100",
    /// The operation will time out after a specified duration.
    pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
        let state = self.state.lock().unwrap();
-        let old_pageserver_connstr = state
+        let old_pageserver_conninfo = state
            .pspec
            .as_ref()
            .expect("spec must be set")
-            .pageserver_connstr
+            .pageserver_conninfo
            .clone();
        let mut unchanged = true;
        let _ = self
            .state_changed
            .wait_timeout_while(state, duration, |s| {
-                let pageserver_connstr = &s
+                let pageserver_conninfo = &s
                    .pspec
                    .as_ref()
                    .expect("spec must be set")
-                    .pageserver_connstr;
-                unchanged = pageserver_connstr == &old_pageserver_connstr;
+                    .pageserver_conninfo;
+                unchanged = pageserver_conninfo == &old_pageserver_conninfo;
                unchanged
            })
            .unwrap();
@@ -2251,12 +2396,68 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
    Ok(())
 }

-pub fn forward_termination_signal() {
+pub fn forward_termination_signal(dev_mode: bool) {
    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
    if ss_pid != 0 {
        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
        kill(ss_pid, Signal::SIGTERM).ok();
    }
+
+    if !dev_mode {
+        //  Terminate pgbouncer with SIGKILL
+        match pid_file::read(PGBOUNCER_PIDFILE.into()) {
+            Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => {
+                info!("sending SIGKILL to pgbouncer process pid: {}", pid);
+                if let Err(e) = kill(pid, Signal::SIGKILL) {
+                    error!("failed to terminate pgbouncer: {}", e);
+                }
+            }
+            // pgbouncer does not lock the pid file, so we read and kill the process directly
+            Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => {
+                if let Ok(pid_str) = std::fs::read_to_string(PGBOUNCER_PIDFILE) {
+                    if let Ok(pid) = pid_str.trim().parse::<i32>() {
+                        info!(
+                            "sending SIGKILL to pgbouncer process pid: {} (from unlocked pid file)",
+                            pid
+                        );
+                        if let Err(e) = kill(Pid::from_raw(pid), Signal::SIGKILL) {
+                            error!("failed to terminate pgbouncer: {}", e);
+                        }
+                    }
+                } else {
+                    info!("pgbouncer pid file exists but process not running");
+                }
+            }
+            Ok(pid_file::PidFileRead::NotExist) => {
+                info!("pgbouncer pid file not found, process may not be running");
+            }
+            Err(e) => {
+                error!("error reading pgbouncer pid file: {}", e);
+            }
+        }
+
+        // Terminate local_proxy
+        match pid_file::read("/etc/local_proxy/pid".into()) {
+            Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => {
+                info!("sending SIGTERM to local_proxy process pid: {}", pid);
+                if let Err(e) = kill(pid, Signal::SIGTERM) {
+                    error!("failed to terminate local_proxy: {}", e);
+                }
+            }
+            Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => {
+                info!("local_proxy PID file exists but process not running");
+            }
+            Ok(pid_file::PidFileRead::NotExist) => {
+                info!("local_proxy PID file not found, process may not be running");
+            }
+            Err(e) => {
+                error!("error reading local_proxy PID file: {}", e);
+            }
+        }
+    } else {
+        info!("Skipping pgbouncer and local_proxy termination because in dev mode");
+    }
+
    let pg_pid = PG_PID.load(Ordering::SeqCst);
    if pg_pid != 0 {
        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
@@ -2289,3 +2490,21 @@ impl<T: 'static> JoinSetExt<T> for tokio::task::JoinSet<T> {
        })
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::fs::File;
+
+    use super::*;
+
+    /// Parsing the `tests/cluster_spec.json` fixture must be rejected with a
+    /// "duplicate entry in safekeeper_connstrings" error.
+    #[test]
+    fn duplicate_safekeeper_connstring() {
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+
+        // `spec` is not used after the conversion, so it is moved rather than cloned.
+        match ParsedSpec::try_from(spec) {
+            Ok(_p) => panic!("Failed to detect duplicate entry"),
+            Err(e) => assert!(e.starts_with("duplicate entry in safekeeper_connstrings:")),
+        };
+    }
+}
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -51,14 +51,56 @@ pub fn write_postgres_conf(

    // Write the postgresql.conf content from the spec file as is.
    if let Some(conf) = &spec.cluster.postgresql_conf {
-        writeln!(file, "{}", conf)?;
+        writeln!(file, "{conf}")?;
    }

    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
-    if let Some(s) = &spec.pageserver_connstring {
-        writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
+
+    if let Some(conninfo) = &spec.pageserver_connection_info {
+        let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
+        let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
+
+        for shardno in 0..conninfo.shards.len() {
+            let info = conninfo.shards.get(&(shardno as u32)).ok_or_else(|| {
+                anyhow::anyhow!("shard {shardno} missing from pageserver_connection_info shard map")
+            })?;
+
+            if let Some(url) = &info.libpq_url {
+                if let Some(ref mut urls) = libpq_urls {
+                    urls.push(url.clone());
+                }
+            } else {
+                libpq_urls = None
+            }
+            if let Some(url) = &info.grpc_url {
+                if let Some(ref mut urls) = grpc_urls {
+                    urls.push(url.clone());
+                }
+            } else {
+                grpc_urls = None
+            }
+        }
+        if let Some(libpq_urls) = libpq_urls {
+            writeln!(
+                file,
+                "neon.pageserver_connstring={}",
+                escape_conf_value(&libpq_urls.join(","))
+            )?;
+        } else {
+            writeln!(file, "# no neon.pageserver_connstring")?;
+        }
+        if let Some(grpc_urls) = grpc_urls {
+            writeln!(
+                file,
+                "neon.pageserver_grpc_urls={}",
+                escape_conf_value(&grpc_urls.join(","))
+            )?;
+        } else {
+            writeln!(file, "# no neon.pageserver_grpc_urls")?;
+        }
    }
+
    if let Some(stripe_size) = spec.shard_stripe_size {
        writeln!(file, "neon.stripe_size={stripe_size}")?;
    }
@@ -70,7 +112,7 @@ pub fn write_postgres_conf(
        );
        // If generation is given, prepend sk list with g#number:
        if let Some(generation) = spec.safekeepers_generation {
-            write!(neon_safekeepers_value, "g#{}:", generation)?;
+            write!(neon_safekeepers_value, "g#{generation}:")?;
        }
        neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(","));
        writeln!(
@@ -109,8 +151,8 @@ pub fn write_postgres_conf(
        tls::update_key_path_blocking(pgdata_path, tls_config);

        // these are the default, but good to be explicit.
-        writeln!(file, "ssl_cert_file = '{}'", SERVER_CRT)?;
-        writeln!(file, "ssl_key_file = '{}'", SERVER_KEY)?;
+        writeln!(file, "ssl_cert_file = '{SERVER_CRT}'")?;
+        writeln!(file, "ssl_key_file = '{SERVER_KEY}'")?;
    }

    // Locales
@@ -191,8 +233,7 @@ pub fn write_postgres_conf(
                }
                writeln!(
                    file,
-                    "shared_preload_libraries='{}{}'",
-                    libs, extra_shared_preload_libraries
+                    "shared_preload_libraries='{libs}{extra_shared_preload_libraries}'"
                )?;
            } else {
                // Typically, this should be unreacheable,
@@ -244,8 +285,7 @@ pub fn write_postgres_conf(
                }
                writeln!(
                    file,
-                    "shared_preload_libraries='{}{}'",
-                    libs, extra_shared_preload_libraries
+                    "shared_preload_libraries='{libs}{extra_shared_preload_libraries}'"
                )?;
            } else {
                // Typically, this should be unreacheable,
@@ -263,7 +303,7 @@ pub fn write_postgres_conf(
        }
    }

-    writeln!(file, "neon.extension_server_port={}", extension_server_port)?;
+    writeln!(file, "neon.extension_server_port={extension_server_port}")?;

    if spec.drop_subscriptions_before_start {
        writeln!(file, "neon.disable_logical_replication_subscribers=true")?;
@@ -291,7 +331,7 @@ where
 {
    let path = pgdata_path.join("compute_ctl_temp_override.conf");
    let mut file = File::create(path)?;
-    write!(file, "{}", options)?;
+    write!(file, "{options}")?;

    let res = exec();

--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -74,9 +74,11 @@ More specifically, here is an example ext_index.json
 use std::path::Path;
 use std::str;

+use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
 use anyhow::{Context, Result, bail};
 use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
+use postgres_versioninfo::PgMajorVersion;
 use regex::Regex;
 use remote_storage::*;
 use reqwest::StatusCode;
@@ -86,8 +88,6 @@ use tracing::log::warn;
 use url::Url;
 use zstd::stream::read::Decoder;

-use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
-
 fn get_pg_config(argument: &str, pgbin: &str) -> String {
    // gives the result of `pg_config [argument]`
    // where argument is a flag like `--version` or `--sharedir`
@@ -106,7 +106,7 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String {
        .to_string()
 }

-pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion {
+pub fn get_pg_version(pgbin: &str) -> PgMajorVersion {
    // pg_config --version returns a (platform specific) human readable string
    // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
    let human_version = get_pg_config("--version", pgbin);
@@ -114,25 +114,11 @@ pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion {
 }

 pub fn get_pg_version_string(pgbin: &str) -> String {
-    match get_pg_version(pgbin) {
-        PostgresMajorVersion::V14 => "v14",
-        PostgresMajorVersion::V15 => "v15",
-        PostgresMajorVersion::V16 => "v16",
-        PostgresMajorVersion::V17 => "v17",
-    }
-    .to_owned()
+    get_pg_version(pgbin).v_str()
 }

-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum PostgresMajorVersion {
-    V14,
-    V15,
-    V16,
-    V17,
-}
-
-fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
-    use PostgresMajorVersion::*;
+fn parse_pg_version(human_version: &str) -> PgMajorVersion {
+    use PgMajorVersion::*;
    // Normal releases have version strings like "PostgreSQL 15.4". But there
    // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL
    // 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version
@@ -143,10 +129,10 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
        .captures(human_version)
    {
        Some(captures) if captures.len() == 2 => match &captures["major"] {
-            "14" => return V14,
-            "15" => return V15,
-            "16" => return V16,
-            "17" => return V17,
+            "14" => return PG14,
+            "15" => return PG15,
+            "16" => return PG16,
+            "17" => return PG17,
            _ => {}
        },
        _ => {}
@@ -310,10 +296,7 @@ async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Re
 async fn do_extension_server_request(uri: Url) -> Result<Bytes, (String, String)> {
    let resp = reqwest::get(uri).await.map_err(|e| {
        (
-            format!(
-                "could not perform remote extensions server request: {:?}",
-                e
-            ),
+            format!("could not perform remote extensions server request: {e:?}"),
            UNKNOWN_HTTP_STATUS.to_string(),
        )
    })?;
@@ -323,7 +306,7 @@ async fn do_extension_server_request(uri: Url) -> Result<Bytes, (String, String)
        StatusCode::OK => match resp.bytes().await {
            Ok(resp) => Ok(resp),
            Err(e) => Err((
-                format!("could not read remote extensions server response: {:?}", e),
+                format!("could not read remote extensions server response: {e:?}"),
                // It's fine to return and report error with status as 200 OK,
                // because we still failed to read the response.
                status.to_string(),
@@ -334,10 +317,7 @@ async fn do_extension_server_request(uri: Url) -> Result<Bytes, (String, String)
            status.to_string(),
        )),
        _ => Err((
-            format!(
-                "unexpected remote extensions server response status code: {}",
-                status
-            ),
+            format!("unexpected remote extensions server response status code: {status}"),
            status.to_string(),
        )),
    }
@@ -349,25 +329,25 @@ mod tests {

    #[test]
    fn test_parse_pg_version() {
-        use super::PostgresMajorVersion::*;
-        assert_eq!(parse_pg_version("PostgreSQL 15.4"), V15);
-        assert_eq!(parse_pg_version("PostgreSQL 15.14"), V15);
+        use postgres_versioninfo::PgMajorVersion::*;
+        assert_eq!(parse_pg_version("PostgreSQL 15.4"), PG15);
+        assert_eq!(parse_pg_version("PostgreSQL 15.14"), PG15);
        assert_eq!(
            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
-            V15
+            PG15
        );

-        assert_eq!(parse_pg_version("PostgreSQL 14.15"), V14);
-        assert_eq!(parse_pg_version("PostgreSQL 14.0"), V14);
+        assert_eq!(parse_pg_version("PostgreSQL 14.15"), PG14);
+        assert_eq!(parse_pg_version("PostgreSQL 14.0"), PG14);
        assert_eq!(
            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
-            V14
+            PG14
        );

-        assert_eq!(parse_pg_version("PostgreSQL 16devel"), V16);
-        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), V16);
-        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), V16);
-        assert_eq!(parse_pg_version("PostgreSQL 16extra"), V16);
+        assert_eq!(parse_pg_version("PostgreSQL 16devel"), PG16);
+        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), PG16);
+        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), PG16);
+        assert_eq!(parse_pg_version("PostgreSQL 16extra"), PG16);
    }

    #[test]
--- a/compute_tools/src/http/routes/configure.rs
+++ b/compute_tools/src/http/routes/configure.rs
@@ -65,7 +65,7 @@ pub(in crate::http) async fn configure(

            if state.status == ComputeStatus::Failed {
                let err = state.error.as_ref().map_or("unknown error", |x| x);
-                let msg = format!("compute configuration failed: {:?}", err);
+                let msg = format!("compute configuration failed: {err:?}");
                return Err(msg);
            }
        }
--- a/compute_tools/src/http/routes/terminate.rs
+++ b/compute_tools/src/http/routes/terminate.rs
@@ -1,32 +1,49 @@
-use std::sync::Arc;
-
+use crate::compute::{ComputeNode, forward_termination_signal};
+use crate::http::JsonResponse;
 use axum::extract::State;
-use axum::response::{IntoResponse, Response};
-use compute_api::responses::ComputeStatus;
+use axum::response::Response;
+use axum_extra::extract::OptionalQuery;
+use compute_api::responses::{ComputeStatus, TerminateResponse};
 use http::StatusCode;
+use serde::Deserialize;
+use std::sync::Arc;
 use tokio::task;
 use tracing::info;

-use crate::compute::{ComputeNode, forward_termination_signal};
-use crate::http::JsonResponse;
+/// Query parameters accepted by the terminate endpoint.
+#[derive(Deserialize, Default)]
+pub struct TerminateQuery {
+    /// Termination mode to record in the `TerminationPending` status.
+    mode: compute_api::responses::TerminateMode,
+}

 /// Terminate the compute.
-pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {
+pub(in crate::http) async fn terminate(
+    State(compute): State<Arc<ComputeNode>>,
+    OptionalQuery(terminate): OptionalQuery<TerminateQuery>,
+) -> Response {
+    let mode = terminate.unwrap_or_default().mode;
     {
         let mut state = compute.state.lock().unwrap();
         if state.status == ComputeStatus::Terminated {
-            return StatusCode::CREATED.into_response();
+            // Use the same `TerminateResponse` body as the success path, so clients can parse
+            // both 200 and 201 replies identically.
+            let response = TerminateResponse {
+                lsn: state.terminate_flush_lsn,
+            };
+            return JsonResponse::success(StatusCode::CREATED, response);
         }

         if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
             return JsonResponse::invalid_status(state.status);
         }
-
-        state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
-        drop(state);
+        state.set_status(
+            ComputeStatus::TerminationPending { mode },
+            &compute.state_changed,
+        );
     }

-    forward_termination_signal();
+    forward_termination_signal(false);
     info!("sent signal and notified waiters");

     // Spawn a blocking thread to wait for compute to become Terminated.
@@ -34,7 +51,7 @@ pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>)
     // be able to serve other requests while some particular request
     // is waiting for compute to finish configuration.
     let c = compute.clone();
-    task::spawn_blocking(move || {
+    let lsn = task::spawn_blocking(move || {
         let mut state = c.state.lock().unwrap();
         while state.status != ComputeStatus::Terminated {
             state = c.state_changed.wait(state).unwrap();
@@ -44,11 +61,10 @@ pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>)
                 state.status
             );
         }
+        state.terminate_flush_lsn
     })
     .await
     .unwrap();
-
     info!("terminated Postgres");
-
-    StatusCode::OK.into_response()
+    JsonResponse::success(StatusCode::OK, TerminateResponse { lsn })
 }
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -43,7 +43,7 @@ pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExten
        let (mut client, connection) = conf.connect(NoTls).await?;
        tokio::spawn(async move {
            if let Err(e) = connection.await {
-                eprintln!("connection error: {}", e);
+                eprintln!("connection error: {e}");
            }
        });

@@ -57,7 +57,7 @@ pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExten
        let (client, connection) = conf.connect(NoTls).await?;
        tokio::spawn(async move {
            if let Err(e) = connection.await {
-                eprintln!("connection error: {}", e);
+                eprintln!("connection error: {e}");
            }
        });

--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -22,6 +22,7 @@ mod migration;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
+pub mod pgbouncer;
 pub mod rsyslog;
 pub mod spec;
 mod spec_apply;
--- a/compute_tools/src/lsn_lease.rs
+++ b/compute_tools/src/lsn_lease.rs
@@ -4,7 +4,8 @@ use std::thread;
 use std::time::{Duration, SystemTime};

 use anyhow::{Result, bail};
-use compute_api::spec::ComputeMode;
+use compute_api::spec::{ComputeMode, PageserverConnectionInfo};
+use pageserver_page_api as page_api;
 use postgres::{NoTls, SimpleQueryMessage};
 use tracing::{info, warn};
 use utils::id::{TenantId, TimelineId};
@@ -76,25 +77,16 @@ fn acquire_lsn_lease_with_retry(

    loop {
        // Note: List of pageservers is dynamic, need to re-read configs before each attempt.
-        let configs = {
+        let (conninfo, auth) = {
            let state = compute.state.lock().unwrap();
-
            let spec = state.pspec.as_ref().expect("spec must be set");
-
-            let conn_strings = spec.pageserver_connstr.split(',');
-
-            conn_strings
-                .map(|connstr| {
-                    let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
-                    if let Some(storage_auth_token) = &spec.storage_auth_token {
-                        config.password(storage_auth_token.clone());
-                    }
-                    config
-                })
-                .collect::<Vec<_>>()
+            (
+                spec.pageserver_conninfo.clone(),
+                spec.storage_auth_token.clone(),
+            )
        };

-        let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
+        let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
        match result {
            Ok(Some(res)) => {
                return Ok(res);
@@ -116,68 +108,112 @@ fn acquire_lsn_lease_with_retry(
    }
 }

-/// Tries to acquire an LSN lease through PS page_service API.
+/// Tries to acquire LSN leases on all Pageserver shards.
 fn try_acquire_lsn_lease(
+    conninfo: PageserverConnectionInfo,
+    auth: Option<&str>,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    lsn: Lsn,
-    configs: &[postgres::Config],
 ) -> Result<Option<SystemTime>> {
-    fn get_valid_until(
-        config: &postgres::Config,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        lsn: Lsn,
-    ) -> Result<Option<SystemTime>> {
-        let mut client = config.connect(NoTls)?;
-        let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn);
-        let res = client.simple_query(&cmd)?;
-        let msg = match res.first() {
-            Some(msg) => msg,
-            None => bail!("empty response"),
-        };
-        let row = match msg {
-            SimpleQueryMessage::Row(row) => row,
-            _ => bail!("error parsing lsn lease response"),
+    let shard_count = conninfo.shards.len();
+    let mut leases = Vec::new();
+
+    for (shard_number, shard) in conninfo.shards.into_iter() {
+        let tenant_shard_id = match shard_count {
+            0 | 1 => TenantShardId::unsharded(tenant_id),
+            shard_count => TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(shard_number as u8),
+                shard_count: ShardCount::new(shard_count as u8),
+            },
        };

-        // Note: this will be None if a lease is explicitly not granted.
-        let valid_until_str = row.get("valid_until");
-
-        let valid_until = valid_until_str.map(|s| {
-            SystemTime::UNIX_EPOCH
-                .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
-                .expect("Time larger than max SystemTime could handle")
-        });
-        Ok(valid_until)
+        let lease = if conninfo.prefer_grpc {
+            acquire_lsn_lease_grpc(
+                &shard.grpc_url.unwrap(),
+                auth,
+                tenant_shard_id,
+                timeline_id,
+                lsn,
+            )?
+        } else {
+            acquire_lsn_lease_libpq(
+                &shard.libpq_url.unwrap(),
+                auth,
+                tenant_shard_id,
+                timeline_id,
+                lsn,
+            )?
+        };
+        leases.push(lease);
    }

-    let shard_count = configs.len();
+    Ok(leases.into_iter().min().flatten())
+}

-    let valid_until = if shard_count > 1 {
-        configs
-            .iter()
-            .enumerate()
-            .map(|(shard_number, config)| {
-                let tenant_shard_id = TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount::new(shard_count as u8),
-                    shard_number: ShardNumber(shard_number as u8),
-                };
-                get_valid_until(config, tenant_shard_id, timeline_id, lsn)
-            })
-            .collect::<Result<Vec<Option<SystemTime>>>>()?
-            .into_iter()
-            .min()
-            .unwrap()
-    } else {
-        get_valid_until(
-            &configs[0],
-            TenantShardId::unsharded(tenant_id),
-            timeline_id,
-            lsn,
-        )?
+/// Acquires an LSN lease on a single shard, using the libpq API. The connstring must use a
+/// postgresql:// scheme.
+fn acquire_lsn_lease_libpq(
+    connstring: &str,
+    auth: Option<&str>,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+    lsn: Lsn,
+) -> Result<Option<SystemTime>> {
+    let mut config = postgres::Config::from_str(connstring)?;
+    if let Some(auth) = auth {
+        config.password(auth);
+    }
+    let mut client = config.connect(NoTls)?;
+    let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} ");
+    let res = client.simple_query(&cmd)?;
+    let msg = match res.first() {
+        Some(msg) => msg,
+        None => bail!("empty response"),
+    };
+    let row = match msg {
+        SimpleQueryMessage::Row(row) => row,
+        _ => bail!("error parsing lsn lease response"),
    };

+    // Note: this will be None if a lease is explicitly not granted.
+    let valid_until_str = row.get("valid_until");
+
+    let valid_until = valid_until_str.map(|s| {
+        SystemTime::UNIX_EPOCH
+            .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
+            .expect("Time larger than max SystemTime could handle")
+    });
    Ok(valid_until)
 }
+
+/// Acquires an LSN lease on a single shard, using the gRPC API. The connstring must use a
+/// grpc:// scheme.
+fn acquire_lsn_lease_grpc(
+    connstring: &str,
+    auth: Option<&str>,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+    lsn: Lsn,
+) -> Result<Option<SystemTime>> {
+    tokio::runtime::Handle::current().block_on(async move {
+        let mut client = page_api::Client::connect(
+            connstring.to_string(),
+            tenant_shard_id.tenant_id,
+            timeline_id,
+            tenant_shard_id.to_index(),
+            auth.map(String::from),
+            None,
+        )
+        .await?;
+
+        let req = page_api::LeaseLsnRequest { lsn };
+        match client.lease_lsn(req).await {
+            Ok(expires) => Ok(Some(expires)),
+            // Lease couldn't be acquired because the LSN has been garbage collected.
+            Err(err) if err.code() == tonic::Code::FailedPrecondition => Ok(None),
+            Err(err) => Err(err.into()),
+        }
+    })
+}
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -83,7 +83,9 @@ impl ComputeMonitor {
        let compute_status = self.compute.get_status();
        if matches!(
            compute_status,
-            ComputeStatus::Terminated | ComputeStatus::TerminationPending | ComputeStatus::Failed
+            ComputeStatus::Terminated
+                | ComputeStatus::TerminationPending { .. }
+                | ComputeStatus::Failed
        ) {
            info!(
                "compute is in {} status, stopping compute monitor",
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -36,9 +36,9 @@ pub fn escape_literal(s: &str) -> String {
    let res = s.replace('\'', "''").replace('\\', "\\\\");

    if res.contains('\\') {
-        format!("E'{}'", res)
+        format!("E'{res}'")
    } else {
-        format!("'{}'", res)
+        format!("'{res}'")
    }
 }

@@ -46,7 +46,7 @@ pub fn escape_literal(s: &str) -> String {
 /// with `'{}'` is not required, as it returns a ready-to-use config string.
 pub fn escape_conf_value(s: &str) -> String {
    let res = s.replace('\'', "''").replace('\\', "\\\\");
-    format!("'{}'", res)
+    format!("'{res}'")
 }

 pub trait GenericOptionExt {
@@ -446,7 +446,7 @@ pub async fn tune_pgbouncer(
        let mut pgbouncer_connstr =
            "host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string();
        if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") {
-            pgbouncer_connstr.push_str(format!(" password={}", pass).as_str());
+            pgbouncer_connstr.push_str(format!(" password={pass}").as_str());
        }
        pgbouncer_connstr
    };
@@ -464,7 +464,7 @@ pub async fn tune_pgbouncer(
            Ok((client, connection)) => {
                tokio::spawn(async move {
                    if let Err(e) = connection.await {
-                        eprintln!("connection error: {}", e);
+                        eprintln!("connection error: {e}");
                    }
                });
                break client;
--- a/compute_tools/src/pgbouncer.rs
+++ b/compute_tools/src/pgbouncer.rs
@@ -0,0 +1 @@
+pub const PGBOUNCER_PIDFILE: &str = "/tmp/pgbouncer.pid";
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -23,12 +23,12 @@ fn do_control_plane_request(
 ) -> Result<ControlPlaneConfigResponse, (bool, String, String)> {
    let resp = reqwest::blocking::Client::new()
        .get(uri)
-        .header("Authorization", format!("Bearer {}", jwt))
+        .header("Authorization", format!("Bearer {jwt}"))
        .send()
        .map_err(|e| {
            (
                true,
-                format!("could not perform request to control plane: {:?}", e),
+                format!("could not perform request to control plane: {e:?}"),
                UNKNOWN_HTTP_STATUS.to_string(),
            )
        })?;
@@ -39,7 +39,7 @@ fn do_control_plane_request(
            Ok(spec_resp) => Ok(spec_resp),
            Err(e) => Err((
                true,
-                format!("could not deserialize control plane response: {:?}", e),
+                format!("could not deserialize control plane response: {e:?}"),
                status.to_string(),
            )),
        },
@@ -62,7 +62,7 @@ fn do_control_plane_request(
        // or some internal failure happened. Doesn't make much sense to retry in this case.
        _ => Err((
            false,
-            format!("unexpected control plane response status code: {}", status),
+            format!("unexpected control plane response status code: {status}"),
            status.to_string(),
        )),
    }
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -933,56 +933,53 @@ async fn get_operations<'a>(
                PerDatabasePhase::DeleteDBRoleReferences => {
                    let ctx = ctx.read().await;

-                    let operations =
-                        spec.delta_operations
-                            .iter()
-                            .flatten()
-                            .filter(|op| op.action == "delete_role")
-                            .filter_map(move |op| {
-                                if db.is_owned_by(&op.name) {
-                                    return None;
-                                }
-                                if !ctx.roles.contains_key(&op.name) {
-                                    return None;
-                                }
-                                let quoted = op.name.pg_quote();
-                                let new_owner = match &db {
-                                    DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
-                                    DB::UserDB(db) => db.owner.pg_quote(),
-                                };
-                                let (escaped_role, outer_tag) = op.name.pg_quote_dollar();
+                    let operations = spec
+                        .delta_operations
+                        .iter()
+                        .flatten()
+                        .filter(|op| op.action == "delete_role")
+                        .filter_map(move |op| {
+                            if db.is_owned_by(&op.name) {
+                                return None;
+                            }
+                            if !ctx.roles.contains_key(&op.name) {
+                                return None;
+                            }
+                            let quoted = op.name.pg_quote();
+                            let new_owner = match &db {
+                                DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
+                                DB::UserDB(db) => db.owner.pg_quote(),
+                            };
+                            let (escaped_role, outer_tag) = op.name.pg_quote_dollar();

-                                Some(vec![
-                                    // This will reassign all dependent objects to the db owner
-                                    Operation {
-                                        query: format!(
-                                            "REASSIGN OWNED BY {} TO {}",
-                                            quoted, new_owner,
-                                        ),
-                                        comment: None,
-                                    },
-                                    // Revoke some potentially blocking privileges (Neon-specific currently)
-                                    Operation {
-                                        query: format!(
-                                            include_str!("sql/pre_drop_role_revoke_privileges.sql"),
-                                            // N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
-                                            role_name = escaped_role,
-                                            outer_tag = outer_tag,
-                                        ),
-                                        comment: None,
-                                    },
-                                    // This now will only drop privileges of the role
-                                    // TODO: this is obviously not 100% true because of the above case,
-                                    // there could be still some privileges that are not revoked. Maybe this
-                                    // only drops privileges that were granted *by this* role, not *to this* role,
-                                    // but this has to be checked.
-                                    Operation {
-                                        query: format!("DROP OWNED BY {}", quoted),
-                                        comment: None,
-                                    },
-                                ])
-                            })
-                            .flatten();
+                            Some(vec![
+                                // This will reassign all dependent objects to the db owner
+                                Operation {
+                                    query: format!("REASSIGN OWNED BY {quoted} TO {new_owner}",),
+                                    comment: None,
+                                },
+                                // Revoke some potentially blocking privileges (Neon-specific currently)
+                                Operation {
+                                    query: format!(
+                                        include_str!("sql/pre_drop_role_revoke_privileges.sql"),
+                                        // N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
+                                        role_name = escaped_role,
+                                        outer_tag = outer_tag,
+                                    ),
+                                    comment: None,
+                                },
+                                // This now will only drop privileges of the role
+                                // TODO: this is obviously not 100% true because of the above case,
+                                // there could be still some privileges that are not revoked. Maybe this
+                                // only drops privileges that were granted *by this* role, not *to this* role,
+                                // but this has to be checked.
+                                Operation {
+                                    query: format!("DROP OWNED BY {quoted}"),
+                                    comment: None,
+                                },
+                            ])
+                        })
+                        .flatten();

                    Ok(Box::new(operations))
                }
--- a/compute_tools/src/sync_sk.rs
+++ b/compute_tools/src/sync_sk.rs
@@ -27,7 +27,7 @@ pub async fn ping_safekeeper(
    let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
    tokio::spawn(async move {
        if let Err(e) = conn.await {
-            eprintln!("connection error: {}", e);
+            eprintln!("connection error: {e}");
        }
    });

--- a/compute_tools/tests/README.md
+++ b/compute_tools/tests/README.md
@@ -0,0 +1,6 @@
+### Test files
+
+The file `cluster_spec.json` has been copied over from libs/compute_api
+tests, with some edits:
+
+  - the neon.safekeepers setting contains a duplicate value (127.0.0.1:6502 appears twice)
--- a/compute_tools/tests/cluster_spec.json
+++ b/compute_tools/tests/cluster_spec.json
@@ -0,0 +1,245 @@
+{
+  "format_version": 1.0,
+
+  "timestamp": "2021-05-23T18:25:43.511Z",
+  "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
+
+  "cluster": {
+    "cluster_id": "test-cluster-42",
+    "name": "Zenith Test",
+    "state": "restarted",
+    "roles": [
+      {
+        "name": "postgres",
+        "encrypted_password": "6b1d16b78004bbd51fa06af9eda75972",
+        "options": null
+      },
+      {
+        "name": "alexk",
+        "encrypted_password": null,
+        "options": null
+      },
+      {
+        "name": "zenith \"new\"",
+        "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972",
+        "options": null
+      },
+      {
+        "name": "zen",
+        "encrypted_password": "9b1d16b78004bbd51fa06af9eda75972"
+      },
+      {
+        "name": "\"name\";\\n select 1;",
+        "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972"
+      },
+      {
+        "name": "MyRole",
+        "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972"
+      }
+    ],
+    "databases": [
+      {
+        "name": "DB2",
+        "owner": "alexk",
+        "options": [
+          {
+            "name": "LC_COLLATE",
+            "value": "C",
+            "vartype": "string"
+          },
+          {
+            "name": "LC_CTYPE",
+            "value": "C",
+            "vartype": "string"
+          },
+          {
+            "name": "TEMPLATE",
+            "value": "template0",
+            "vartype": "enum"
+          }
+        ]
+      },
+      {
+        "name": "zenith",
+        "owner": "MyRole"
+      },
+      {
+        "name": "zen",
+        "owner": "zen"
+      }
+    ],
+    "settings": [
+      {
+        "name": "fsync",
+        "value": "off",
+        "vartype": "bool"
+      },
+      {
+        "name": "wal_level",
+        "value": "logical",
+        "vartype": "enum"
+      },
+      {
+        "name": "hot_standby",
+        "value": "on",
+        "vartype": "bool"
+      },
+      {
+        "name": "prewarm_lfc_on_startup",
+        "value": "off",
+        "vartype": "bool"
+      },
+      {
+        "name": "neon.safekeepers",
+        "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501,127.0.0.1:6502",
+        "vartype": "string"
+      },
+      {
+        "name": "wal_log_hints",
+        "value": "on",
+        "vartype": "bool"
+      },
+      {
+        "name": "log_connections",
+        "value": "on",
+        "vartype": "bool"
+      },
+      {
+        "name": "shared_buffers",
+        "value": "32768",
+        "vartype": "integer"
+      },
+      {
+        "name": "port",
+        "value": "55432",
+        "vartype": "integer"
+      },
+      {
+        "name": "max_connections",
+        "value": "100",
+        "vartype": "integer"
+      },
+      {
+        "name": "max_wal_senders",
+        "value": "10",
+        "vartype": "integer"
+      },
+      {
+        "name": "listen_addresses",
+        "value": "0.0.0.0",
+        "vartype": "string"
+      },
+      {
+        "name": "wal_sender_timeout",
+        "value": "0",
+        "vartype": "integer"
+      },
+      {
+        "name": "password_encryption",
+        "value": "md5",
+        "vartype": "enum"
+      },
+      {
+        "name": "maintenance_work_mem",
+        "value": "65536",
+        "vartype": "integer"
+      },
+      {
+        "name": "max_parallel_workers",
+        "value": "8",
+        "vartype": "integer"
+      },
+      {
+        "name": "max_worker_processes",
+        "value": "8",
+        "vartype": "integer"
+      },
+      {
+        "name": "neon.tenant_id",
+        "value": "b0554b632bd4d547a63b86c3630317e8",
+        "vartype": "string"
+      },
+      {
+        "name": "max_replication_slots",
+        "value": "10",
+        "vartype": "integer"
+      },
+      {
+        "name": "neon.timeline_id",
+        "value": "2414a61ffc94e428f14b5758fe308e13",
+        "vartype": "string"
+      },
+      {
+        "name": "shared_preload_libraries",
+        "value": "neon",
+        "vartype": "string"
+      },
+      {
+        "name": "synchronous_standby_names",
+        "value": "walproposer",
+        "vartype": "string"
+      },
+      {
+        "name": "neon.pageserver_connstring",
+        "value": "host=127.0.0.1 port=6400",
+        "vartype": "string"
+      },
+      {
+        "name": "test.escaping",
+        "value": "here's a backslash \\ and a quote ' and a double-quote \" hooray",
+        "vartype": "string"
+      }
+    ]
+  },
+  "delta_operations": [
+    {
+      "action": "delete_db",
+      "name": "zenith_test"
+    },
+    {
+      "action": "rename_db",
+      "name": "DB",
+      "new_name": "DB2"
+    },
+    {
+      "action": "delete_role",
+      "name": "zenith2"
+    },
+    {
+      "action": "rename_role",
+      "name": "zenith new",
+      "new_name": "zenith \"new\""
+    }
+  ],
+  "remote_extensions": {
+    "library_index": {
+      "postgis-3": "postgis",
+      "libpgrouting-3.4": "postgis",
+      "postgis_raster-3": "postgis",
+      "postgis_sfcgal-3": "postgis",
+      "postgis_topology-3": "postgis",
+      "address_standardizer-3": "postgis"
+    },
+    "extension_data": {
+      "postgis": {
+        "archive_path": "5834329303/v15/extensions/postgis.tar.zst",
+        "control_data": {
+          "postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n",
+          "pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n",
+          "postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n",
+          "postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n",
+          "postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n",
+          "address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n",
+          "postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n",
+          "address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n"
+        }
+      }
+    },
+    "custom_extensions": [],
+    "public_extensions": ["postgis"]
+  },
+  "pgbouncer_settings": {
+    "default_pool_size": "42",
+    "pool_mode": "session"
+  }
+}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -16,9 +16,9 @@ use std::time::Duration;
 use anyhow::{Context, Result, anyhow, bail};
 use clap::Parser;
 use compute_api::requests::ComputeClaimsScope;
-use compute_api::spec::ComputeMode;
+use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverShardConnectionInfo};
 use control_plane::broker::StorageBroker;
-use control_plane::endpoint::ComputeControlPlane;
+use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
 use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
 use control_plane::local_env;
 use control_plane::local_env::{
@@ -48,7 +48,7 @@ use postgres_connection::parse_host_port;
 use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId};
 use safekeeper_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
-    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
+    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, PgMajorVersion, PgVersionId,
 };
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
 use tokio::task::JoinSet;
@@ -64,7 +64,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: u32 = 17;
+const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17;

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

@@ -169,7 +169,7 @@ struct TenantCreateCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version to use for the initial timeline")]
-    pg_version: u32,
+    pg_version: PgMajorVersion,

    #[clap(
        long,
@@ -292,7 +292,7 @@ struct TimelineCreateCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version")]
-    pg_version: u32,
+    pg_version: PgMajorVersion,
 }

 #[derive(clap::Args)]
@@ -324,7 +324,7 @@ struct TimelineImportCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version of the backup being imported")]
-    pg_version: u32,
+    pg_version: PgMajorVersion,
 }

 #[derive(clap::Subcommand)]
@@ -603,7 +603,15 @@ struct EndpointCreateCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version")]
-    pg_version: u32,
+    pg_version: PgMajorVersion,
+
+    /// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
+    ///
+    /// Specified on creation such that it's retained across reconfiguration and restarts.
+    ///
+    /// NB: not yet supported by computes.
+    #[clap(long)]
+    grpc: bool,

    #[clap(
        long,
@@ -664,6 +672,13 @@ struct EndpointStartCmdArgs {
    #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
    #[arg(default_value = "90s")]
    start_timeout: Duration,
+
+    #[clap(
+        long,
+        help = "Run in development mode, skipping VM-specific operations like process termination",
+        action = clap::ArgAction::SetTrue
+    )]
+    dev: bool,
 }

 #[derive(clap::Args)]
@@ -696,10 +711,9 @@ struct EndpointStopCmdArgs {
    )]
    destroy: bool,

-    #[clap(long, help = "Postgres shutdown mode, passed to \"pg_ctl -m <mode>\"")]
-    #[arg(value_parser(["smart", "fast", "immediate"]))]
-    #[arg(default_value = "fast")]
-    mode: String,
+    #[clap(long, help = "Postgres shutdown mode")]
+    #[clap(default_value = "fast")]
+    mode: EndpointTerminateMode,
 }

 #[derive(clap::Args)]
@@ -905,7 +919,7 @@ fn print_timeline(
            br_sym = "┗━";
        }

-        print!("{} @{}: ", br_sym, ancestor_lsn);
+        print!("{br_sym} @{ancestor_lsn}: ");
    }

    // Finally print a timeline id and name with new line
@@ -1281,7 +1295,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
                    },
                    new_members: None,
                };
-                let pg_version = args.pg_version * 10000;
+                let pg_version = PgVersionId::from(args.pg_version);
                let req = safekeeper_api::models::TimelineCreateRequest {
                    tenant_id,
                    timeline_id,
@@ -1451,6 +1465,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                args.internal_http_port,
                args.pg_version,
                mode,
+                args.grpc,
                !args.update_catalog,
                false,
            )?;
@@ -1489,22 +1504,35 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                )?;
            }

-            let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
-                let conf = env.get_pageserver_conf(pageserver_id).unwrap();
-                let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config");
-                (
-                    vec![(parsed.0, parsed.1.unwrap_or(5432))],
-                    // If caller is telling us what pageserver to use, this is not a tenant which is
-                    // full managed by storage controller, therefore not sharded.
-                    DEFAULT_STRIPE_SIZE,
-                )
+            let (shards, stripe_size) = if let Some(ps_id) = pageserver_id {
+                let conf = env.get_pageserver_conf(ps_id).unwrap();
+                let libpq_url = Some({
+                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
+                    let port = port.unwrap_or(5432);
+                    format!("postgres://no_user@{host}:{port}")
+                });
+                let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    Some(format!("grpc://no_user@{host}:{port}"))
+                } else {
+                    None
+                };
+                let pageserver = PageserverShardConnectionInfo {
+                    libpq_url,
+                    grpc_url,
+                };
+
+                // If caller is telling us what pageserver to use, this is not a tenant which is
+                // fully managed by storage controller, therefore not sharded.
+                (vec![(0, pageserver)], DEFAULT_STRIPE_SIZE)
            } else {
                // Look up the currently attached location of the tenant, and its striping metadata,
                // to pass these on to postgres.
                let storage_controller = StorageController::from_env(env);
                let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
-                let pageservers = futures::future::try_join_all(
-                    locate_result.shards.into_iter().map(|shard| async move {
+                let shards = futures::future::try_join_all(locate_result.shards.into_iter().map(
+                    |shard| async move {
                        if let ComputeMode::Static(lsn) = endpoint.mode {
                            // Initialize LSN leases for static computes.
                            let conf = env.get_pageserver_conf(shard.node_id).unwrap();
@@ -1516,19 +1544,34 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                                .await?;
                        }

-                        anyhow::Ok((
-                            Host::parse(&shard.listen_pg_addr)
-                                .expect("Storage controller reported bad hostname"),
-                            shard.listen_pg_port,
-                        ))
-                    }),
-                )
+                        let libpq_host = Host::parse(&shard.listen_pg_addr)?;
+                        let libpq_port = shard.listen_pg_port;
+                        let libpq_url =
+                            Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
+
+                        let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
+                            let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
+                            Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
+                        } else {
+                            None
+                        };
+                        let pageserver = PageserverShardConnectionInfo {
+                            libpq_url,
+                            grpc_url,
+                        };
+                        anyhow::Ok((shard.shard_id.shard_number.0 as u32, pageserver))
+                    },
+                ))
                .await?;
                let stripe_size = locate_result.shard_params.stripe_size;

-                (pageservers, stripe_size)
+                (shards, stripe_size)
+            };
+            assert!(!shards.is_empty());
+            let pageserver_conninfo = PageserverConnectionInfo {
+                shards: shards.into_iter().collect(),
+                prefer_grpc: endpoint.grpc,
            };
-            assert!(!pageservers.is_empty());

            let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
@@ -1560,11 +1603,12 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    endpoint_storage_addr,
                    safekeepers_generation,
                    safekeepers,
-                    pageservers,
+                    pageserver_conninfo,
                    remote_ext_base_url.as_ref(),
                    stripe_size.0 as usize,
                    args.create_test_user,
                    args.start_timeout,
+                    args.dev,
                )
                .await?;
        }
@@ -1574,12 +1618,27 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .endpoints
                .get(endpoint_id.as_str())
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
-                let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
-                vec![(
-                    pageserver.pg_connection_config.host().clone(),
-                    pageserver.pg_connection_config.port(),
-                )]
+            let shards = if let Some(ps_id) = args.endpoint_pageserver_id {
+                let conf = env.get_pageserver_conf(ps_id)?;
+                let libpq_url = Some({
+                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
+                    let port = port.unwrap_or(5432);
+                    format!("postgres://no_user@{host}:{port}")
+                });
+                let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    Some(format!("grpc://no_user@{host}:{port}"))
+                } else {
+                    None
+                };
+                let pageserver = PageserverShardConnectionInfo {
+                    libpq_url,
+                    grpc_url,
+                };
+                // If caller is telling us what pageserver to use, this is not a tenant which is
+                // fully managed by storage controller, therefore not sharded.
+                vec![(0, pageserver)]
            } else {
                let storage_controller = StorageController::from_env(env);
                storage_controller
@@ -1588,18 +1647,38 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    .shards
                    .into_iter()
                    .map(|shard| {
+                        // Always build the libpq URL; add a gRPC URL if the shard has one.
+                        let libpq_host = Host::parse(&shard.listen_pg_addr).expect("bad hostname");
+                        let libpq_port = shard.listen_pg_port;
+                        let libpq_url =
+                            Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
+
+                        let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
+                            let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
+                            Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
+                        } else {
+                            None
+                        };
                        (
-                            Host::parse(&shard.listen_pg_addr)
-                                .expect("Storage controller reported malformed host"),
-                            shard.listen_pg_port,
+                            shard.shard_id.shard_number.0 as u32,
+                            PageserverShardConnectionInfo {
+                                libpq_url,
+                                grpc_url,
+                            },
                        )
                    })
                    .collect::<Vec<_>>()
            };
+            let pageserver_conninfo = PageserverConnectionInfo {
+                shards: shards.into_iter().collect(),
+                prefer_grpc: endpoint.grpc,
+            };
            // If --safekeepers argument is given, use only the listed
            // safekeeper nodes; otherwise all from the env.
            let safekeepers = parse_safekeepers(&args.safekeepers)?;
-            endpoint.reconfigure(pageservers, None, safekeepers).await?;
+            endpoint
+                .reconfigure(Some(pageserver_conninfo), None, safekeepers, None)
+                .await?;
        }
        EndpointCmd::Stop(args) => {
            let endpoint_id = &args.endpoint_id;
@@ -1607,7 +1686,10 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .endpoints
                .get(endpoint_id)
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            endpoint.stop(&args.mode, args.destroy)?;
+            match endpoint.stop(args.mode, args.destroy).await?.lsn {
+                Some(lsn) => println!("{lsn}"),
+                None => println!("null"),
+            }
        }
        EndpointCmd::GenerateJwt(args) => {
            let endpoint = {
@@ -1689,7 +1771,7 @@ async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) ->
                StopMode::Immediate => true,
            };
            if let Err(e) = get_pageserver(env, args.pageserver_id)?.stop(immediate) {
-                eprintln!("pageserver stop failed: {}", e);
+                eprintln!("pageserver stop failed: {e}");
                exit(1);
            }
        }
@@ -1698,7 +1780,7 @@ async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) ->
            let pageserver = get_pageserver(env, args.pageserver_id)?;
            //TODO what shutdown strategy should we use here?
            if let Err(e) = pageserver.stop(false) {
-                eprintln!("pageserver stop failed: {}", e);
+                eprintln!("pageserver stop failed: {e}");
                exit(1);
            }

@@ -1715,7 +1797,7 @@ async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) ->
            {
                Ok(_) => println!("Page server is up and running"),
                Err(err) => {
-                    eprintln!("Page server is not available: {}", err);
+                    eprintln!("Page server is not available: {err}");
                    exit(1);
                }
            }
@@ -1752,7 +1834,7 @@ async fn handle_storage_controller(
                },
            };
            if let Err(e) = svc.stop(stop_args).await {
-                eprintln!("stop failed: {}", e);
+                eprintln!("stop failed: {e}");
                exit(1);
            }
        }
@@ -1774,7 +1856,7 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
            let safekeeper = get_safekeeper(env, args.id)?;

            if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await {
-                eprintln!("safekeeper start failed: {}", e);
+                eprintln!("safekeeper start failed: {e}");
                exit(1);
            }
        }
@@ -1786,7 +1868,7 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
                StopMode::Immediate => true,
            };
            if let Err(e) = safekeeper.stop(immediate) {
-                eprintln!("safekeeper stop failed: {}", e);
+                eprintln!("safekeeper stop failed: {e}");
                exit(1);
            }
        }
@@ -1799,12 +1881,12 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
            };

            if let Err(e) = safekeeper.stop(immediate) {
-                eprintln!("safekeeper stop failed: {}", e);
+                eprintln!("safekeeper stop failed: {e}");
                exit(1);
            }

            if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await {
-                eprintln!("safekeeper start failed: {}", e);
+                eprintln!("safekeeper start failed: {e}");
                exit(1);
            }
        }
@@ -2039,11 +2121,16 @@ async fn handle_stop_all(args: &StopCmdArgs, env: &local_env::LocalEnv) -> Resul
 }

 async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
+    let mode = if immediate {
+        EndpointTerminateMode::Immediate
+    } else {
+        EndpointTerminateMode::Fast
+    };
    // Stop all endpoints
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
            for (_k, node) in cplane.endpoints {
-                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
+                if let Err(e) = node.stop(mode, false).await {
                    eprintln!("postgres stop failed: {e:#}");
                }
            }
@@ -2055,7 +2142,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {

    let storage = EndpointStorage::from_env(env);
    if let Err(e) = storage.stop(immediate) {
-        eprintln!("endpoint_storage stop failed: {:#}", e);
+        eprintln!("endpoint_storage stop failed: {e:#}");
    }

    for ps_conf in &env.pageservers {
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -37,6 +37,7 @@
 //! ```
 //!
 use std::collections::BTreeMap;
+use std::fmt::Display;
 use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
 use std::path::PathBuf;
 use std::process::Command;
@@ -51,12 +52,17 @@ use compute_api::requests::{
    COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest,
 };
 use compute_api::responses::{
-    ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig,
+    ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TerminateResponse,
+    TlsConfig,
 };
 use compute_api::spec::{
    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
    RemoteExtSpec, Role,
 };
+
+// re-export these, because they're used in the reconfigure() function
+pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
+
 use jsonwebtoken::jwk::{
    AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
    OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
@@ -65,18 +71,17 @@ use nix::sys::signal::{Signal, kill};
 use pageserver_api::shard::ShardStripeSize;
 use pem::Pem;
 use reqwest::header::CONTENT_TYPE;
+use safekeeper_api::PgMajorVersion;
 use safekeeper_api::membership::SafekeeperGeneration;
 use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
 use spki::der::Decode;
 use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
 use tracing::debug;
-use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;
-use crate::storage_controller::StorageController;

 // contents of a endpoint.json file
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
@@ -88,7 +93,8 @@ pub struct EndpointConf {
    pg_port: u16,
    external_http_port: u16,
    internal_http_port: u16,
-    pg_version: u32,
+    pg_version: PgMajorVersion,
+    grpc: bool,
    skip_pg_catalog_updates: bool,
    reconfigure_concurrency: usize,
    drop_subscriptions_before_start: bool,
@@ -190,8 +196,9 @@ impl ComputeControlPlane {
        pg_port: Option<u16>,
        external_http_port: Option<u16>,
        internal_http_port: Option<u16>,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
        mode: ComputeMode,
+        grpc: bool,
        skip_pg_catalog_updates: bool,
        drop_subscriptions_before_start: bool,
    ) -> Result<Arc<Endpoint>> {
@@ -226,6 +233,7 @@ impl ComputeControlPlane {
            // we also skip catalog updates in the cloud.
            skip_pg_catalog_updates,
            drop_subscriptions_before_start,
+            grpc,
            reconfigure_concurrency: 1,
            features: vec![],
            cluster: None,
@@ -244,6 +252,7 @@ impl ComputeControlPlane {
                internal_http_port,
                pg_port,
                pg_version,
+                grpc,
                skip_pg_catalog_updates,
                drop_subscriptions_before_start,
                reconfigure_concurrency: 1,
@@ -298,6 +307,8 @@ pub struct Endpoint {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mode: ComputeMode,
+    /// If true, the endpoint should use gRPC to communicate with Pageservers.
+    pub grpc: bool,

    // port and address of the Postgres server and `compute_ctl`'s HTTP APIs
    pub pg_address: SocketAddr,
@@ -305,7 +316,7 @@ pub struct Endpoint {
    pub internal_http_address: SocketAddr,

    // postgres major version in the format: 14, 15, etc.
-    pg_version: u32,
+    pg_version: PgMajorVersion,

    // These are not part of the endpoint as such, but the environment
    // the endpoint runs in.
@@ -333,15 +344,35 @@ pub enum EndpointStatus {
    RunningNoPidfile,
 }

-impl std::fmt::Display for EndpointStatus {
+impl Display for EndpointStatus {
    fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
-        let s = match self {
+        writer.write_str(match self {
            Self::Running => "running",
            Self::Stopped => "stopped",
            Self::Crashed => "crashed",
            Self::RunningNoPidfile => "running, no pidfile",
-        };
-        write!(writer, "{}", s)
+        })
+    }
+}
+
+#[derive(Default, Clone, Copy, clap::ValueEnum)]
+pub enum EndpointTerminateMode {
+    #[default]
+    /// Use pg_ctl stop -m fast
+    Fast,
+    /// Use pg_ctl stop -m immediate
+    Immediate,
+    /// Use /terminate?mode=immediate
+    ImmediateTerminate,
+}
+
+impl std::fmt::Display for EndpointTerminateMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match &self {
+            EndpointTerminateMode::Fast => "fast",
+            EndpointTerminateMode::Immediate => "immediate",
+            EndpointTerminateMode::ImmediateTerminate => "immediate-terminate",
+        })
    }
 }

@@ -380,6 +411,7 @@ impl Endpoint {
            mode: conf.mode,
            tenant_id: conf.tenant_id,
            pg_version: conf.pg_version,
+            grpc: conf.grpc,
            skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
            reconfigure_concurrency: conf.reconfigure_concurrency,
            drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
@@ -506,7 +538,7 @@ impl Endpoint {
                conf.append("hot_standby", "on");
                // prefetching of blocks referenced in WAL doesn't make sense for us
                // Neon hot standby ignores pages that are not in the shared_buffers
-                if self.pg_version >= 15 {
+                if self.pg_version >= PgMajorVersion::PG15 {
                    conf.append("recovery_prefetch", "off");
                }
            }
@@ -608,14 +640,6 @@ impl Endpoint {
        }
    }

-    fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
-        pageservers
-            .iter()
-            .map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
-            .collect::<Vec<_>>()
-            .join(",")
-    }
-
    /// Map safekeepers ids to the actual connection strings.
    fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
        let mut safekeeper_connstrings = Vec::new();
@@ -656,11 +680,12 @@ impl Endpoint {
        endpoint_storage_addr: String,
        safekeepers_generation: Option<SafekeeperGeneration>,
        safekeepers: Vec<NodeId>,
-        pageservers: Vec<(Host, u16)>,
+        pageserver_conninfo: PageserverConnectionInfo,
        remote_ext_base_url: Option<&String>,
        shard_stripe_size: usize,
        create_test_user: bool,
        start_timeout: Duration,
+        dev: bool,
    ) -> Result<()> {
        if self.status() == EndpointStatus::Running {
            anyhow::bail!("The endpoint is already running");
@@ -674,9 +699,6 @@ impl Endpoint {
            std::fs::remove_dir_all(self.pgdata())?;
        }

-        let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
-        assert!(!pageserver_connstring.is_empty());
-
        let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;

        // check for file remote_extensions_spec.json
@@ -735,7 +757,7 @@ impl Endpoint {
                branch_id: None,
                endpoint_id: Some(self.endpoint_id.clone()),
                mode: self.mode,
-                pageserver_connstring: Some(pageserver_connstring),
+                pageserver_connection_info: Some(pageserver_conninfo),
                safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
                safekeeper_connstrings,
                storage_auth_token: auth_token.clone(),
@@ -794,10 +816,10 @@ impl Endpoint {

        // Launch compute_ctl
        let conn_str = self.connstr("cloud_admin", "postgres");
-        println!("Starting postgres node at '{}'", conn_str);
+        println!("Starting postgres node at '{conn_str}'");
        if create_test_user {
            let conn_str = self.connstr("test", "neondb");
-            println!("Also at '{}'", conn_str);
+            println!("Also at '{conn_str}'");
        }
        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
        cmd.args([
@@ -831,6 +853,10 @@ impl Endpoint {
            cmd.args(["--remote-ext-base-url", remote_ext_base_url]);
        }

+        if dev {
+            cmd.arg("--dev");
+        }
+
        let child = cmd.spawn()?;
        // set up a scopeguard to kill & wait for the child in case we panic or bail below
        let child = scopeguard::guard(child, |mut child| {
@@ -883,7 +909,7 @@ impl Endpoint {
                        ComputeStatus::Empty
                        | ComputeStatus::ConfigurationPending
                        | ComputeStatus::Configuration
-                        | ComputeStatus::TerminationPending
+                        | ComputeStatus::TerminationPending { .. }
                        | ComputeStatus::Terminated => {
                            bail!("unexpected compute status: {:?}", state.status)
                        }
@@ -892,8 +918,7 @@ impl Endpoint {
                Err(e) => {
                    if Instant::now().duration_since(start_at) > start_timeout {
                        return Err(e).context(format!(
-                            "timed out {:?} waiting to connect to compute_ctl HTTP",
-                            start_timeout,
+                            "timed out {start_timeout:?} waiting to connect to compute_ctl HTTP",
                        ));
                    }
                }
@@ -932,7 +957,7 @@ impl Endpoint {
            // reqwest does not export its error construction utility functions, so let's craft the message ourselves
            let url = response.url().to_owned();
            let msg = match response.text().await {
-                Ok(err_body) => format!("Error: {}", err_body),
+                Ok(err_body) => format!("Error: {err_body}"),
                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
            };
            Err(anyhow::anyhow!(msg))
@@ -941,9 +966,10 @@ impl Endpoint {

    pub async fn reconfigure(
        &self,
-        mut pageservers: Vec<(Host, u16)>,
+        pageserver_conninfo: Option<PageserverConnectionInfo>,
        stripe_size: Option<ShardStripeSize>,
        safekeepers: Option<Vec<NodeId>>,
+        safekeeper_generation: Option<SafekeeperGeneration>,
    ) -> Result<()> {
        let (mut spec, compute_ctl_config) = {
            let config_path = self.endpoint_path().join("config.json");
@@ -956,26 +982,15 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

-        // If we weren't given explicit pageservers, query the storage controller
-        if pageservers.is_empty() {
-            let storage_controller = StorageController::from_env(&self.env);
-            let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
-            pageservers = locate_result
-                .shards
-                .into_iter()
-                .map(|shard| {
-                    (
-                        Host::parse(&shard.listen_pg_addr)
-                            .expect("Storage controller reported bad hostname"),
-                        shard.listen_pg_port,
-                    )
-                })
-                .collect::<Vec<_>>();
+        if let Some(pageserver_conninfo) = pageserver_conninfo {
+            // If pageservers are provided, we need to ensure that they are not empty.
+            // This is a requirement for the compute_ctl configuration.
+            anyhow::ensure!(
+                !pageserver_conninfo.shards.is_empty(),
+                "no pageservers provided"
+            );
+            spec.pageserver_connection_info = Some(pageserver_conninfo);
        }
-
-        let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
-        assert!(!pageserver_connstr.is_empty());
-        spec.pageserver_connstring = Some(pageserver_connstr);
        if stripe_size.is_some() {
            spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
        }
@@ -984,6 +999,9 @@ impl Endpoint {
        if let Some(safekeepers) = safekeepers {
            let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
            spec.safekeeper_connstrings = safekeeper_connstrings;
+            if let Some(g) = safekeeper_generation {
+                spec.safekeepers_generation = Some(g.into_inner());
+            }
        }

        let client = reqwest::Client::builder()
@@ -1014,15 +1032,52 @@ impl Endpoint {
        } else {
            let url = response.url().to_owned();
            let msg = match response.text().await {
-                Ok(err_body) => format!("Error: {}", err_body),
+                Ok(err_body) => format!("Error: {err_body}"),
                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
            };
            Err(anyhow::anyhow!(msg))
        }
    }

-    pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> {
-        self.pg_ctl(&["-m", mode, "stop"], &None)?;
+    pub async fn reconfigure_pageservers(
+        &self,
+        pageservers: PageserverConnectionInfo,
+        stripe_size: Option<ShardStripeSize>,
+    ) -> Result<()> {
+        self.reconfigure(Some(pageservers), stripe_size, None, None)
+            .await
+    }
+
+    pub async fn reconfigure_safekeepers(
+        &self,
+        safekeepers: Vec<NodeId>,
+        generation: SafekeeperGeneration,
+    ) -> Result<()> {
+        self.reconfigure(None, None, Some(safekeepers), Some(generation))
+            .await
+    }
+
+    pub async fn stop(
+        &self,
+        mode: EndpointTerminateMode,
+        destroy: bool,
+    ) -> Result<TerminateResponse> {
+        // pg_ctl stop is fast but doesn't allow us to collect LSN. /terminate is
+        // slow, and test runs time out. Solution: special mode "immediate-terminate"
+        // which uses /terminate
+        let response = if let EndpointTerminateMode::ImmediateTerminate = mode {
+            let ip = self.external_http_address.ip();
+            let port = self.external_http_address.port();
+            let url = format!("http://{ip}:{port}/terminate?mode=immediate");
+            let token = self.generate_jwt(Some(ComputeClaimsScope::Admin))?;
+            let request = reqwest::Client::new().post(url).bearer_auth(token);
+            let response = request.send().await.context("/terminate")?;
+            let text = response.text().await.context("/terminate result")?;
+            serde_json::from_str(&text).with_context(|| format!("deserializing {text}"))?
+        } else {
+            self.pg_ctl(&["-m", &mode.to_string(), "stop"], &None)?;
+            TerminateResponse { lsn: None }
+        };

        // Also wait for the compute_ctl process to die. It might have some
        // cleanup work to do after postgres stops, like syncing safekeepers,
@@ -1032,7 +1087,7 @@ impl Endpoint {
        // waiting. Sometimes we do *not* want this cleanup: tests intentionally
        // do stop when majority of safekeepers is down, so sync-safekeepers
        // would hang otherwise. This could be a separate flag though.
-        let send_sigterm = destroy || mode == "immediate";
+        let send_sigterm = destroy || !matches!(mode, EndpointTerminateMode::Fast);
        self.wait_for_compute_ctl_to_exit(send_sigterm)?;
        if destroy {
            println!(
@@ -1041,7 +1096,7 @@ impl Endpoint {
            );
            std::fs::remove_dir_all(self.endpoint_path())?;
        }
-        Ok(())
+        Ok(response)
    }

    pub fn connstr(&self, user: &str, db_name: &str) -> String {
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -12,9 +12,11 @@ use std::{env, fs};

 use anyhow::{Context, bail};
 use clap::ValueEnum;
+use pageserver_api::config::PostHogConfig;
 use pem::Pem;
 use postgres_backend::AuthType;
 use reqwest::{Certificate, Url};
+use safekeeper_api::PgMajorVersion;
 use serde::{Deserialize, Serialize};
 use utils::auth::encode_from_key_file;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
@@ -209,6 +211,12 @@ pub struct NeonStorageControllerConf {
    pub use_https_safekeeper_api: bool,

    pub use_local_compute_notifications: bool,
+
+    pub timeline_safekeeper_count: Option<usize>,
+
+    pub posthog_config: Option<PostHogConfig>,
+
+    pub kick_secondary_downloads: Option<bool>,
 }

 impl NeonStorageControllerConf {
@@ -236,9 +244,12 @@ impl Default for NeonStorageControllerConf {
            heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
            long_reconcile_threshold: None,
            use_https_pageserver_api: false,
-            timelines_onto_safekeepers: false,
+            timelines_onto_safekeepers: true,
            use_https_safekeeper_api: false,
            use_local_compute_notifications: true,
+            timeline_safekeeper_count: None,
+            posthog_config: None,
+            kick_secondary_downloads: None,
        }
    }
 }
@@ -254,7 +265,7 @@ impl Default for EndpointStorageConf {
 impl NeonBroker {
    pub fn client_url(&self) -> Url {
        let url = if let Some(addr) = self.listen_https_addr {
-            format!("https://{}", addr)
+            format!("https://{addr}")
        } else {
            format!(
                "http://{}",
@@ -418,25 +429,21 @@ impl LocalEnv {
        self.pg_distrib_dir.clone()
    }

-    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

-        #[allow(clippy::manual_range_patterns)]
-        match pg_version {
-            14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
-            _ => bail!("Unsupported postgres version: {}", pg_version),
-        }
+        Ok(path.join(pg_version.v_str()))
    }

-    pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result<PathBuf> {
+    pub fn pg_dir(&self, pg_version: PgMajorVersion, dir_name: &str) -> anyhow::Result<PathBuf> {
        Ok(self.pg_distrib_dir(pg_version)?.join(dir_name))
    }

-    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<PathBuf> {
        self.pg_dir(pg_version, "bin")
    }

-    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<PathBuf> {
        self.pg_dir(pg_version, "lib")
    }

@@ -727,7 +734,7 @@ impl LocalEnv {
                let config_toml_path = dentry.path().join("pageserver.toml");
                let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
                    &std::fs::read_to_string(&config_toml_path)
-                        .with_context(|| format!("read {:?}", config_toml_path))?,
+                        .with_context(|| format!("read {config_toml_path:?}"))?,
                )
                .context("parse pageserver.toml")?;
                let identity_toml_path = dentry.path().join("identity.toml");
@@ -737,7 +744,7 @@ impl LocalEnv {
                }
                let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
                    &std::fs::read_to_string(&identity_toml_path)
-                        .with_context(|| format!("read {:?}", identity_toml_path))?,
+                        .with_context(|| format!("read {identity_toml_path:?}"))?,
                )
                .context("parse identity.toml")?;
                let PageserverConfigTomlSubset {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -16,11 +16,13 @@ use std::time::Duration;

 use anyhow::{Context, bail};
 use camino::Utf8PathBuf;
+use pageserver_api::config::{DEFAULT_GRPC_LISTEN_PORT, DEFAULT_HTTP_LISTEN_PORT};
 use pageserver_api::models::{self, TenantInfo, TimelineInfo};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
 use postgres_connection::{PgConnectionConfig, parse_host_port};
+use safekeeper_api::PgMajorVersion;
 use utils::auth::{Claims, Scope};
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
@@ -120,7 +122,7 @@ impl PageServerNode {
                .env
                .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
                .unwrap();
-            overrides.push(format!("control_plane_api_token='{}'", jwt_token));
+            overrides.push(format!("control_plane_api_token='{jwt_token}'"));
        }

        if !conf.other.contains_key("remote_storage") {
@@ -252,9 +254,10 @@ impl PageServerNode {
        // the storage controller
        let metadata_path = datadir.join("metadata.json");

-        let (_http_host, http_port) =
+        let http_host = "localhost".to_string();
+        let (_, http_port) =
            parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
-        let http_port = http_port.unwrap_or(9898);
+        let http_port = http_port.unwrap_or(DEFAULT_HTTP_LISTEN_PORT);

        let https_port = match self.conf.listen_https_addr.as_ref() {
            Some(https_addr) => {
@@ -265,6 +268,13 @@ impl PageServerNode {
            None => None,
        };

+        let (mut grpc_host, mut grpc_port) = (None, None);
+        if let Some(grpc_addr) = &self.conf.listen_grpc_addr {
+            let (_, port) = parse_host_port(grpc_addr).expect("Unable to parse listen_grpc_addr");
+            grpc_host = Some("localhost".to_string());
+            grpc_port = Some(port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT));
+        }
+
        // Intentionally hand-craft JSON: this acts as an implicit format compat test
        // in case the pageserver-side structure is edited, and reflects the real life
        // situation: the metadata is written by some other script.
@@ -273,7 +283,9 @@ impl PageServerNode {
            serde_json::to_vec(&pageserver_api::config::NodeMetadata {
                postgres_host: "localhost".to_string(),
                postgres_port: self.pg_connection_config.port(),
-                http_host: "localhost".to_string(),
+                grpc_host,
+                grpc_port,
+                http_host,
                http_port,
                https_port,
                other: HashMap::from([(
@@ -596,7 +608,7 @@ impl PageServerNode {
        timeline_id: TimelineId,
        base: (Lsn, PathBuf),
        pg_wal: Option<(Lsn, PathBuf)>,
-        pg_version: u32,
+        pg_version: PgMajorVersion,
    ) -> anyhow::Result<()> {
        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -143,7 +143,7 @@ impl SafekeeperNode {
        let id_string = id.to_string();
        // TODO: add availability_zone to the config.
        // Right now we just specify any value here and use it to check metrics in tests.
-        let availability_zone = format!("sk-{}", id_string);
+        let availability_zone = format!("sk-{id_string}");

        let mut args = vec![
            "-D".to_owned(),
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -6,6 +6,8 @@ use std::str::FromStr;
 use std::sync::OnceLock;
 use std::time::{Duration, Instant};

+use crate::background_process;
+use crate::local_env::{LocalEnv, NeonStorageControllerConf};
 use camino::{Utf8Path, Utf8PathBuf};
 use hyper0::Uri;
 use nix::unistd::Pid;
@@ -22,6 +24,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use pem::Pem;
 use postgres_backend::AuthType;
 use reqwest::{Method, Response};
+use safekeeper_api::PgMajorVersion;
 use serde::de::DeserializeOwned;
 use serde::{Deserialize, Serialize};
 use tokio::process::Command;
@@ -31,9 +34,6 @@ use utils::auth::{Claims, Scope, encode_from_key_file};
 use utils::id::{NodeId, TenantId};
 use whoami::username;

-use crate::background_process;
-use crate::local_env::{LocalEnv, NeonStorageControllerConf};
-
 pub struct StorageController {
    env: LocalEnv,
    private_key: Option<Pem>,
@@ -48,7 +48,7 @@ pub struct StorageController {

 const COMMAND: &str = "storage_controller";

-const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
+const STORAGE_CONTROLLER_POSTGRES_VERSION: PgMajorVersion = PgMajorVersion::PG16;

 const DB_NAME: &str = "storage_controller";

@@ -167,7 +167,7 @@ impl StorageController {
    fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
        self.env
            .base_data_dir
-            .join(format!("storage_controller_{}", instance_id))
+            .join(format!("storage_controller_{instance_id}"))
    }

    fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
@@ -184,9 +184,15 @@ impl StorageController {
    /// to other versions if that one isn't found.  Some automated tests create circumstances
    /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
    async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
-        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14];
+        const PREFER_VERSIONS: [PgMajorVersion; 5] = [
+            STORAGE_CONTROLLER_POSTGRES_VERSION,
+            PgMajorVersion::PG16,
+            PgMajorVersion::PG15,
+            PgMajorVersion::PG14,
+            PgMajorVersion::PG17,
+        ];

-        for v in prefer_versions {
+        for v in PREFER_VERSIONS {
            let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
            if tokio::fs::try_exists(&path).await? {
                return Ok(path);
@@ -220,7 +226,7 @@ impl StorageController {
            "-d",
            DB_NAME,
            "-p",
-            &format!("{}", postgres_port),
+            &format!("{postgres_port}"),
        ];
        let pg_lib_dir = self.get_pg_lib_dir().await.unwrap();
        let envs = [
@@ -263,7 +269,7 @@ impl StorageController {
                "-h",
                "localhost",
                "-p",
-                &format!("{}", postgres_port),
+                &format!("{postgres_port}"),
                "-U",
                &username(),
                "-O",
@@ -425,7 +431,7 @@ impl StorageController {
            // from `LocalEnv`'s config file (`.neon/config`).
            tokio::fs::write(
                &pg_data_path.join("postgresql.conf"),
-                format!("port = {}\nfsync=off\n", postgres_port),
+                format!("port = {postgres_port}\nfsync=off\n"),
            )
            .await?;

@@ -477,7 +483,7 @@ impl StorageController {
            self.setup_database(postgres_port).await?;
        }

-        let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
+        let database_url = format!("postgresql://localhost:{postgres_port}/{DB_NAME}");

        // We support running a startup SQL script to fiddle with the database before we launch storcon.
        // This is used by the test suite.
@@ -508,7 +514,7 @@ impl StorageController {
        drop(client);
        conn.await??;

-        let addr = format!("{}:{}", host, listen_port);
+        let addr = format!("{host}:{listen_port}");
        let address_for_peers = Uri::builder()
            .scheme(scheme)
            .authority(addr.clone())
@@ -557,6 +563,10 @@ impl StorageController {
            args.push("--use-local-compute-notifications".to_string());
        }

+        if let Some(value) = self.config.kick_secondary_downloads {
+            args.push(format!("--kick-secondary-downloads={value}"));
+        }
+
        if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() {
            args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
        }
@@ -628,6 +638,28 @@ impl StorageController {
            args.push("--timelines-onto-safekeepers".to_string());
        }

+        // neon_local is used in test environments where we often have less than 3 safekeepers.
+        if self.config.timeline_safekeeper_count.is_some() || self.env.safekeepers.len() < 3 {
+            let sk_cnt = self
+                .config
+                .timeline_safekeeper_count
+                .unwrap_or(self.env.safekeepers.len());
+
+            args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
+        }
+
+        let mut envs = vec![
+            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+        ];
+
+        if let Some(posthog_config) = &self.config.posthog_config {
+            envs.push((
+                "POSTHOG_CONFIG".to_string(),
+                serde_json::to_string(posthog_config)?,
+            ));
+        }
+
        println!("Starting storage controller");

        background_process::start_process(
@@ -635,10 +667,7 @@ impl StorageController {
            &instance_dir,
            &self.env.storage_controller_bin(),
            args,
-            vec![
-                ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-            ],
+            envs,
            background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
            &start_args.start_timeout,
            || async {
@@ -802,9 +831,9 @@ impl StorageController {
            builder = builder.json(&body)
        }
        if let Some(private_key) = &self.private_key {
-            println!("Getting claims for path {}", path);
+            println!("Getting claims for path {path}");
            if let Some(required_claims) = Self::get_claims_for_path(&path)? {
-                println!("Got claims {:?} for path {}", required_claims, path);
+                println!("Got claims {required_claims:?} for path {path}");
                let jwt_token = encode_from_key_file(&required_claims, private_key)?;
                builder = builder.header(
                    reqwest::header::AUTHORIZATION,
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -36,6 +36,10 @@ enum Command {
        listen_pg_addr: String,
        #[arg(long)]
        listen_pg_port: u16,
+        #[arg(long)]
+        listen_grpc_addr: Option<String>,
+        #[arg(long)]
+        listen_grpc_port: Option<u16>,

        #[arg(long)]
        listen_http_addr: String,
@@ -418,6 +422,8 @@ async fn main() -> anyhow::Result<()> {
            node_id,
            listen_pg_addr,
            listen_pg_port,
+            listen_grpc_addr,
+            listen_grpc_port,
            listen_http_addr,
            listen_http_port,
            listen_https_port,
@@ -431,6 +437,8 @@ async fn main() -> anyhow::Result<()> {
                        node_id,
                        listen_pg_addr,
                        listen_pg_port,
+                        listen_grpc_addr,
+                        listen_grpc_port,
                        listen_http_addr,
                        listen_http_port,
                        listen_https_port,
@@ -641,7 +649,7 @@ async fn main() -> anyhow::Result<()> {
                response
                    .new_shards
                    .iter()
-                    .map(|s| format!("{:?}", s))
+                    .map(|s| format!("{s:?}"))
                    .collect::<Vec<_>>()
                    .join(",")
            );
@@ -763,8 +771,8 @@ async fn main() -> anyhow::Result<()> {

            println!("Tenant {tenant_id}");
            let mut table = comfy_table::Table::new();
-            table.add_row(["Policy", &format!("{:?}", policy)]);
-            table.add_row(["Stripe size", &format!("{:?}", stripe_size)]);
+            table.add_row(["Policy", &format!("{policy:?}")]);
+            table.add_row(["Stripe size", &format!("{stripe_size:?}")]);
            table.add_row(["Config", &serde_json::to_string_pretty(&config).unwrap()]);
            println!("{table}");
            println!("Shards:");
@@ -781,7 +789,7 @@ async fn main() -> anyhow::Result<()> {
                let secondary = shard
                    .node_secondary
                    .iter()
-                    .map(|n| format!("{}", n))
+                    .map(|n| format!("{n}"))
                    .collect::<Vec<_>>()
                    .join(",");

@@ -855,7 +863,7 @@ async fn main() -> anyhow::Result<()> {
                }
            } else {
                // Make it obvious to the user that since they've omitted an AZ, we're clearing it
-                eprintln!("Clearing preferred AZ for tenant {}", tenant_id);
+                eprintln!("Clearing preferred AZ for tenant {tenant_id}");
            }

            // Construct a request that modifies all the tenant's shards
@@ -1126,8 +1134,7 @@ async fn main() -> anyhow::Result<()> {
                    Err((tenant_shard_id, from, to, error)) => {
                        failure += 1;
                        println!(
-                            "Failed to migrate {} from node {} to node {}: {}",
-                            tenant_shard_id, from, to, error
+                            "Failed to migrate {tenant_shard_id} from node {from} to node {to}: {error}"
                        );
                    }
                }
@@ -1269,8 +1276,7 @@ async fn main() -> anyhow::Result<()> {
            concurrency,
        } => {
            let mut path = format!(
-                "/v1/tenant/{}/timeline/{}/download_heatmap_layers",
-                tenant_shard_id, timeline_id,
+                "/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
            );

            if let Some(c) = concurrency {
@@ -1295,8 +1301,7 @@ async fn watch_tenant_shard(
 ) -> anyhow::Result<()> {
    if let Some(until_migrated_to) = until_migrated_to {
        println!(
-            "Waiting for tenant shard {} to be migrated to node {}",
-            tenant_shard_id, until_migrated_to
+            "Waiting for tenant shard {tenant_shard_id} to be migrated to node {until_migrated_to}"
        );
    }

@@ -1319,7 +1324,7 @@ async fn watch_tenant_shard(
            "attached: {} secondary: {} {}",
            shard
                .node_attached
-                .map(|n| format!("{}", n))
+                .map(|n| format!("{n}"))
                .unwrap_or("none".to_string()),
            shard
                .node_secondary
@@ -1333,15 +1338,12 @@ async fn watch_tenant_shard(
                "(reconciler idle)"
            }
        );
-        println!("{}", summary);
+        println!("{summary}");

        // Maybe drop out if we finished migration
        if let Some(until_migrated_to) = until_migrated_to {
            if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling {
-                println!(
-                    "Tenant shard {} is now on node {}",
-                    tenant_shard_id, until_migrated_to
-                );
+                println!("Tenant shard {tenant_shard_id} is now on node {until_migrated_to}");
                break;
            }
        }
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -95,3 +95,4 @@ echo "Start compute node"
     -b /usr/local/bin/postgres                              \
     --compute-id "compute-${RANDOM}"                          \
     --config "${CONFIG_FILE}"
+     --dev
--- a/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md
+++ b/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md
@@ -0,0 +1,396 @@
+# Memo: Endpoint Persistent Unlogged Files Storage
+Created on 2024-11-05
+Implemented on N/A
+
+## Summary
+A design for a storage system that allows storage of files required to make
+Neon's Endpoints have a better experience at or after a reboot.
+
+## Motivation
+Several systems inside PostgreSQL (and Neon) need some persistent storage for
+optimal workings across reboots and restarts, but still work without.
+Examples are the query-level statistics files of `pg_stat_statements` in
+`pg_stat/pg_stat_statements.stat`, and `pg_prewarm`'s `autoprewarm.blocks`.
+We need a storage system that can store and manage these files for each
+Endpoint, without necessarily granting users access to an unlimited storage
+device.
+
+## Goals
+- Store known files for Endpoints with reasonable persistence.  
+  _Data loss in this service, while annoying and bad for UX, won't lose any
+  customer's data._
+
+## Non Goals (if relevant)
+- This storage system does not need branching, file versioning, or other such
+  features. The files are as ephemeral to the timeline of the data as the
+  Endpoints that host the data.
+- This storage system does not need to store _all_ user files, only 'known'
+  user files.
+- This storage system does not need to be hosted fully inside Computes.  
+  _Instead, this will be a separate component similar to Pageserver,
+  SafeKeeper, the S3 proxy used for dynamically loaded extensions, etc._
+
+## Impacted components
+- Compute needs new code to load and store these files in its lifetime.
+- Control Plane needs to consider this new storage system when signalling
+  the deletion of an Endpoint, Timeline, or Tenant.
+- Control Plane needs to consider this new storage system when it resets
+  or re-assigns an endpoint's timeline/branch state.
+
+A new service is created: the Endpoint Persistent Unlogged Files Storage
+service.  This could be integrated in e.g. Pageserver or Control Plane, or a
+separately hosted service.
+
+## Proposed implementation
+Endpoint-related data files are managed by a newly designed service (which
+optionally is integrated in an existing service like Pageserver or Control
+Plane), which stores data directly into S3 or any blob storage of choice.
+
+Upon deletion of the Endpoint, or reassignment of the endpoint to a different
+branch, this ephemeral data is dropped: the data stored may not match the
+state of the branch's data after reassignment, and on endpoint deletion the
+data won't have any use to the user.
+
+Compute gets credentials (JWT token with Tenant, Timeline & Endpoint claims)
+which it can use to authenticate to this new service and retrieve and store
+data associated with this endpoint.  This limited scope reduces leaks of data
+across endpoints and timeline resets, and limits the ability of endpoints to
+mess with other endpoints' data.
+
+The path of this endpoint data in S3 is initially as follows:
+
+    s3://<regional-epufs-bucket>/
+      tenants/
+        <hex-tenant-id>/
+          timelines/
+            <hex-timeline-id>/
+              endpoints/
+                <endpoint-id>/
+                  pgdata/
+                    <file_path_in_pgdatadir>
+
+For other blob storages an equivalent or similar path can be constructed.
+
+### Reliability, failure modes and corner cases (if relevant)
+Reliability is important, but not critical to the workings of Neon.  The data
+stored in this service will, when lost, reduce performance, but won't be a
+cause of permanent data loss - only operational metadata is stored.
+
+Most, if not all, blob storage services have sufficiently high persistence
+guarantees to cater to our need for persistence and uptime. The only concern with
+blob storages is that the access latency is generally higher than local disk,
+but for the object types stored (cache state, ...) I don't think this will be
+much of an issue.
+
+### Interaction/Sequence diagram (if relevant)
+
+In these diagrams you can replace S3 with any persistent storage device of
+choice, but S3 is chosen as representative name: The well-known and short name
+of AWS' blob storage. Azure Blob Storage should work too, but it has a much
+longer name making it less practical for the diagrams.
+
+Write data:
+
+```http
+POST /tenants/<tenant-id>/timelines/<tl-id>/endpoints/<endpoint-id>/pgdata/<the-pgdata-path>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "version": "<opaque>", # opaque file version token, changes when the file contents change
+  "size": <bytes>,
+}
+```
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant co as Compute
+    participant ep as EPUFS
+    participant s3 as Blob Storage
+
+    co-->ep: Connect with credentials
+    co->>+ep: Store Unlogged Persistent File
+    opt is authenticated
+        ep->>s3: Write UPF to S3
+    end
+    ep->>-co: OK / Failure / Auth Failure
+    co-->ep: Cancel connection
+```
+
+Read data: (optionally with cache-relevant request parameters, e.g. If-Modified-Since)
+```http
+GET /tenants/<tenant-id>/timelines/<tl-id>/endpoints/<endpoint-id>/pgdata/<the-pgdata-path>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+
+<file data>
+```
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant co as Compute
+    participant ep as EPUFS
+    participant s3 as Blob Storage
+
+    co->>+ep: Read Unlogged Persistent File
+    opt is authenticated
+        ep->>+s3: Request UPF from storage
+        s3->>-ep: Receive UPF from storage
+    end
+    ep->>-co: OK(response) / Failure(storage, auth, ...)
+```
+
+Compute Startup:
+```mermaid
+sequenceDiagram
+    autonumber
+    participant co as Compute
+    participant ps as Pageserver
+    participant ep as EPUFS
+    participant es as Extension server
+
+    note over co: Bind endpoint ep-xxx
+    par Get basebackup
+        co->>+ps: Request basebackup @ LSN
+        ps-)ps: Construct basebackup
+        ps->>-co: Receive basebackup TAR @ LSN
+    and Get startup-critical Unlogged Persistent Files
+        co->>+ep: Get all UPFs of endpoint ep-xxx
+        ep-)ep: Retrieve and gather all UPFs
+        ep->>-co: TAR of UPFs
+    and Get startup-critical extensions
+        loop For every startup-critical extension
+            co->>es: Get critical extension
+            es->>co: Receive critical extension
+        end
+    end
+    note over co: Start compute
+```
+
+CPlane ops:
+```http
+DELETE /tenants/<tenant-id>/timelines/<timeline-id>/endpoints/<endpoint-id>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "tenant": "<tenant-id>",
+  "timeline": "<timeline-id>",
+  "endpoint": "<endpoint-id>",
+  "deleted": {
+    "files": <count>,
+    "bytes": <count>,
+  },
+}
+```
+
+```http
+DELETE /tenants/<tenant-id>/timelines/<timeline-id>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "tenant": "<tenant-id>",
+  "timeline": "<timeline-id>",
+  "deleted": {
+    "files": <count>,
+    "bytes": <count>,
+  },
+}
+```
+
+```http
+DELETE /tenants/<tenant-id>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "tenant": "<tenant-id>",
+  "deleted": {
+    "files": <count>,
+    "bytes": <count>,
+  },
+}
+```
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant cp as Control Plane
+    participant ep as EPUFS
+    participant s3 as Blob Storage
+
+    alt Tenant deleted
+        cp-)ep: Tenant deleted
+        loop For every object associated with removed tenant
+            ep->>s3: Remove data of deleted tenant from Storage
+        end
+        opt
+            ep-)cp: Tenant cleanup complete
+        end
+    else Timeline deleted
+        cp-)ep: Timeline deleted
+        loop For every object associated with removed timeline
+            ep->>s3: Remove data of deleted timeline from Storage
+        end
+        opt
+            ep-)cp: Timeline cleanup complete
+        end
+    else Endpoint reassigned or removed
+        cp->>+ep: Endpoint reassigned
+        loop For every object associated with reassigned/removed endpoint
+            ep->>s3: Remove data from Storage
+        end
+        ep->>-cp: Cleanup complete
+    end
+```
+
+### Scalability (if relevant)
+
+Provisionally:  As this service is going to be part of compute startup, this
+service should be able to quickly respond to all requests.  Therefore this
+service is deployed to every AZ we host Computes in, and Computes communicate
+(generally) only to the EPUFS endpoint of the AZ they're hosted in.
+
+Local caching of frequently restarted endpoints' data or metadata may be
+needed for best performance.  However, due to the regional nature of stored
+data but zonal nature of the service deployment, we should be careful when we
+implement any local caching, as it is possible that computes in AZ 1 will
+update data originally written and thus cached by AZ 2.  Cache version tests
+and invalidation are therefore required if we want to roll out caching to this
+service, which is too broad a scope for an MVC.  This is why caching is left
+out of scope for this RFC, and should be considered separately after this RFC
+is implemented.
+
+### Security implications (if relevant)
+This service must be able to authenticate users at least by Tenant ID,
+Timeline ID and Endpoint ID. This will use the existing JWT infrastructure of
+Compute, which will be upgraded to the extent needed to support Timeline- and
+Endpoint-based claims.
+
+The service requires unlimited access to (a prefix of) a blob storage bucket,
+and thus must be hosted outside the Compute VM sandbox.
+
+A service that generates pre-signed request URLs for Compute to download the
+data from that URL is likely problematic, too:  Compute would be able to write
+unlimited data to the bucket, or exfiltrate this signed URL to get read/write
+access to specific objects in this bucket, which would still effectively give
+users access to the S3 bucket (but with improved access logging).
+
+There may be a use case for transferring data associated with one endpoint to
+another endpoint (e.g. to make one endpoint warm its caches with the state of
+another endpoint), but that's not currently in scope, and specific needs may
+be solved through out-of-line communication of data or pre-signed URLs.
+
+### Unresolved questions (if relevant)
+Caching of files is not in the implementation scope of the document, but
+should at some future point be considered to maximize performance.
+
+## Alternative implementation (if relevant)
+Several ideas have come up to solve this issue:
+
+### Use AUXfile
+One prevalent idea was to WAL-log the files using our AUXfile mechanism.
+
+Benefits:
+
+- We already have this storage mechanism
+
+Demerits:
+
+- It isn't available on read replicas
+- Additional WAL will be consumed during shutdown and after the shutdown
+  checkpoint, which needs PG modifications to work without panics.
+- It increases the data we need to manage in our versioned storage, thus
+  causing higher storage costs with higher retention due to duplication at
+  the storage layer.
+
+### Sign URLs for read/write operations, instead of proxying them
+
+Benefits:
+
+- The service can be implemented with a much reduced IO budget
+
+Demerits:
+
+- Users could get access to these signed credentials
+- Not all blob storage services may implement URL signing
+
+### Give endpoints each their own directly accessed block volume
+
+Benefits:
+
+- Easier to integrate for PostgreSQL
+
+Demerits:
+
+- Little control on data size and contents
+- Potentially problematic as we'd need to store data all across the pgdata
+  directory.
+- EBS is not a good candidate
+   - Attaches in 10s of seconds, if not more; i.e. too cold to start
+   - Shared EBS volumes are a no-go, as you'd have to schedule the endpoint
+     with users of the same EBS volumes, which can't work with VM migration
+   - EBS storage costs are very high (>80$/kilotenant when using a
+     volume/tenant)
+   - EBS volumes can't be mounted across AZ boundaries
+- Bucket per endpoint is unfeasible
+   - S3 buckets are priced at $20/month per 1k, which we could better spend
+     on developers.
+   - Allocating service accounts takes time (100s of ms), and service accounts
+     are a limited resource, too; so they're not a good candidate to allocate
+     on a per-endpoint basis.
+   - Giving credentials limited to prefix has similar issues as the pre-signed
+     URL approach.
+   - Bucket DNS lookup will fill DNS caches and put pressure on DNS lookup
+     much more than our current systems would.
+- Volumes bound by hypervisor are unlikely
+   - This requires significant investment and increased software on the
+     hypervisor.
+   - It is unclear if we can attach volumes after boot, i.e. for pooled
+     instances.
+
+### Put the files into a table
+
+Benefits:
+
+ + Mostly already available in PostgreSQL
+
+Demerits:
+
+ - Uses WAL
+   - Can't be used after shutdown checkpoint
+   - Needs a RW endpoint, and table & catalog access to write to this data
+ - Gets hit with DB size limitations
+ - Depending on user access:
+   - Inaccessible:  
+     The user doesn't have control over database size caused by
+     these systems.
+   - Accessible:  
+     The user can corrupt these files and cause the system to crash while
+     user-corrupted files are present, thus increasing on-call overhead.
+
+## Definition of Done (if relevant)
+
+This project is done if we have:
+
+- One S3 bucket equivalent per region, which stores this per-endpoint data.
+- A new service endpoint in at least every AZ, which indirectly grants
+  endpoints access to the data stored for these endpoints in these buckets.
+- Compute writes & reads temp-data at shutdown and startup, respectively, for
+  at least the pg_prewarm or lfc_prewarm state files.
+- Cleanup of endpoint data is triggered when the endpoint is deleted or is
+  detached from its current timeline.
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -374,7 +374,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
            let request = Request::builder()
                .uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key"))
                .method(method)
-                .header("Authorization", format!("Bearer {}", token))
+                .header("Authorization", format!("Bearer {token}"))
                .body(Body::empty())
                .unwrap();
            let status = ServiceExt::ready(&mut app)
--- a/endpoint_storage/src/main.rs
+++ b/endpoint_storage/src/main.rs
@@ -31,13 +31,12 @@ struct Args {
 }

 #[derive(serde::Deserialize)]
-#[serde(tag = "type")]
 struct Config {
    #[serde(default = "listen")]
    listen: std::net::SocketAddr,
    pemfile: camino::Utf8PathBuf,
    #[serde(flatten)]
-    storage_config: remote_storage::RemoteStorageConfig,
+    storage_kind: remote_storage::TypedRemoteStorageKind,
    #[serde(default = "max_upload_file_limit")]
    max_upload_file_limit: usize,
 }
@@ -70,7 +69,8 @@ async fn main() -> anyhow::Result<()> {
    let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
    info!("listening on {}", listener.local_addr().unwrap());

-    let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?;
+    let storage =
+        remote_storage::GenericRemoteStorage::from_storage_kind(config.storage_kind).await?;
    let cancel = tokio_util::sync::CancellationToken::new();
    if !args.no_s3_check_on_startup {
        app::check_storage_permissions(&storage, cancel.clone()).await?;
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -12,6 +12,7 @@ jsonwebtoken.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 regex.workspace = true
+url.workspace = true

 utils = { path = "../utils" }
 remote_storage = { version = "0.1", path = "../remote_storage/" }
--- a/libs/compute_api/src/requests.rs
+++ b/libs/compute_api/src/requests.rs
@@ -16,6 +16,7 @@ pub static COMPUTE_AUDIENCE: &str = "compute";
 pub enum ComputeClaimsScope {
    /// An admin-scoped token allows access to all of `compute_ctl`'s authorized
    /// facilities.
+    #[serde(rename = "compute_ctl:admin")]
    Admin,
 }

@@ -24,7 +25,7 @@ impl FromStr for ComputeClaimsScope {

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
-            "admin" => Ok(ComputeClaimsScope::Admin),
+            "compute_ctl:admin" => Ok(ComputeClaimsScope::Admin),
            _ => Err(anyhow::anyhow!("invalid compute claims scope \"{s}\"")),
        }
    }
@@ -80,3 +81,23 @@ pub struct SetRoleGrantsRequest {
    pub privileges: Vec<Privilege>,
    pub role: PgIdent,
 }
+
+#[cfg(test)]
+mod test {
+    use std::str::FromStr;
+
+    use crate::requests::ComputeClaimsScope;
+
+    /// Confirm that whether we parse the scope by string or through serde, the
+    /// same values parse to the same enum variant.
+    #[test]
+    fn compute_request_scopes() {
+        const ADMIN_SCOPE: &str = "compute_ctl:admin";
+
+        let from_serde: ComputeClaimsScope =
+            serde_json::from_str(&format!("\"{ADMIN_SCOPE}\"")).unwrap();
+        let from_str = ComputeClaimsScope::from_str(ADMIN_SCOPE).unwrap();
+
+        assert_eq!(from_serde, from_str);
+    }
+}
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -83,6 +83,16 @@ pub struct ComputeStatusResponse {
    pub error: Option<String>,
 }

+/// How long `/terminate` waits before returning.
+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq, Default)]
+#[serde(rename_all = "snake_case")]
+pub enum TerminateMode {
+    /// Wait 30s before returning from /terminate, to allow the control plane to get the error.
+    /// This is the default mode.
+    #[default]
+    Fast,
+    /// Return from /terminate immediately, as soon as all components are terminated.
+    Immediate,
+}
+
 #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
@@ -103,11 +113,16 @@ pub enum ComputeStatus {
    // control-plane to terminate it.
    Failed,
    // Termination requested
-    TerminationPending,
+    TerminationPending { mode: TerminateMode },
    // Terminated Postgres
    Terminated,
 }

+#[derive(Deserialize, Serialize)]
+pub struct TerminateResponse {
+    pub lsn: Option<utils::lsn::Lsn>,
+}
+
 impl Display for ComputeStatus {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
@@ -117,7 +132,7 @@ impl Display for ComputeStatus {
            ComputeStatus::Running => f.write_str("running"),
            ComputeStatus::Configuration => f.write_str("configuration"),
            ComputeStatus::Failed => f.write_str("failed"),
-            ComputeStatus::TerminationPending => f.write_str("termination-pending"),
+            ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"),
            ComputeStatus::Terminated => f.write_str("terminated"),
        }
    }
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -4,11 +4,14 @@
 //! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
 //! compute_ctl can fetch it by calling the control plane's API.
 use std::collections::HashMap;
+use std::fmt::Display;

+use anyhow::anyhow;
 use indexmap::IndexMap;
 use regex::Regex;
 use remote_storage::RemotePath;
 use serde::{Deserialize, Serialize};
+use url::Url;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -102,7 +105,11 @@ pub struct ComputeSpec {
    // updated to fill these fields, we can make these non optional.
    pub tenant_id: Option<TenantId>,
    pub timeline_id: Option<TimelineId>,
-    pub pageserver_connstring: Option<String>,
+
+    // Pageserver information can be passed in two different ways:
+    // 1. Here
+    // 2. in cluster.settings. This is legacy, we are switching to method 1.
+    pub pageserver_connection_info: Option<PageserverConnectionInfo>,

    // More neon ids that we expose to the compute_ctl
    // and to postgres as neon extension GUCs.
@@ -202,6 +209,20 @@ pub enum ComputeFeature {
    UnknownFeature,
 }

+/// Connection information for the pageserver shards used by a compute.
+#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct PageserverConnectionInfo {
+    /// Per-shard connection details.
+    // NOTE(review): the `u32` key is presumably the shard number/index — confirm.
+    pub shards: HashMap<u32, PageserverShardConnectionInfo>,
+
+    // NOTE(review): presumably selects `grpc_url` over `libpq_url` when a shard
+    // provides both — confirm against the consumer.
+    pub prefer_grpc: bool,
+}
+
+/// Connection URLs for a single pageserver shard. Either URL may be absent.
+#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct PageserverShardConnectionInfo {
+    /// URL for the libpq-based protocol, if available.
+    pub libpq_url: Option<String>,
+    /// URL for the gRPC-based protocol, if available.
+    pub grpc_url: Option<String>,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -319,6 +340,12 @@ impl ComputeMode {
    }
 }

+impl Display for ComputeMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.to_type_str())
+    }
+}
+
 /// Log level for audit logging
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
@@ -429,6 +456,47 @@ pub struct JwksSettings {
    pub jwt_audience: Option<String>,
 }

+/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
+#[derive(Clone, Copy, Debug, Default)]
+pub enum PageserverProtocol {
+    /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
+    #[default]
+    Libpq,
+    /// A newer, gRPC-based protocol. Uses grpc:// scheme.
+    Grpc,
+}
+
+impl PageserverProtocol {
+    /// Determines the protocol from the scheme of `connstring`.
+    ///
+    /// A connstring without any scheme yields the default protocol (libpq). A connstring
+    /// that fails to parse as a URL for any other reason, or whose scheme is not one of
+    /// `postgresql`, `postgres` or `grpc`, is an error.
+    pub fn from_connstring(connstring: &str) -> anyhow::Result<Self> {
+        let parsed = match Url::parse(connstring) {
+            Ok(parsed) => parsed,
+            // No scheme at all: fall back to the default protocol.
+            Err(url::ParseError::RelativeUrlWithoutBase) => return Ok(Self::default()),
+            Err(err) => return Err(anyhow!("invalid connstring URL: {err}")),
+        };
+
+        let scheme = parsed.scheme().to_lowercase();
+        if scheme == "postgresql" || scheme == "postgres" {
+            Ok(Self::Libpq)
+        } else if scheme == "grpc" {
+            Ok(Self::Grpc)
+        } else {
+            Err(anyhow!("invalid protocol scheme: {scheme}"))
+        }
+    }
+
+    /// The URL scheme to use when building a connstring for this protocol.
+    pub fn scheme(&self) -> &'static str {
+        match self {
+            Self::Grpc => "grpc",
+            Self::Libpq => "postgresql",
+        }
+    }
+}
+
+impl Display for PageserverProtocol {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.scheme())
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use std::fs::File;
--- a/libs/desim/src/executor.rs
+++ b/libs/desim/src/executor.rs
@@ -71,7 +71,7 @@ impl Runtime {
                    debug!("thread panicked: {:?}", e);
                    let mut result = ctx.result.lock();
                    if result.0 == -1 {
-                        *result = (256, format!("thread panicked: {:?}", e));
+                        *result = (256, format!("thread panicked: {e:?}"));
                    }
                });
            }
@@ -419,13 +419,13 @@ pub fn now() -> u64 {
    with_thread_context(|ctx| ctx.clock.get().unwrap().now())
 }

-pub fn exit(code: i32, msg: String) {
+pub fn exit(code: i32, msg: String) -> ! {
    with_thread_context(|ctx| {
        ctx.allow_panic.store(true, Ordering::SeqCst);
        let mut result = ctx.result.lock();
        *result = (code, msg);
        panic!("exit");
-    });
+    })
 }

 pub(crate) fn get_thread_ctx() -> Arc<ThreadContext> {
--- a/libs/desim/src/proto.rs
+++ b/libs/desim/src/proto.rs
@@ -47,8 +47,8 @@ impl Debug for AnyMessage {
        match self {
            AnyMessage::None => write!(f, "None"),
            AnyMessage::InternalConnect => write!(f, "InternalConnect"),
-            AnyMessage::Just32(v) => write!(f, "Just32({})", v),
-            AnyMessage::ReplCell(v) => write!(f, "ReplCell({:?})", v),
+            AnyMessage::Just32(v) => write!(f, "Just32({v})"),
+            AnyMessage::ReplCell(v) => write!(f, "ReplCell({v:?})"),
            AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)),
            AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)),
        }
--- a/libs/http-utils/src/endpoint.rs
+++ b/libs/http-utils/src/endpoint.rs
@@ -582,14 +582,14 @@ pub fn attach_openapi_ui(
                            deepLinking: true,
                            showExtensions: true,
                            showCommonExtensions: true,
-                            url: "{}",
+                            url: "{spec_mount_path}",
                        }})
                        window.ui = ui;
                    }};
                </script>
                </body>
                </html>
-            "#, spec_mount_path))).unwrap())
+            "#))).unwrap())
             })
        )
 }
@@ -696,7 +696,7 @@ mod tests {
        let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
        let mut service = builder.build(remote_addr);
        if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
-            panic!("request service is not ready: {:?}", e);
+            panic!("request service is not ready: {e:?}");
        }

        let mut req: Request<Body> = Request::default();
@@ -716,7 +716,7 @@ mod tests {
        let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
        let mut service = builder.build(remote_addr);
        if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
-            panic!("request service is not ready: {:?}", e);
+            panic!("request service is not ready: {e:?}");
        }

        let req: Request<Body> = Request::default();
--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -6,8 +6,13 @@ license.workspace = true

 [dependencies]
 thiserror.workspace = true
-nix.workspace=true
+nix.workspace = true
+spin.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }

+[dev-dependencies]
+rand = "0.9.1"
+rand_distr = "0.5.1"
+
 [target.'cfg(target_os = "macos")'.dependencies]
 tempfile = "3.14.0"
--- a/libs/neon-shmem/src/hash.rs
+++ b/libs/neon-shmem/src/hash.rs
@@ -0,0 +1,367 @@
+//! Hash table implementation on top of 'shmem'
+//!
+//! Features required in the long run by the communicator project:
+//!
+//! [X] Accessible from both Postgres processes and rust threads in the communicator process
+//! [X] Low latency
+//! [ ] Scalable to lots of concurrent accesses (currently uses a single spinlock)
+//! [ ] Resizable
+
+use std::fmt::Debug;
+use std::hash::Hash;
+use std::mem::MaybeUninit;
+use std::ops::Deref;
+
+use crate::shmem::ShmemHandle;
+
+use spin;
+
+mod core;
+
+#[cfg(test)]
+mod tests;
+
+use core::CoreHashMap;
+
+/// The decision returned by the closure passed to `update_with_fn` and
+/// `update_with_fn_at_bucket`.
+pub enum UpdateAction<V> {
+    /// Leave the entry (or its absence) as it is.
+    Nothing,
+    /// Store the given value under the key, replacing any existing value.
+    Insert(V),
+    /// Remove the existing entry. The update functions panic if there is none.
+    Remove,
+}
+
+#[derive(Debug)]
+pub struct OutOfMemoryError();
+
+pub struct HashMapInit<'a, K, V> {
+    // Hash table can be allocated in a fixed memory area, or in a resizeable ShmemHandle.
+    shmem_handle: Option<ShmemHandle>,
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+}
+
+pub struct HashMapAccess<'a, K, V> {
+    shmem_handle: Option<ShmemHandle>,
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+}
+
+unsafe impl<'a, K: Sync, V: Sync> Sync for HashMapAccess<'a, K, V> {}
+unsafe impl<'a, K: Send, V: Send> Send for HashMapAccess<'a, K, V> {}
+
+impl<'a, K, V> HashMapInit<'a, K, V> {
+    pub fn attach_writer(self) -> HashMapAccess<'a, K, V> {
+        HashMapAccess {
+            shmem_handle: self.shmem_handle,
+            shared_ptr: self.shared_ptr,
+        }
+    }
+
+    pub fn attach_reader(self) -> HashMapAccess<'a, K, V> {
+        // no difference to attach_writer currently
+        self.attach_writer()
+    }
+}
+
+// This is stored in the shared memory area
+struct HashMapShared<'a, K, V> {
+    inner: spin::RwLock<CoreHashMap<'a, K, V>>,
+}
+
+impl<'a, K, V> HashMapInit<'a, K, V>
+where
+    K: Clone + Hash + Eq,
+{
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        // add some margin to cover alignment etc.
+        CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
+    }
+
+    pub fn init_in_fixed_area(
+        num_buckets: u32,
+        area: &'a mut [MaybeUninit<u8>],
+    ) -> HashMapInit<'a, K, V> {
+        Self::init_common(num_buckets, None, area.as_mut_ptr().cast(), area.len())
+    }
+
+    /// Initialize a new hash map in the given shared memory area
+    pub fn init_in_shmem(num_buckets: u32, mut shmem: ShmemHandle) -> HashMapInit<'a, K, V> {
+        let size = Self::estimate_size(num_buckets);
+        shmem
+            .set_size(size)
+            .expect("could not resize shared memory area");
+
+        let ptr = unsafe { shmem.data_ptr.as_mut() };
+        Self::init_common(num_buckets, Some(shmem), ptr, size)
+    }
+
+    fn init_common(
+        num_buckets: u32,
+        shmem_handle: Option<ShmemHandle>,
+        area_ptr: *mut u8,
+        area_len: usize,
+    ) -> HashMapInit<'a, K, V> {
+        // carve out HashMapShared from the area. This does not include the hashmap's dictionary
+        // and buckets.
+        let mut ptr: *mut u8 = area_ptr;
+        ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
+        let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
+        ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
+
+        // the rest of the space is given to the hash map's dictionary and buckets
+        let remaining_area = unsafe {
+            std::slice::from_raw_parts_mut(ptr, area_len - ptr.offset_from(area_ptr) as usize)
+        };
+
+        let hashmap = CoreHashMap::new(num_buckets, remaining_area);
+        unsafe {
+            std::ptr::write(
+                shared_ptr,
+                HashMapShared {
+                    inner: spin::RwLock::new(hashmap),
+                },
+            );
+        }
+
+        HashMapInit {
+            shmem_handle,
+            shared_ptr,
+        }
+    }
+}
+
+impl<'a, K, V> HashMapAccess<'a, K, V>
+where
+    K: Clone + Hash + Eq,
+{
+    /// Look up `key`. The returned guard dereferences to the value and keeps the
+    /// map's read lock held for as long as it is alive.
+    pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, K, V>> {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let guard = shared.inner.read();
+
+        // Turn the reference into a raw pointer so that the guard can be moved into the
+        // result; on a miss the guard is simply dropped.
+        let value = std::ptr::from_ref(guard.get(key)?);
+        Some(ValueReadGuard {
+            _lock_guard: guard,
+            value,
+        })
+    }
+
+    /// Insert a value, unless the key is already present.
+    ///
+    /// Returns `Ok(true)` if the value was inserted, and `Ok(false)` if the key already
+    /// existed, in which case the existing value is left untouched.
+    pub fn insert(&self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
+        let mut inserted: Option<bool> = None;
+
+        self.update_with_fn(key, |existing| match existing {
+            Some(_) => {
+                inserted = Some(false);
+                UpdateAction::Nothing
+            }
+            None => {
+                inserted = Some(true);
+                UpdateAction::Insert(value)
+            }
+        })?;
+        Ok(inserted.expect("value_fn not called"))
+    }
+
+    /// Remove value. Returns true if it existed
+    pub fn remove(&self, key: &K) -> bool {
+        let mut result = false;
+        self.update_with_fn(key, |existing| match existing {
+            Some(_) => {
+                result = true;
+                UpdateAction::Remove
+            }
+            None => UpdateAction::Nothing,
+        })
+        .expect("out of memory while removing");
+        result
+    }
+
+    /// Update key using the given function. All the other modifying operations are based on this.
+    ///
+    /// `value_fn` is called exactly once, with the current value of `key` (if any), while
+    /// the map's write lock is held. The action it returns is then applied.
+    ///
+    /// # Errors
+    ///
+    /// Returns `OutOfMemoryError` if the action is `Insert` and there is no free bucket
+    /// left to store the new entry.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the action is `Remove` but the key has no value.
+    pub fn update_with_fn<F>(&self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let mut lock_guard = map.inner.write();
+
+        let old_val = lock_guard.get(key);
+        let action = value_fn(old_val);
+        match (old_val, action) {
+            (_, UpdateAction::Nothing) => {}
+            (_, UpdateAction::Insert(new_val)) => {
+                // Report a full table to the caller instead of silently dropping the value.
+                lock_guard
+                    .insert(key, new_val)
+                    .map_err(|_| OutOfMemoryError())?;
+            }
+            (None, UpdateAction::Remove) => panic!("Remove action with no old value"),
+            (Some(_), UpdateAction::Remove) => {
+                // Removing an existing entry never fails.
+                let _ = lock_guard.remove(key);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Update the entry stored in the bucket at index `pos` using the given function.
+    ///
+    /// `value_fn` is called with the value currently stored in that bucket, or `None` if
+    /// the bucket is unused or `pos` is out of range, while the map's write lock is held.
+    /// Only `Nothing` and `Remove` are supported here. As written, this always returns `Ok`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `value_fn` returns `Insert` (there is no key to insert under), or if it
+    /// returns `Remove` for an empty bucket.
+    pub fn update_with_fn_at_bucket<F>(
+        &self,
+        pos: usize,
+        value_fn: F,
+    ) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let mut lock_guard = map.inner.write();
+
+        let old_val = lock_guard.get_bucket(pos);
+        let action = value_fn(old_val.map(|(_k, v)| v));
+        match (old_val, action) {
+            (_, UpdateAction::Nothing) => {}
+            (_, UpdateAction::Insert(_new_val)) => panic!("cannot insert without key"),
+            (None, UpdateAction::Remove) => panic!("Remove action with no old value"),
+            (Some((key, _value)), UpdateAction::Remove) => {
+                // Clone the key so that it no longer borrows from the map while we modify it.
+                let key = key.clone();
+                let _ = lock_guard.remove(&key);
+            }
+        }
+
+        Ok(())
+    }
+
+    pub fn get_num_buckets(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        map.inner.read().get_num_buckets()
+    }
+
+    /// Return a guard for the value stored in the bucket with the given index, or `None`
+    /// if the bucket is unused or the index is out of range. This can be used to iterate
+    /// through the hash map. (An Iterator might be nicer. The communicator's clock
+    /// algorithm needs to _slowly_ iterate through all buckets with its clock hand,
+    /// without holding a lock. If we switch to an Iterator, it must not hold the lock.)
+    pub fn get_bucket<'e>(&'e self, pos: usize) -> Option<ValueReadGuard<'e, K, V>> {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let guard = shared.inner.read();
+
+        let (_key, val_ref) = guard.get_bucket(pos)?;
+        // Keep only a raw pointer, so that the guard can be moved into the result.
+        let value = std::ptr::from_ref(val_ref);
+        Some(ValueReadGuard {
+            _lock_guard: guard,
+            value,
+        })
+    }
+
+    // for metrics
+    pub fn get_num_buckets_in_use(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        map.inner.read().buckets_in_use as usize
+    }
+
+    /// Grow the hash table to `num_buckets` buckets.
+    ///
+    /// 1. grow the underlying shared memory area
+    /// 2. Initialize new buckets. This overwrites the current dictionary
+    /// 3. Recalculate the dictionary
+    ///
+    /// The write lock is held for the whole operation. Growing to the current size is a
+    /// no-op.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the shared memory area cannot be resized.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `num_buckets` is smaller than the current number of buckets, or if the
+    /// table needs to grow but lives in a fixed-size area.
+    pub fn grow(&self, num_buckets: u32) -> Result<(), crate::shmem::Error> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let mut lock_guard = map.inner.write();
+        let inner = &mut *lock_guard;
+        let old_num_buckets = inner.buckets.len() as u32;
+
+        if num_buckets < old_num_buckets {
+            panic!("grow called with a smaller number of buckets");
+        }
+        if num_buckets == old_num_buckets {
+            return Ok(());
+        }
+        let shmem_handle = self
+            .shmem_handle
+            .as_ref()
+            .expect("grow called on a fixed-size hash table");
+
+        let size_bytes = HashMapInit::<K, V>::estimate_size(num_buckets);
+        shmem_handle.set_size(size_bytes)?;
+        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+
+        // The old free list is appended after the chain of new buckets. Only keep its head
+        // if it refers to an existing bucket, so that the chain always terminates.
+        let old_free_head = if inner.free_head < old_num_buckets {
+            inner.free_head
+        } else {
+            core::INVALID_POS
+        };
+
+        // Initialize new buckets. The new buckets are linked to the free list. NB: This overwrites
+        // the dictionary!
+        let buckets_ptr = inner.buckets.as_mut_ptr();
+        unsafe {
+            for i in old_num_buckets..num_buckets {
+                let bucket_ptr = buckets_ptr.add(i as usize);
+                bucket_ptr.write(core::Bucket {
+                    hash: 0,
+                    // Every new bucket links to the next one, except the last, which
+                    // continues with the previously free buckets.
+                    next: if i < num_buckets - 1 {
+                        i + 1
+                    } else {
+                        old_free_head
+                    },
+                    inner: None,
+                });
+            }
+        }
+
+        // Recalculate the dictionary
+        let buckets;
+        let dictionary;
+        unsafe {
+            let buckets_end_ptr = buckets_ptr.add(num_buckets as usize);
+            let dictionary_ptr: *mut u32 = buckets_end_ptr
+                .byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
+                .cast();
+            // Measure from the (aligned) start of the dictionary, so that the slice can
+            // never extend past the end of the area.
+            let dictionary_size: usize =
+                end_ptr.byte_offset_from(dictionary_ptr) as usize / size_of::<u32>();
+
+            buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
+            dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
+        }
+        for item in dictionary.iter_mut() {
+            *item = core::INVALID_POS;
+        }
+
+        // Re-link every bucket that is in use into its chain in the new dictionary.
+        #[allow(clippy::needless_range_loop)]
+        for i in 0..old_num_buckets as usize {
+            if buckets[i].inner.is_none() {
+                continue;
+            }
+            // Must be the same hash-to-slot mapping that CoreHashMap uses for lookups.
+            let pos: usize = buckets[i].hash as usize % dictionary.len();
+            buckets[i].next = dictionary[pos];
+            dictionary[pos] = i as u32;
+        }
+
+        // Finally, update the CoreHashMap struct
+        inner.dictionary = dictionary;
+        inner.buckets = buckets;
+        inner.free_head = old_num_buckets;
+
+        Ok(())
+    }
+
+    // TODO: Shrinking is a multi-step process that requires co-operation from the caller
+    //
+    // 1. The caller must first call begin_shrink(). That forbids allocation of higher-numbered
+    // buckets.
+    //
+    // 2. Next, the caller must evict all entries in higher-numbered buckets.
+    //
+    // 3. Finally, call finish_shrink(). This recomputes the dictionary and shrinks the underlying
+    //    shmem area
+}
+
+pub struct ValueReadGuard<'a, K, V> {
+    _lock_guard: spin::RwLockReadGuard<'a, CoreHashMap<'a, K, V>>,
+    value: *const V,
+}
+
+impl<'a, K, V> Deref for ValueReadGuard<'a, K, V> {
+    type Target = V;
+
+    fn deref(&self) -> &Self::Target {
+        // SAFETY: The `lock_guard` ensures that the underlying map (and thus the value pointed to
+        // by `value`) remains valid for the lifetime `'a`. The `value` has been obtained from a
+        // valid reference within the map.
+        unsafe { &*self.value }
+    }
+}
--- a/libs/neon-shmem/src/hash/core.rs
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -0,0 +1,233 @@
+//! Simple hash table with chaining
+//!
+//! # Resizing
+//!
+
+use std::hash::{DefaultHasher, Hash, Hasher};
+use std::mem::MaybeUninit;
+
+pub(crate) const INVALID_POS: u32 = u32::MAX;
+
+// Bucket
+pub(crate) struct Bucket<K, V> {
+    pub(crate) hash: u64,
+    pub(crate) next: u32,
+    pub(crate) inner: Option<(K, V)>,
+}
+
+pub(crate) struct CoreHashMap<'a, K, V> {
+    pub(crate) dictionary: &'a mut [u32],
+    pub(crate) buckets: &'a mut [Bucket<K, V>],
+    pub(crate) free_head: u32,
+
+    // metrics
+    pub(crate) buckets_in_use: u32,
+}
+
+pub struct FullError();
+
+impl<'a, K, V> CoreHashMap<'a, K, V>
+where
+    K: Clone + Hash + Eq,
+{
+    const FILL_FACTOR: f32 = 0.60;
+
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        let mut size = 0;
+
+        // buckets
+        size += size_of::<Bucket<K, V>>() * num_buckets as usize;
+
+        // dictionary
+        size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
+            as usize;
+
+        size
+    }
+
+    /// Create an empty hash map in the given memory area.
+    ///
+    /// The start of the area (after alignment) holds `num_buckets` buckets, and all the
+    /// remaining space is used for the dictionary. All buckets start out on the free list,
+    /// and all dictionary slots start out empty.
+    ///
+    /// # Panics
+    ///
+    /// Panics if there is no room left for at least one dictionary slot after the buckets.
+    pub fn new(num_buckets: u32, area: &'a mut [u8]) -> CoreHashMap<'a, K, V> {
+        let len = area.len();
+
+        let mut ptr: *mut u8 = area.as_mut_ptr();
+        let end_ptr: *mut u8 = unsafe { area.as_mut_ptr().add(len) };
+
+        // carve out the buckets
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<Bucket<K, V>>())) };
+        let buckets_ptr = ptr;
+        ptr = unsafe { ptr.add(size_of::<Bucket<K, V>>() * num_buckets as usize) };
+
+        // use remaining space for the dictionary
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
+        let dictionary_ptr = ptr;
+
+        assert!(ptr.addr() < end_ptr.addr());
+        let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
+        assert!(dictionary_size > 0);
+
+        // Initialize the buckets, chaining them all together into the free list
+        let buckets = {
+            let buckets_ptr: *mut MaybeUninit<Bucket<K, V>> = buckets_ptr.cast();
+            let buckets =
+                unsafe { std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize) };
+            let count = buckets.len();
+            for (i, slot) in buckets.iter_mut().enumerate() {
+                slot.write(Bucket {
+                    hash: 0,
+                    next: if i + 1 < count {
+                        i as u32 + 1
+                    } else {
+                        INVALID_POS
+                    },
+                    inner: None,
+                });
+            }
+            // TODO: use std::slice::assume_init_mut() once it stabilizes
+            unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) }
+        };
+
+        // Initialize the dictionary
+        let dictionary = {
+            let dictionary_ptr: *mut MaybeUninit<u32> = dictionary_ptr.cast();
+            let dictionary =
+                unsafe { std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size as usize) };
+
+            for item in dictionary.iter_mut() {
+                item.write(INVALID_POS);
+            }
+            // TODO: use std::slice::assume_init_mut() once it stabilizes
+            unsafe {
+                std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
+            }
+        };
+
+        CoreHashMap {
+            dictionary,
+            buckets,
+            // With no buckets at all the free list is empty. Pointing it at bucket 0 would
+            // make the first insert index out of bounds instead of returning FullError.
+            free_head: if num_buckets > 0 { 0 } else { INVALID_POS },
+            buckets_in_use: 0,
+        }
+    }
+
+    /// Look up the value stored for `key` by walking the chain of its dictionary slot.
+    pub fn get(&self, key: &K) -> Option<&V> {
+        let mut hasher = DefaultHasher::new();
+        key.hash(&mut hasher);
+        let hash = hasher.finish();
+
+        let mut pos = self.dictionary[hash as usize % self.dictionary.len()];
+        while pos != INVALID_POS {
+            let bucket = &self.buckets[pos as usize];
+            match bucket.inner.as_ref().expect("entry is in use") {
+                (bucket_key, bucket_value) if bucket_key == key => return Some(bucket_value),
+                _ => pos = bucket.next,
+            }
+        }
+        // reached the end of the chain without finding the key
+        None
+    }
+
+    /// Store `value` under `key`, overwriting the value of an existing entry.
+    ///
+    /// Returns `FullError` if the key is new and there is no free bucket left for it.
+    pub fn insert(&mut self, key: &K, value: V) -> Result<(), FullError> {
+        let mut hasher = DefaultHasher::new();
+        key.hash(&mut hasher);
+        let hash = hasher.finish();
+        let slot = hash as usize % self.dictionary.len();
+
+        let first = self.dictionary[slot];
+        if first == INVALID_POS {
+            // no existing entry; alloc_bucket fails with FullError if the table is full
+            let pos = self.alloc_bucket(key.clone(), value, hash)?;
+            self.dictionary[slot] = pos;
+            return Ok(());
+        }
+
+        let mut next = first;
+        loop {
+            let bucket = &mut self.buckets[next as usize];
+            let (bucket_key, bucket_value) = bucket.inner.as_mut().expect("entry is in use");
+            if bucket_key == key {
+                // found existing entry, update its value
+                *bucket_value = value;
+                return Ok(());
+            }
+
+            let link = bucket.next;
+            if link == INVALID_POS {
+                // No existing entry found. Append to the chain
+                let pos = self.alloc_bucket(key.clone(), value, hash)?;
+                self.buckets[next as usize].next = pos;
+                return Ok(());
+            }
+            next = link;
+        }
+    }
+
+    /// Remove the entry for `key`, if there is one, and return its bucket to the free list.
+    ///
+    /// Removing a key that is not present is not an error. As written, this always
+    /// returns `Ok`.
+    pub fn remove(&mut self, key: &K) -> Result<(), FullError> {
+        let mut hasher = DefaultHasher::new();
+        key.hash(&mut hasher);
+        let hash = hasher.finish();
+
+        let mut next = self.dictionary[hash as usize % self.dictionary.len()];
+        // Position of the previous bucket in the chain, or INVALID_POS while `next` is
+        // still the head of the chain.
+        let mut prev_pos: u32 = INVALID_POS;
+        loop {
+            if next == INVALID_POS {
+                // no existing entry
+                return Ok(());
+            }
+            let bucket = &mut self.buckets[next as usize];
+            let (bucket_key, _) = bucket.inner.as_mut().expect("entry is in use");
+            if bucket_key == key {
+                // found existing entry, unlink it from the chain
+                if prev_pos == INVALID_POS {
+                    // it was the head of the chain: the dictionary slot takes over its successor
+                    self.dictionary[hash as usize % self.dictionary.len()] = bucket.next;
+                } else {
+                    self.buckets[prev_pos as usize].next = bucket.next;
+                }
+
+                // and add it to the freelist
+                let bucket = &mut self.buckets[next as usize];
+                bucket.hash = 0;
+                // this drops the stored key and value
+                bucket.inner = None;
+                bucket.next = self.free_head;
+                self.free_head = next;
+                self.buckets_in_use -= 1;
+                return Ok(());
+            }
+            prev_pos = next;
+            next = bucket.next;
+        }
+    }
+
+    pub fn get_num_buckets(&self) -> usize {
+        self.buckets.len()
+    }
+
+    pub fn get_bucket(&self, pos: usize) -> Option<&(K, V)> {
+        if pos >= self.buckets.len() {
+            return None;
+        }
+
+        self.buckets[pos].inner.as_ref()
+    }
+
+    /// Take a bucket off the free list and store the given entry in it.
+    ///
+    /// Returns the position of the bucket, or `FullError` if the free list is empty.
+    /// The bucket's `next` link is reset; linking it into a chain is up to the caller.
+    fn alloc_bucket(&mut self, key: K, value: V, hash: u64) -> Result<u32, FullError> {
+        let pos = self.free_head;
+        if pos == INVALID_POS {
+            return Err(FullError());
+        }
+
+        let bucket = &mut self.buckets[pos as usize];
+        // pop the bucket from the head of the free list
+        self.free_head = bucket.next;
+        self.buckets_in_use += 1;
+
+        bucket.hash = hash;
+        bucket.next = INVALID_POS;
+        bucket.inner = Some((key, value));
+
+        Ok(pos)
+    }
+}
--- a/libs/neon-shmem/src/hash/tests.rs
+++ b/libs/neon-shmem/src/hash/tests.rs
@@ -0,0 +1,220 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::{Debug, Formatter};
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::hash::HashMapAccess;
+use crate::hash::HashMapInit;
+use crate::hash::UpdateAction;
+use crate::shmem::ShmemHandle;
+
+use rand::seq::SliceRandom;
+use rand::{Rng, RngCore};
+use rand_distr::Zipf;
+
+const TEST_KEY_LEN: usize = 16;
+
+/// Fixed-size key of `TEST_KEY_LEN` bytes used by the hash map tests.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+/// Read the key bytes back as a big-endian integer.
+impl From<&TestKey> for u128 {
+    fn from(key: &TestKey) -> u128 {
+        let TestKey(bytes) = key;
+        u128::from_be_bytes(*bytes)
+    }
+}
+
+/// Build a key from the big-endian byte representation of an integer.
+impl From<u128> for TestKey {
+    fn from(number: u128) -> TestKey {
+        let bytes = number.to_be_bytes();
+        TestKey(bytes)
+    }
+}
+
+/// Build a key from a byte slice.
+///
+/// Panics if the slice is not exactly `TEST_KEY_LEN` bytes long.
+impl<'a> From<&'a [u8]> for TestKey {
+    fn from(bytes: &'a [u8]) -> TestKey {
+        let fixed: [u8; TEST_KEY_LEN] = bytes.try_into().unwrap();
+        TestKey(fixed)
+    }
+}
+
+/// Insert every key of `keys` into a freshly created shared-memory hash map, using the key's
+/// position in the slice as its value, then read every key back and check that the stored
+/// value is that position.
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    const MAX_MEM_SIZE: usize = 10000000;
+    let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
+
+    let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(100000, shmem);
+    let w = init_struct.attach_writer();
+
+    // Phase 1: populate the map
+    for (idx, k) in keys.iter().copied().enumerate() {
+        let key: TestKey = k.into();
+        assert!(w.insert(&key, idx).is_ok());
+    }
+
+    // Phase 2: read everything back
+    for (idx, k) in keys.iter().copied().enumerate() {
+        let key: TestKey = k.into();
+        let found = w.get(&key);
+        assert_eq!(found.as_deref().copied(), Some(idx));
+    }
+}
+
+/// Insert-and-read-back test with small, densely packed integer keys.
+#[test]
+fn dense() {
+    // A handful of consecutive keys plus one further away
+    test_inserts::<u128>(&[0, 1, 2, 3, 256]);
+
+    // Dense keys, inserted in ascending order
+    let mut keys = (0..10000).collect::<Vec<u128>>();
+    test_inserts(&keys);
+
+    // The same key set again, in nine different random orders
+    let mut rng = rand::rng();
+    for _round in 0..9 {
+        keys.shuffle(&mut rng);
+        test_inserts(&keys);
+    }
+}
+
+/// Insert-and-read-back test with 10000 distinct, uniformly random 128-bit keys.
+#[test]
+fn sparse() {
+    let mut seen = HashSet::new();
+    let mut keys: Vec<TestKey> = Vec::new();
+
+    // Keep drawing until we have 10000 keys; a repeated draw is simply discarded
+    while keys.len() < 10000 {
+        let candidate = rand::random::<u128>();
+        // `insert` returns false when the value was already in the set
+        if seen.insert(candidate) {
+            keys.push(candidate.into());
+        }
+    }
+
+    test_inserts(&keys);
+}
+
+/// Map value used by the tests: a `usize` held in an atomic, so that it can be overwritten
+/// through a shared reference.
+struct TestValue(AtomicUsize);
+
+impl TestValue {
+    /// Wrap `val` in a new atomic cell.
+    fn new(val: usize) -> TestValue {
+        Self(AtomicUsize::from(val))
+    }
+
+    /// Read the current value, using relaxed ordering.
+    fn load(&self) -> usize {
+        let TestValue(cell) = self;
+        cell.load(Ordering::Relaxed)
+    }
+}
+
+/// Cloning takes a snapshot of the current value into a new, independent atomic.
+impl Clone for TestValue {
+    fn clone(&self) -> TestValue {
+        let snapshot = self.load();
+        Self::new(snapshot)
+    }
+}
+
+/// Debug-prints as the plain number currently stored.
+impl Debug for TestValue {
+    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        let current = self.load();
+        fmt.write_fmt(format_args!("{:?}", current))
+    }
+}
+
+/// One operation of the randomized tests, applied to a single key: `Some(v)` stores `v` under
+/// the key (inserting or overwriting), `None` removes the key.
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+/// Apply one operation to both the hash map under test (`sut`) and the reference map
+/// (`shadow`), checking that the value the hash map held for the key beforehand is the same
+/// as the one the reference map held.
+///
+/// Panics if the two disagree, or if `update_with_fn` returns an error.
+fn apply_op(
+    op: &TestOp,
+    sut: &HashMapAccess<TestKey, TestValue>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    eprintln!("applying op: {op:?}");
+
+    let TestOp(key, requested) = op;
+
+    // Update the reference map first, remembering what it held before
+    let shadow_existing = match *requested {
+        Some(v) => shadow.insert(*key, v),
+        None => shadow.remove(key),
+    };
+
+    // Then mirror the change in the hash map under test
+    let outcome = sut.update_with_fn(key, |existing| {
+        assert_eq!(existing.map(TestValue::load), shadow_existing);
+
+        match existing {
+            Some(current) => match *requested {
+                // Overwrite in place through the atomic; no action on the entry itself
+                Some(new_val) => {
+                    current.0.store(new_val, Ordering::Relaxed);
+                    UpdateAction::Nothing
+                }
+                None => UpdateAction::Remove,
+            },
+            None => match *requested {
+                Some(new_val) => UpdateAction::Insert(TestValue::new(new_val)),
+                None => UpdateAction::Nothing,
+            },
+        }
+    });
+    outcome.expect("out of memory");
+}
+
+/// Apply 100000 random operations to the hash map, with keys drawn from a Zipf distribution.
+/// Each operation stores a value with probability 0.75 and removes the key otherwise;
+/// `apply_op` checks every step against a `BTreeMap` kept in sync.
+#[test]
+fn random_ops() {
+    const MAX_MEM_SIZE: usize = 10000000;
+    // Name the segment after this test, as `test_grow` does, rather than reusing
+    // "test_inserts".
+    let shmem = ShmemHandle::new("random_ops", 0, MAX_MEM_SIZE).unwrap();
+
+    let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(100000, shmem);
+    let writer = init_struct.attach_writer();
+
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let key: TestKey = (rng.sample(distribution) as u128).into();
+
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &writer, &mut shadow);
+
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+        }
+    }
+}
+
+/// Run random operations against a map created with room for 1000 entries, grow it to 1500,
+/// and run random operations again over the correspondingly larger key range. `apply_op`
+/// checks every step against a `BTreeMap` kept in sync.
+#[test]
+fn test_grow() {
+    const MEM_SIZE: usize = 10000000;
+    let shmem = ShmemHandle::new("test_grow", 0, MEM_SIZE).unwrap();
+
+    let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(1000, shmem);
+    let writer = init_struct.attach_writer();
+
+    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    // One round: 10000 random operations on keys drawn from 0..key_space
+    let mut run_round = |key_space: u32| {
+        for i in 0..10000 {
+            let key = TestKey::from((rng.next_u32() % key_space) as u128);
+            let requested = if rng.random_bool(0.75) { Some(i) } else { None };
+
+            apply_op(&TestOp(key, requested), &writer, &mut shadow);
+
+            if i % 1000 == 0 {
+                eprintln!("{i} ops processed");
+            }
+        }
+    };
+
+    run_round(1000);
+    writer.grow(1500).unwrap();
+    run_round(1500);
+}
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -1,418 +1,4 @@
 //! Shared memory utilities for neon communicator

-use std::num::NonZeroUsize;
-use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
-use std::ptr::NonNull;
-use std::sync::atomic::{AtomicUsize, Ordering};
-
-use nix::errno::Errno;
-use nix::sys::mman::MapFlags;
-use nix::sys::mman::ProtFlags;
-use nix::sys::mman::mmap as nix_mmap;
-use nix::sys::mman::munmap as nix_munmap;
-use nix::unistd::ftruncate as nix_ftruncate;
-
-/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
-/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
-/// specified at creation.
-///
-/// The area is backed by an anonymous file created with memfd_create(). The full address space for
-/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
-/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
-/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
-/// future.
-pub struct ShmemHandle {
-    /// memfd file descriptor
-    fd: OwnedFd,
-
-    max_size: usize,
-
-    // Pointer to the beginning of the shared memory area. The header is stored there.
-    shared_ptr: NonNull<SharedStruct>,
-
-    // Pointer to the beginning of the user data
-    pub data_ptr: NonNull<u8>,
-}
-
-/// This is stored at the beginning in the shared memory area.
-struct SharedStruct {
-    max_size: usize,
-
-    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
-    current_size: AtomicUsize,
-}
-
-const RESIZE_IN_PROGRESS: usize = 1 << 63;
-
-const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
-
-/// Error type returned by the ShmemHandle functions.
-#[derive(thiserror::Error, Debug)]
-#[error("{msg}: {errno}")]
-pub struct Error {
-    pub msg: String,
-    pub errno: Errno,
-}
-
-impl Error {
-    fn new(msg: &str, errno: Errno) -> Error {
-        Error {
-            msg: msg.to_string(),
-            errno,
-        }
-    }
-}
-
-impl ShmemHandle {
-    /// Create a new shared memory area. To communicate between processes, the processes need to be
-    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
-    ///
-    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
-    /// processes can continue using it, however.
-    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
-        // create the backing anonymous file.
-        let fd = create_backing_file(name)?;
-
-        Self::new_with_fd(fd, initial_size, max_size)
-    }
-
-    fn new_with_fd(
-        fd: OwnedFd,
-        initial_size: usize,
-        max_size: usize,
-    ) -> Result<ShmemHandle, Error> {
-        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
-        // is a little larger than this because of the SharedStruct header. Make the upper limit
-        // somewhat smaller than that, because with anything close to that, you'll run out of
-        // memory anyway.
-        if max_size >= 1 << 48 {
-            panic!("max size {} too large", max_size);
-        }
-        if initial_size > max_size {
-            panic!("initial size {initial_size} larger than max size {max_size}");
-        }
-
-        // The actual initial / max size is the one given by the caller, plus the size of
-        // 'SharedStruct'.
-        let initial_size = HEADER_SIZE + initial_size;
-        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
-
-        // Reserve address space for it with mmap
-        //
-        // TODO: Use MAP_HUGETLB if possible
-        let start_ptr = unsafe {
-            nix_mmap(
-                None,
-                max_size,
-                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
-                MapFlags::MAP_SHARED,
-                &fd,
-                0,
-            )
-        }
-        .map_err(|e| Error::new("mmap failed: {e}", e))?;
-
-        // Reserve space for the initial size
-        enlarge_file(fd.as_fd(), initial_size as u64)?;
-
-        // Initialize the header
-        let shared: NonNull<SharedStruct> = start_ptr.cast();
-        unsafe {
-            shared.write(SharedStruct {
-                max_size: max_size.into(),
-                current_size: AtomicUsize::new(initial_size),
-            })
-        };
-
-        // The user data begins after the header
-        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
-
-        Ok(ShmemHandle {
-            fd,
-            max_size: max_size.into(),
-            shared_ptr: shared,
-            data_ptr,
-        })
-    }
-
-    // return reference to the header
-    fn shared(&self) -> &SharedStruct {
-        unsafe { self.shared_ptr.as_ref() }
-    }
-
-    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
-    /// when creating the area.
-    ///
-    /// This may only be called from one process/thread concurrently. We detect that case
-    /// and return an Error.
-    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
-        let new_size = new_size + HEADER_SIZE;
-        let shared = self.shared();
-
-        if new_size > self.max_size {
-            panic!(
-                "new size ({} is greater than max size ({})",
-                new_size, self.max_size
-            );
-        }
-        assert_eq!(self.max_size, shared.max_size);
-
-        // Lock the area by setting the bit in 'current_size'
-        //
-        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
-        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
-        // since this is not performance-critical, better safe than sorry .
-        let mut old_size = shared.current_size.load(Ordering::Acquire);
-        loop {
-            if (old_size & RESIZE_IN_PROGRESS) != 0 {
-                return Err(Error::new(
-                    "concurrent resize detected",
-                    Errno::UnknownErrno,
-                ));
-            }
-            match shared.current_size.compare_exchange(
-                old_size,
-                new_size,
-                Ordering::Acquire,
-                Ordering::Relaxed,
-            ) {
-                Ok(_) => break,
-                Err(x) => old_size = x,
-            }
-        }
-
-        // Ok, we got the lock.
-        //
-        // NB: If anything goes wrong, we *must* clear the bit!
-        let result = {
-            use std::cmp::Ordering::{Equal, Greater, Less};
-            match new_size.cmp(&old_size) {
-                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
-                    Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
-                }),
-                Equal => Ok(()),
-                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
-            }
-        };
-
-        // Unlock
-        shared.current_size.store(
-            if result.is_ok() { new_size } else { old_size },
-            Ordering::Release,
-        );
-
-        result
-    }
-
-    /// Returns the current user-visible size of the shared memory segment.
-    ///
-    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
-    /// responsibility not to access the area beyond the current size.
-    pub fn current_size(&self) -> usize {
-        let total_current_size =
-            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
-        total_current_size - HEADER_SIZE
-    }
-}
-
-impl Drop for ShmemHandle {
-    fn drop(&mut self) {
-        // SAFETY: The pointer was obtained from mmap() with the given size.
-        // We unmap the entire region.
-        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
-        // The fd is dropped automatically by OwnedFd.
-    }
-}
-
-/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
-/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
-/// development and testing, but in production we want the file to stay in memory.
-///
-/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
-#[allow(unused_variables)]
-fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
-            .map_err(|e| Error::new("memfd_create failed: {e}", e))
-    }
-    #[cfg(target_os = "macos")]
-    {
-        let file = tempfile::tempfile().map_err(|e| {
-            Error::new(
-                "could not create temporary file to back shmem area: {e}",
-                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
-            )
-        })?;
-        Ok(OwnedFd::from(file))
-    }
-}
-
-fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
-    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
-    // we don't get a segfault later when trying to actually use it.
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
-            Error::new(
-                "could not grow shmem segment, posix_fallocate failed: {e}",
-                e,
-            )
-        })
-    }
-    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
-    #[cfg(target_os = "macos")]
-    {
-        nix::unistd::ftruncate(fd, size as i64)
-            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use nix::unistd::ForkResult;
-    use std::ops::Range;
-
-    /// check that all bytes in given range have the expected value.
-    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
-        for i in range {
-            let b = unsafe { *(ptr.add(i)) };
-            assert_eq!(expected, b, "unexpected byte at offset {}", i);
-        }
-    }
-
-    /// Write 'b' to all bytes in the given range
-    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
-        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
-    }
-
-    // simple single-process test of growing and shrinking
-    #[test]
-    fn test_shmem_resize() -> Result<(), Error> {
-        let max_size = 1024 * 1024;
-        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
-
-        assert_eq!(init_struct.current_size(), 0);
-
-        // Initial grow
-        let size1 = 10000;
-        init_struct.set_size(size1).unwrap();
-        assert_eq!(init_struct.current_size(), size1);
-
-        // Write some data
-        let data_ptr = init_struct.data_ptr.as_ptr();
-        write_range(data_ptr, 0xAA, 0..size1);
-        assert_range(data_ptr, 0xAA, 0..size1);
-
-        // Shrink
-        let size2 = 5000;
-        init_struct.set_size(size2).unwrap();
-        assert_eq!(init_struct.current_size(), size2);
-
-        // Grow again
-        let size3 = 20000;
-        init_struct.set_size(size3).unwrap();
-        assert_eq!(init_struct.current_size(), size3);
-
-        // Try to read it. The area that was shrunk and grown again should read as all zeros now
-        assert_range(data_ptr, 0xAA, 0..5000);
-        assert_range(data_ptr, 0, 5000..size1);
-
-        // Try to grow beyond max_size
-        //let size4 = max_size + 1;
-        //assert!(init_struct.set_size(size4).is_err());
-
-        // Dropping init_struct should unmap the memory
-        drop(init_struct);
-
-        Ok(())
-    }
-
-    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
-    /// but is stored in the shared memory area and works across processes. It's implemented by
-    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
-    struct SimpleBarrier {
-        num_procs: usize,
-        count: AtomicUsize,
-    }
-
-    impl SimpleBarrier {
-        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
-            unsafe {
-                *ptr = SimpleBarrier {
-                    num_procs,
-                    count: AtomicUsize::new(0),
-                }
-            }
-        }
-
-        pub fn wait(&self) {
-            let old = self.count.fetch_add(1, Ordering::Relaxed);
-
-            let generation = old / self.num_procs;
-
-            let mut current = old + 1;
-            while current < (generation + 1) * self.num_procs {
-                std::thread::sleep(std::time::Duration::from_millis(10));
-                current = self.count.load(Ordering::Relaxed);
-            }
-        }
-    }
-
-    #[test]
-    fn test_multi_process() {
-        // Initialize
-        let max_size = 1_000_000_000_000;
-        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
-        let ptr = init_struct.data_ptr.as_ptr();
-
-        // Store the SimpleBarrier in the first 1k of the area.
-        init_struct.set_size(10000).unwrap();
-        let barrier_ptr: *mut SimpleBarrier = unsafe {
-            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
-                .cast()
-        };
-        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
-        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
-
-        // Fork another test process. The code after this runs in both processes concurrently.
-        let fork_result = unsafe { nix::unistd::fork().unwrap() };
-
-        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, 1000..2000);
-        } else {
-            write_range(ptr, 0xBB, 2000..3000);
-        }
-        barrier.wait();
-        // Verify the contents. (in both processes)
-        assert_range(ptr, 0xAA, 1000..2000);
-        assert_range(ptr, 0xBB, 2000..3000);
-
-        // Grow, from the child this time
-        let size = 10_000_000;
-        if !fork_result.is_parent() {
-            init_struct.set_size(size).unwrap();
-        }
-        barrier.wait();
-
-        // make some writes at the end
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, (size - 10)..size);
-        } else {
-            write_range(ptr, 0xBB, (size - 20)..(size - 10));
-        }
-        barrier.wait();
-
-        // Verify the contents. (This runs in both processes)
-        assert_range(ptr, 0, (size - 1000)..(size - 20));
-        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
-        assert_range(ptr, 0xAA, (size - 10)..size);
-
-        if let ForkResult::Parent { child } = fork_result {
-            nix::sys::wait::waitpid(child, None).unwrap();
-        }
-    }
-}
+pub mod hash;
+pub mod shmem;
--- a/libs/neon-shmem/src/shmem.rs
+++ b/libs/neon-shmem/src/shmem.rs
@@ -0,0 +1,418 @@
+//! Dynamically resizable contiguous chunk of shared memory
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with memfd_create(). The full address space for
+/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
+/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
+/// future.
+///
+/// NOTE(review): on Linux, touching a shared file mapping beyond the end of the file normally
+/// raises SIGBUS rather than growing the file; confirm the claim above and reword if needed.
+pub struct ShmemHandle {
+    /// memfd file descriptor
+    fd: OwnedFd,
+
+    // Total length of the mapping in bytes. This is the 'max_size' given at creation plus the
+    // size of the header.
+    max_size: usize,
+
+    // Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    // Pointer to the beginning of the user data, which starts right after the header
+    pub data_ptr: NonNull<u8>,
+}
+
+/// This is stored at the beginning in the shared memory area.
+struct SharedStruct {
+    // Total length of the mapping in bytes, header included. Written once at creation;
+    // set_size() asserts that it still matches the handle's own copy.
+    max_size: usize,
+
+    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
+    ///
+    /// The size stored here includes the header.
+    current_size: AtomicUsize,
+}
+
+/// High-order bit of `SharedStruct::current_size`, reserved as the "resize in progress" flag.
+/// It is masked out whenever the size is read.
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+/// Number of bytes at the start of the mapping taken up by the `SharedStruct` header.
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the ShmemHandle functions.
+///
+/// It is displayed as `<msg>: <errno>`.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    /// Description of the operation that failed
+    pub msg: String,
+    /// The errno reported for the failure
+    pub errno: Errno,
+}
+
+impl Error {
+    /// Build an error from a description of the failed operation and the errno it produced.
+    fn new(msg: &str, errno: Errno) -> Error {
+        let msg = String::from(msg);
+        Self { msg, errno }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area. To communicate between processes, the processes need to be
+    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
+    ///
+    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
+    /// processes can continue using it, however.
+    ///
+    /// `initial_size` and `max_size` are sizes of the user data area; the header is accounted
+    /// for internally.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `max_size` is 2^48 or larger, or if `initial_size` is larger than `max_size`.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
+        // create the backing anonymous file.
+        let fd = create_backing_file(name)?;
+
+        Self::new_with_fd(fd, initial_size, max_size)
+    }
+
+    /// Map `fd` as the backing file of a new area, size the file for `initial_size` bytes of
+    /// user data and write the header at the start of the mapping.
+    fn new_with_fd(
+        fd: OwnedFd,
+        initial_size: usize,
+        max_size: usize,
+    ) -> Result<ShmemHandle, Error> {
+        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
+        // is a little larger than this because of the SharedStruct header. Make the upper limit
+        // somewhat smaller than that, because with anything close to that, you'll run out of
+        // memory anyway.
+        if max_size >= 1 << 48 {
+            panic!("max size {max_size} too large");
+        }
+        if initial_size > max_size {
+            panic!("initial size {initial_size} larger than max size {max_size}");
+        }
+
+        // The actual initial / max size is the one given by the caller, plus the size of
+        // 'SharedStruct'.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve address space for it with mmap
+        //
+        // TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Reserve space for the initial size
+        if let Err(e) = enlarge_file(fd.as_fd(), initial_size as u64) {
+            // No ShmemHandle exists yet, so its Drop impl will not run. Release the address
+            // range here instead of leaking it.
+            //
+            // SAFETY: 'start_ptr' and 'max_size' are exactly what mmap() was called with
+            // above, and nothing else refers to the mapping yet.
+            let _ = unsafe { nix_munmap(start_ptr, max_size.get()) };
+            return Err(e);
+        }
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            })
+        };
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast::<u8>().add(HEADER_SIZE) };
+
+        Ok(ShmemHandle {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // return reference to the header
+    fn shared(&self) -> &SharedStruct {
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
+    /// when creating the area.
+    ///
+    /// Only one process/thread may resize the area at a time. If another resize is already in
+    /// progress, this returns an Error without changing anything.
+    ///
+    /// # Panics
+    ///
+    /// Panics if 'new_size' is larger than the 'max_size' specified when creating the area.
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        // Saturate rather than wrap around, so that an absurdly large request is caught by the
+        // max size check below in release builds too.
+        let new_size = new_size.saturating_add(HEADER_SIZE);
+        let shared = self.shared();
+
+        if new_size > self.max_size {
+            panic!(
+                "new size ({new_size}) is greater than max size ({})",
+                self.max_size
+            );
+        }
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the bit in 'current_size'
+        //
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry .
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            // Keep the old size visible while the flag is set: current_size() masks the flag
+            // out, so readers keep seeing the old size until the resize has completed.
+            match shared.current_size.compare_exchange(
+                old_size,
+                old_size | RESIZE_IN_PROGRESS,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
+                    Error::new("could not shrink shmem segment, ftruncate failed", e)
+                }),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock. Neither value has the flag bit set, so this store both publishes the
+        // resulting size and releases the lock.
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current user-visible size of the shared memory segment.
+    ///
+    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
+    /// responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let total_current_size =
+            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
+        total_current_size - HEADER_SIZE
+    }
+}
+
+/// Dropping the handle unmaps the area from the current process.
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        let mapping_start = self.shared_ptr.cast();
+        let mapping_len = self.max_size;
+
+        // SAFETY: The pointer was obtained from mmap() with the given size, and the whole
+        // region is unmapped in one go. The result of munmap() is deliberately discarded.
+        let _ = unsafe { nix_munmap(mapping_start, mapping_len) };
+
+        // The fd is closed when the OwnedFd field is dropped.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
+/// anonymous in-memory file. On macos, fall back to a regular file. That's good enough for
+/// development and testing, but in production we want the file to stay in memory.
+///
+/// The errno of the failed call is carried in the returned [`Error`], which appends it to the
+/// message when displayed, so the messages here do not repeat it.
+///
+/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                // An io::Error without an OS error code is reported as errno 0
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+/// Grow the backing file behind `fd` so that it is at least `size` bytes long.
+///
+/// The errno of the failed call is carried in the returned [`Error`], which appends it to the
+/// message when displayed, so the messages here do not repeat it.
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
+            Error::new(
+                "could not grow shmem segment, posix_fallocate failed",
+                e,
+            )
+        })
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate'
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        for i in range {
+            let b = unsafe { *(ptr.add(i)) };
+            assert_eq!(expected, b, "unexpected byte at offset {i}");
+        }
+    }
+
+    /// Fill the bytes at the offsets in `range`, counted from `ptr`, with the value `b`.
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        unsafe {
+            let start = ptr.add(range.start);
+            let len = range.end - range.start;
+            start.write_bytes(b, len);
+        }
+    }
+
+    // simple single-process test of growing and shrinking
+    //
+    // Grows the area, fills it with a pattern, shrinks it, grows it again, and checks that
+    // the part that survived the shrink kept its contents while the rest reads as zeros.
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+
+        assert_eq!(init_struct.current_size(), 0);
+
+        // Initial grow
+        let size1 = 10000;
+        init_struct.set_size(size1).unwrap();
+        assert_eq!(init_struct.current_size(), size1);
+
+        // Write some data
+        let data_ptr = init_struct.data_ptr.as_ptr();
+        write_range(data_ptr, 0xAA, 0..size1);
+        assert_range(data_ptr, 0xAA, 0..size1);
+
+        // Shrink
+        let size2 = 5000;
+        init_struct.set_size(size2).unwrap();
+        assert_eq!(init_struct.current_size(), size2);
+
+        // Grow again
+        let size3 = 20000;
+        init_struct.set_size(size3).unwrap();
+        assert_eq!(init_struct.current_size(), size3);
+
+        // Try to read it. The area that was shrunk and grown again should read as all zeros now.
+        // The boundary is 'size2', the size the area was shrunk to.
+        assert_range(data_ptr, 0xAA, 0..size2);
+        assert_range(data_ptr, 0, size2..size1);
+
+        // Try to grow beyond max_size
+        //let size4 = max_size + 1;
+        //assert!(init_struct.set_size(size4).is_err());
+
+        // Dropping init_struct should unmap the memory
+        drop(init_struct);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    struct SimpleBarrier {
+        /// Number of wait() calls that make up one generation of the barrier
+        num_procs: usize,
+        /// Total number of wait() calls so far, across all generations
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        /// Initialize a barrier for `num_procs` participants in the memory at `ptr`.
+        ///
+        /// # Safety
+        ///
+        /// `ptr` must be valid for writes and suitably aligned for a `SimpleBarrier`.
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            // Use write() rather than an assignment: the destination has not been initialized
+            // yet, and assigning through the pointer would treat it as a live value to drop.
+            unsafe {
+                ptr.write(SimpleBarrier {
+                    num_procs,
+                    count: AtomicUsize::new(0),
+                });
+            }
+        }
+
+        /// Wait until `num_procs` participants have called wait() in the current generation.
+        pub fn wait(&self) {
+            // AcqRel / Acquire rather than Relaxed: the callers use the barrier to make the
+            // memory writes they did before wait() visible to the other participants after
+            // wait(). Relaxed operations would not establish that ordering.
+            let old = self.count.fetch_add(1, Ordering::AcqRel);
+
+            // Every 'num_procs' arrivals form one generation of the barrier
+            let generation = old / self.num_procs;
+
+            // Poll until the last participant of our generation has arrived
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                current = self.count.load(Ordering::Acquire);
+            }
+        }
+    }
+
+    // Test that two processes sharing the area see each other's writes, and that a resize
+    // done in one process is usable from the other. The processes are kept in lockstep with
+    // a SimpleBarrier stored in the shared area itself.
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        // Round the start of the area up to the alignment that SimpleBarrier requires
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        // Two participants: this process and the child forked below
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        // NOTE(review): the child is never told to exit explicitly; it runs to the end of this
+        // function just like the parent. Confirm that a failed assertion in the child is
+        // noticed by the test run.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        // The parent must not touch the new space before the child has finished growing
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        // Only the parent has a child to reap
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}
--- a/libs/neonart/Cargo.toml
+++ b/libs/neonart/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "neonart"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+crossbeam-utils.workspace = true
+spin.workspace = true
+tracing.workspace = true
+
+[dev-dependencies]
+rand = "0.9.1"
+rand_distr = "0.5.1"
--- a/libs/neonart/src/algorithm.rs
+++ b/libs/neonart/src/algorithm.rs
@@ -0,0 +1,599 @@
+mod lock_and_version;
+pub(crate) mod node_ptr;
+mod node_ref;
+
+use std::vec::Vec;
+
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
+use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
+use crate::allocator::OutOfMemoryError;
+
+use crate::TreeWriteGuard;
+use crate::UpdateAction;
+use crate::allocator::ArtAllocator;
+use crate::epoch::EpochPin;
+use crate::{Key, Value};
+
+pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
+
+/// Reasons why a pass over the tree can fail.
+#[derive(Debug)]
+pub enum ArtError {
+    /// A concurrent modification was detected (converted from `ConcurrentUpdateError`)
+    ConcurrentUpdate, // need to retry
+    /// An allocation failed (converted from `OutOfMemoryError`)
+    OutOfMemory,
+}
+
+impl From<ConcurrentUpdateError> for ArtError {
+    /// Lets `?` turn a `ConcurrentUpdateError` into `ArtError::ConcurrentUpdate`.
+    fn from(_err: ConcurrentUpdateError) -> Self {
+        Self::ConcurrentUpdate
+    }
+}
+
+impl From<OutOfMemoryError> for ArtError {
+    /// Lets `?` turn an `OutOfMemoryError` into `ArtError::OutOfMemory`.
+    fn from(_err: OutOfMemoryError) -> Self {
+        Self::OutOfMemory
+    }
+}
+
+/// Create the root of a new tree, using `allocator`.
+///
+/// This simply delegates to `node_ptr::new_root` and passes on its `OutOfMemoryError`.
+pub fn new_root<V: Value>(
+    allocator: &impl ArtAllocator<V>,
+) -> Result<RootPtr<V>, OutOfMemoryError> {
+    node_ptr::new_root(allocator)
+}
+
+/// Look up `key` in the tree rooted at `root`.
+///
+/// The lookup is restarted from the root for as long as `lookup_recurse` reports an error.
+/// The returned reference is tied to the lifetime of `epoch_pin`.
+pub(crate) fn search<'e, K: Key, V: Value>(
+    key: &K,
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<&'e V> {
+    loop {
+        let root_ref = NodeRef::from_root_ptr(root);
+        match lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
+            Ok(found) => return found,
+            // retry
+            Err(_) => continue,
+        }
+    }
+}
+
+/// Run `next_recurse` from the root with `key` as the minimum key, restarting for as long as
+/// it reports a concurrent update.
+///
+/// On success, returns the key bytes that were collected on the way down together with the
+/// value that was found, or `None` if `next_recurse` found nothing.
+pub(crate) fn iter_next<'e, V: Value>(
+    key: &[u8],
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<(Vec<u8>, &'e V)> {
+    loop {
+        // Start every attempt with an empty path
+        let mut path = Vec::new();
+        let root_ref = NodeRef::from_root_ptr(root);
+
+        let found = match next_recurse(key, &mut path, root_ref, epoch_pin) {
+            Ok(found) => found,
+            // retry
+            Err(ConcurrentUpdateError()) => continue,
+        };
+
+        return found.map(|v| {
+            assert_eq!(path.len(), key.len());
+            (path, v)
+        });
+    }
+}
+
+/// Apply `value_fn` to the entry for `key`, retrying for as long as `update_recurse` reports
+/// a concurrent update.
+///
+/// `value_fn` is passed the existing value, if any, and returns the `UpdateAction` to perform.
+///
+/// Returns `Err(OutOfMemoryError)` if `update_recurse` fails with `ArtError::OutOfMemory`.
+pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &K,
+    value_fn: F,
+    root: RootPtr<V>,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), OutOfMemoryError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    // 'value_fn' is FnOnce, but update_recurse() needs to be handed a closure on every retry.
+    // Park the function in a Cell, so that each iteration's closure can take it out at the
+    // time it is invoked. The unwrap() panics if the function is invoked a second time.
+    let value_fn_cell = std::cell::Cell::new(Some(value_fn));
+    loop {
+        let root_ref = NodeRef::from_root_ptr(root);
+        let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
+        let key_bytes = key.as_bytes();
+
+        match update_recurse(
+            key_bytes,
+            this_value_fn,
+            root_ref,
+            None, // the root has no parent
+            None, // ... and no grandparent
+            guard,
+            0,
+            key_bytes,
+        ) {
+            Ok(()) => break Ok(()),
+            Err(ArtError::ConcurrentUpdate) => {
+                continue; // retry
+            }
+            Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
+        }
+    }
+}
+
+// Error means you must retry.
+//
+// This corresponds to the 'lookupOpt' function in the paper
+//
+// 'key' is the part of the search key that has not been consumed by the levels above 'node'.
+// 'parent' is the read-locked node that we came from, or None when 'node' is the root.
+//
+// Returns Ok(None) if the key is not found.
+#[allow(clippy::only_used_in_recursion)]
+fn lookup_recurse<'e, V: Value>(
+    key: &[u8],
+    node: NodeRef<'e, V>,
+    parent: Option<ReadLockedNodeRef<V>>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    // Read-lock this node first, and only then validate and release the parent
+    let rnode = node.read_lock_or_restart()?;
+    if let Some(parent) = parent {
+        parent.read_unlock_or_restart()?;
+    }
+
+    // Check if the prefix matches. If it does, 'prefix_len' is the number of bytes at the
+    // beginning of 'key' that it covers.
+    let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
+        prefix_len
+    } else {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    };
+
+    if rnode.is_leaf() {
+        // At a leaf, the prefix must cover all of the remaining key
+        assert_eq!(key.len(), prefix_len);
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    // Skip over the part of the key that was covered by the prefix
+    let key = &key[prefix_len..];
+
+    // find child (or leaf value)
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    match next_node {
+        None => Ok(None), // key not found
+        // key[0] selected the child; continue with the rest of the key
+        Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
+    }
+}
+
+// Search the subtree under 'node' for a value whose key is not less than 'min_key'.
+//
+// 'path' holds the key bytes of the levels above 'node' on entry. The prefixes and key bytes
+// of the nodes visited below are appended to it, so that when a value is returned, 'path'
+// holds the value's full key.
+//
+// Returns Ok(None) if nothing was found in this subtree. Error means you must retry.
+#[allow(clippy::only_used_in_recursion)]
+fn next_recurse<'e, V: Value>(
+    min_key: &[u8],
+    path: &mut Vec<u8>,
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    let rnode = node.read_lock_or_restart()?;
+    let prefix = rnode.get_prefix();
+    if !prefix.is_empty() {
+        path.extend_from_slice(prefix);
+    }
+
+    // Compare the path so far with the equally long beginning of 'min_key'. If the path is
+    // smaller, every key in this subtree is smaller than 'min_key'.
+    use std::cmp::Ordering;
+    let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
+    if comparison == Ordering::Less {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    }
+
+    if rnode.is_leaf() {
+        assert_eq!(path.len(), min_key.len());
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    // Smallest key byte to consider among this node's children. If the path so far is equal
+    // to the beginning of 'min_key', the next byte of 'min_key' is the lower bound. If the path
+    // is already greater, any child will do.
+    let mut min_key_byte = match comparison {
+        Ordering::Less => unreachable!(), // checked this above already
+        Ordering::Equal => min_key[path.len()],
+        Ordering::Greater => 0,
+    };
+
+    // Try the children one by one, starting from 'min_key_byte', until one of the subtrees
+    // yields a value.
+    loop {
+        match rnode.find_next_child_or_restart(min_key_byte)? {
+            None => {
+                return Ok(None);
+            }
+            Some((key_byte, child_ref)) => {
+                let path_len = path.len();
+                path.push(key_byte);
+                let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
+                if result.is_some() {
+                    return Ok(result);
+                }
+                // Nothing in that subtree. There cannot be any children after key byte 255.
+                // (This check also keeps 'key_byte + 1' below from overflowing.)
+                if key_byte == u8::MAX {
+                    return Ok(None);
+                }
+                // Undo what the recursion appended to the path, and move on to the next child
+                path.truncate(path_len);
+                min_key_byte = key_byte + 1;
+            }
+        }
+    }
+}
+
+// This corresponds to the 'insertOpt' function in the paper
+//
+// Descend from 'node' towards the place where 'key' belongs, and call 'value_fn' there to
+// decide what to do.
+//
+// 'key' is the part of the key not yet consumed by the levels above 'node'. 'rparent' and
+// 'rgrandparent' are the read-locked parent and grandparent of 'node', each together with
+// the key byte that leads from it to the next node on the path. 'level' and 'orig_key' are
+// only passed down the recursion.
+//
+// Err(ArtError::ConcurrentUpdate) means you must retry.
+#[allow(clippy::only_used_in_recursion)]
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn update_recurse<'e, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &[u8],
+    value_fn: F,
+    node: NodeRef<'e, V>,
+    rparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    guard: &'_ mut TreeWriteGuard<'e, K, V, A>,
+    level: usize,
+    orig_key: &[u8],
+) -> Result<(), ArtError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    let rnode = node.read_lock_or_restart()?;
+
+    // Case 1: The key does not match this node's prefix, so the key does not exist. To insert
+    // it, the prefix needs to be split.
+    let prefix_match_len = rnode.prefix_matches(key);
+    if prefix_match_len.is_none() {
+        let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
+        // The split replaces the pointer in the parent, so both the parent and the node
+        // need to be write-locked.
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        match value_fn(None) {
+            UpdateAction::Nothing => {}
+            UpdateAction::Insert(new_value) => {
+                insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
+            }
+            UpdateAction::Remove => {
+                panic!("unexpected Remove action on insertion");
+            }
+        }
+        wnode.write_unlock();
+        wparent.write_unlock();
+        return Ok(());
+    }
+    // The prefix matched. Skip over the part of the key that it covered.
+    let prefix_match_len = prefix_match_len.unwrap();
+    let key = &key[prefix_match_len..];
+    let level = level + prefix_match_len;
+
+    // Case 2: We reached a leaf, so the key exists.
+    if rnode.is_leaf() {
+        assert_eq!(key.len(), 0);
+        let (rparent, parent_key) = rparent.expect("root cannot be leaf");
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        // safety: Now that we have acquired the write lock, we have exclusive access to the
+        // value. XXX: There might be concurrent reads though?
+        let value_mut = wnode.get_leaf_value_mut();
+
+        match value_fn(Some(value_mut)) {
+            UpdateAction::Nothing => {
+                wparent.write_unlock();
+                wnode.write_unlock();
+            }
+            UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
+            UpdateAction::Remove => {
+                // Unlink the leaf from the parent, and hand the leaf over to the guard as
+                // an obsolete node.
+                guard.remember_obsolete_node(wnode.as_ptr());
+                wparent.delete_child(parent_key);
+                wnode.write_unlock_obsolete();
+
+                if let Some(rgrandparent) = rgrandparent {
+                    // FIXME: Ignore concurrency error. It doesn't lead to
+                    // corruption, but it means we might leak something. Until
+                    // another update cleans it up.
+                    let _ = cleanup_parent(wparent, rgrandparent, guard);
+                }
+            }
+        }
+
+        return Ok(());
+    }
+
+    // Case 3: An internal node. Look up the child for the next key byte.
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    if next_node.is_none() {
+        // There is no child for the key byte, so the key does not exist. A new value is
+        // inserted as a new child of this node.
+        if rnode.is_full() {
+            // No room for another child in this node. It needs to be replaced with a bigger
+            // one, which means updating the pointer in the parent too.
+            let (rparent, parent_key) = rparent.expect("root node cannot become full");
+            let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+            let wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+            match value_fn(None) {
+                UpdateAction::Nothing => {
+                    wnode.write_unlock();
+                    wparent.write_unlock();
+                }
+                UpdateAction::Insert(new_value) => {
+                    // insert_and_grow() takes over 'wnode'
+                    insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
+                    wparent.write_unlock();
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+        } else {
+            // There is room in this node. The parent is not modified, so it is enough to
+            // validate it and let go of it.
+            let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+            if let Some((rparent, _)) = rparent {
+                rparent.read_unlock_or_restart()?;
+            }
+            match value_fn(None) {
+                UpdateAction::Nothing => {}
+                UpdateAction::Insert(new_value) => {
+                    insert_to_node(&mut wnode, key, new_value, guard)?;
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+            wnode.write_unlock();
+        }
+        Ok(())
+    } else {
+        let next_child = next_node.unwrap(); // checked above it's not None
+        // Validate the parent, but hold on to it: it becomes the grandparent of the next level
+        if let Some((ref rparent, _)) = rparent {
+            rparent.check_or_restart()?;
+        }
+
+        // recurse to next level
+        update_recurse(
+            &key[1..],
+            value_fn,
+            next_child,
+            Some((rnode, key[0])),
+            rparent,
+            guard,
+            level + 1,
+            orig_key,
+        )
+    }
+}
+
+/// One step on the path from the root to a node, as printed by the tree dump: either the
+/// prefix stored in a node, or the key byte that selects a child.
+#[derive(Clone)]
+enum PathElement {
+    Prefix(Vec<u8>),
+    KeyByte(u8),
+}
+
+impl std::fmt::Debug for PathElement {
+    /// A prefix is printed as a byte list, a key byte as a plain number.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::KeyByte(key_byte) => write!(f, "{key_byte}"),
+            Self::Prefix(prefix) => write!(f, "{prefix:?}"),
+        }
+    }
+}
+
+/// Write a textual dump of the tree rooted at `root` to `dst`.
+///
+/// The dump is not retried: if `dump_recurse` runs into a concurrent update, the error is
+/// ignored and whatever was written up to that point is all you get.
+pub(crate) fn dump_tree<V: Value + std::fmt::Debug>(
+    root: RootPtr<V>,
+    epoch_pin: &'_ EpochPin,
+    dst: &mut dyn std::io::Write,
+) {
+    let root_ref = NodeRef::from_root_ptr(root);
+
+    let _ = dump_recurse(&[], root_ref, epoch_pin, 0, dst);
+}
+
+// Write a dump of 'node' and everything below it to 'dst'.
+//
+// 'path' is the path from the root to 'node'. 'level' is the depth of 'node', used for
+// indenting the output.
+//
+// Returns an error if a concurrent update was detected. Panics if writing to 'dst' fails.
+//
+// TODO: return an Err if writeln!() returns error, instead of unwrapping
+#[allow(clippy::only_used_in_recursion)]
+fn dump_recurse<'e, V: Value + std::fmt::Debug>(
+    path: &[PathElement],
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+    level: usize,
+    dst: &mut dyn std::io::Write,
+) -> Result<(), ConcurrentUpdateError> {
+    // One space of indentation per level
+    let indent = str::repeat(" ", level);
+
+    let rnode = node.read_lock_or_restart()?;
+    // Make our own copy of the path, extended with this node's prefix
+    let mut path = Vec::from(path);
+    let prefix = rnode.get_prefix();
+    if !prefix.is_empty() {
+        path.push(PathElement::Prefix(Vec::from(prefix)));
+    }
+
+    if rnode.is_leaf() {
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let val = unsafe { vptr.as_ref().unwrap() };
+        writeln!(dst, "{indent} {path:?}: {val:?}").unwrap();
+        return Ok(());
+    }
+
+    // Internal node: probe every possible key byte, and dump the children that exist
+    for key_byte in 0..=u8::MAX {
+        match rnode.find_child_or_restart(key_byte)? {
+            None => continue,
+            Some(child_ref) => {
+                // The child is read-locked here only to print its prefix on the line for
+                // this key byte. The recursive call below locks it again.
+                let rchild = child_ref.read_lock_or_restart()?;
+                writeln!(
+                    dst,
+                    "{} {:?}, {}: prefix {:?}",
+                    indent,
+                    &path,
+                    key_byte,
+                    rchild.get_prefix()
+                )
+                .unwrap();
+
+                let mut child_path = path.clone();
+                child_path.push(PathElement::KeyByte(key_byte));
+
+                dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Insert `value` under a node whose prefix only partially matches `key`, by splitting the
+/// prefix.
+///
+/// A new internal node is created for the part of the prefix that is common to the old node
+/// and the key. The old node, with its prefix shortened, and a new node for `value` become
+/// its children, and the new internal node takes the old node's place in `parent`.
+///
+/// For example, inserting a key that continues with "fooelse" where the old node has the
+/// prefix "fooba" turns the first picture into the second:
+///
+///```text
+///        [fooba]r -> value
+///
+/// [foo]b -> [a]r  -> value
+///      e -> [ls]e -> value
+///```
+///
+/// `node` and `parent` are write-locked by the caller. `parent_key` is the key byte that
+/// leads from `parent` to `node`.
+fn insert_split_prefix<K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    node: &mut WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key: u8,
+    guard: &'_ TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    let old_node = node;
+    let old_prefix = old_node.get_prefix();
+    let common_prefix_len = common_prefix(key, old_prefix);
+
+    // Allocate a node for the new value. The byte at 'common_prefix_len' becomes the key
+    // byte in the new internal node, so the new node gets the key bytes after it.
+    let new_value_node = allocate_node_for_value(
+        &key[common_prefix_len + 1..],
+        value,
+        guard.tree_writer.allocator,
+    )?;
+
+    // Allocate a new internal node with the common prefix
+    // FIXME: deallocate 'new_value_node' on OOM
+    let mut prefix_node =
+        node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
+
+    // Add the old node and the new nodes to the new internal node
+    prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
+    prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
+
+    // Modify the prefix of the old child in place. What remains of it is the part after the
+    // common prefix and the key byte.
+    old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
+
+    // replace the pointer in the parent
+    parent.replace_child(parent_key, prefix_node.into_ptr());
+
+    Ok(())
+}
+
+/// Add `value` as a new child of the write-locked node `wnode`.
+///
+/// The first byte of `key` becomes the key byte of the new child in `wnode`. The rest of the
+/// key is passed to `allocate_node_for_value`, which builds the new child.
+fn insert_to_node<K: Key, V: Value, A: ArtAllocator<V>>(
+    wnode: &mut WriteLockedNodeRef<V>,
+    key: &[u8],
+    value: V,
+    guard: &'_ TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    let remaining_key = &key[1..];
+    let allocator = guard.tree_writer.allocator;
+    let new_child = allocate_node_for_value(remaining_key, value, allocator)?;
+
+    let key_byte = key[0];
+    wnode.insert_child(key_byte, new_child.into_ptr());
+    Ok(())
+}
+
+// Insert 'value' as a new child of a node that is already full, by replacing the node with
+// a bigger one.
+//
+// 'key[0]' becomes the key byte of the new child, and the rest of 'key' goes into the new
+// child. 'parent_key_byte' is the key byte that leads from 'parent' to the node.
+//
+// On entry: 'parent' and 'node' are locked
+// On successful return: the old node has been unlocked and marked as obsolete. 'parent' is
+// still locked.
+fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    wnode: WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key_byte: u8,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
+
+    // FIXME: deallocate 'bigger_node' on OOM
+    let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
+    bigger_node.insert_new_child(key[0], value_child);
+
+    // Replace the pointer in the parent
+    parent.replace_child(parent_key_byte, bigger_node.into_ptr());
+
+    // The old node is no longer linked from the parent. Hand it over to the guard as an
+    // obsolete node.
+    guard.remember_obsolete_node(wnode.as_ptr());
+    wnode.write_unlock_obsolete();
+
+    Ok(())
+}
+
+// After a child has been deleted from 'wparent', try to compact the tree: either replace the
+// parent with its only remaining child, or replace the parent with a smaller node.
+//
+// 'rgrandparent' is the read-locked grandparent, with the key byte that leads from it to
+// the parent. This function takes over the write-locked parent.
+//
+// Returns an error if a lock could not be acquired because of a concurrent update, or if an
+// allocation failed.
+fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    wparent: WriteLockedNodeRef<V>,
+    rgrandparent: (ReadLockedNodeRef<V>, u8),
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let (rgrandparent, grandparent_key_byte) = rgrandparent;
+
+    // If the parent becomes completely empty after the deletion, remove the parent from the
+    // grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
+    // TODO: not implemented.
+
+    // If the parent has only one child, replace the parent with the remaining child. (This is not
+    // possible if the child's prefix field cannot absorb the parent's)
+    if wparent.num_children() == 1 {
+        // Try to lock the remaining child. This can fail if the child is updated
+        // concurrently.
+        let (key_byte, remaining_child) = wparent.find_remaining_child();
+
+        let mut wremaining_child = remaining_child.write_lock_or_restart()?;
+
+        // The child's new prefix consists of the parent's prefix, the key byte that led from
+        // the parent to the child, and the child's old prefix. Check that it all fits.
+        if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
+            let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+
+            // Ok, we have locked the parent, the grandparent, and the parent's only
+            // remaining child. Proceed with the updates.
+
+            // Update the prefix on the remaining child
+            wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
+
+            // Replace the pointer in the grandparent to point directly to the remaining child
+            wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
+
+            // Mark the parent as deleted.
+            guard.remember_obsolete_node(wparent.as_ptr());
+            wparent.write_unlock_obsolete();
+            return Ok(());
+        }
+    }
+
+    // If the parent's children would fit on a smaller node type after the deletion, replace it with
+    // a smaller node.
+    if wparent.can_shrink() {
+        let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+        let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
+
+        // Replace the pointer in the grandparent
+        wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
+
+        // The old, bigger node is no longer linked from the grandparent
+        guard.remember_obsolete_node(wparent.as_ptr());
+        wparent.write_unlock_obsolete();
+        return Ok(());
+    }
+
+    // nothing to do
+    wparent.write_unlock();
+    Ok(())
+}
+
+// Allocate a new leaf node to hold 'value'. If the key is long, we
+// may need to allocate new internal nodes to hold it too
+//
+// The chain is built from the bottom up: the leaf takes up to MAX_PREFIX_LEN bytes from the
+// end of 'key' as its prefix. Each internal node on top of that takes one key byte, which
+// leads to the node below it, and up to MAX_PREFIX_LEN bytes before that as its prefix.
+//
+// Returns the topmost node of the chain.
+fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
+    // 'prefix_off' is the number of bytes at the beginning of 'key' that have not been
+    // stored in any node yet
+    let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN);
+
+    let leaf_node = node_ref::new_leaf(&key[prefix_off..key.len()], value, allocator)?;
+
+    let mut node = leaf_node;
+    while prefix_off > 0 {
+        // Need another internal node
+        let remain_prefix = &key[0..prefix_off];
+
+        // The last remaining byte becomes the key byte of the node below. The bytes before
+        // that, up to MAX_PREFIX_LEN of them, become the prefix of the internal node.
+        prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
+        let mut internal_node = node_ref::new_internal(
+            &remain_prefix[prefix_off..remain_prefix.len() - 1],
+            allocator,
+        )?;
+        internal_node.insert_new_child(*remain_prefix.last().unwrap(), node);
+        node = internal_node;
+    }
+
+    Ok(node)
+}
+
+/// Return the length of the common prefix of `a` and `b`, i.e. the index of the first byte
+/// at which they differ. Only the first `MAX_PREFIX_LEN` bytes are compared.
+///
+/// # Panics
+///
+/// Panics with "prefixes are equal" if no differing byte is found. That includes the case
+/// that one of the slices is shorter than `MAX_PREFIX_LEN` and the other one begins with it,
+/// which avoids reading past the end of the shorter slice.
+fn common_prefix(a: &[u8], b: &[u8]) -> usize {
+    // zip() stops at the end of the shorter slice
+    a.iter()
+        .zip(b)
+        .take(MAX_PREFIX_LEN)
+        .position(|(x, y)| x != y)
+        .unwrap_or_else(|| panic!("prefixes are equal"))
+}
--- a/libs/neonart/src/algorithm/lock_and_version.rs
+++ b/libs/neonart/src/algorithm/lock_and_version.rs
@@ -0,0 +1,117 @@
+//! Each node in the tree contains one atomic word that stores three things:
+//!
+//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
+//!        but might still be accessed by concurrent readers until the epoch expires.
+//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
+//! Bits 2-63: Version number, incremented every time the node is modified.
+//!
+//! AtomicLockAndVersion represents that.
+
+use std::sync::atomic::{AtomicU64, Ordering};
+
+/// Returned by the locking functions when the lock word was changed, or the node was locked
+/// or marked obsolete, by someone else. The operation needs to be restarted.
+pub(crate) struct ConcurrentUpdateError();
+
+/// The atomic word of a node, holding the obsolete bit, the lock bit and the version number.
+pub(crate) struct AtomicLockAndVersion {
+    // bit 0: obsolete, bit 1: write-locked, the rest: version
+    inner: AtomicU64,
+}
+
+impl AtomicLockAndVersion {
+    /// Create a lock word with all bits zero: not obsolete, not locked, version 0.
+    pub(crate) fn new() -> AtomicLockAndVersion {
+        let inner = AtomicU64::new(0);
+        Self { inner }
+    }
+}
+
+impl AtomicLockAndVersion {
+    /// Begin an optimistic read: wait until the node is not write-locked, and return the
+    /// lock word as it was seen then. It is passed to the other functions to detect changes.
+    ///
+    /// Fails if the node is obsolete.
+    pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
+        let version = self.await_node_unlocked();
+        if is_obsolete(version) {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(version)
+    }
+
+    /// Check that the lock word is still equal to `version`. Same as
+    /// `read_unlock_or_restart`; an optimistic read lock holds nothing that would need
+    /// releasing.
+    pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        self.read_unlock_or_restart(version)
+    }
+
+    /// End an optimistic read: fails if the lock word is no longer equal to `version`, in
+    /// which case whatever was read from the node must not be trusted.
+    pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        // The reads of the node's contents that were made under the optimistic lock must be
+        // ordered before the re-check of the lock word. An acquire load by itself only
+        // keeps the accesses that come after it from moving before it; it takes an acquire
+        // fence to keep the earlier reads from moving after the re-check.
+        std::sync::atomic::fence(Ordering::Acquire);
+        if self.inner.load(Ordering::Acquire) != version {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Turn an optimistic read lock into a write lock. Fails if the lock word is no longer
+    /// equal to `version`.
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        &self,
+        version: u64,
+    ) -> Result<(), ConcurrentUpdateError> {
+        if self
+            .inner
+            .compare_exchange(
+                version,
+                set_locked_bit(version),
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            )
+            .is_err()
+        {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Try to acquire the write lock, without waiting. Fails if the node is obsolete or
+    /// locked, or if the lock word changes while we are trying.
+    pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        let old = self.inner.load(Ordering::Relaxed);
+        if is_obsolete(old) || is_locked(old) {
+            return Err(ConcurrentUpdateError());
+        }
+        if self
+            .inner
+            .compare_exchange(
+                old,
+                set_locked_bit(old),
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            )
+            .is_err()
+        {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Release the write lock. The version number is incremented as a side effect.
+    pub(crate) fn write_unlock(&self) {
+        // reset locked bit and overflow into version
+        self.inner.fetch_add(2, Ordering::Release);
+    }
+
+    /// Release the write lock and mark the node as obsolete. The version number is
+    /// incremented as a side effect.
+    pub(crate) fn write_unlock_obsolete(&self) {
+        // set obsolete, reset locked, overflow into version
+        self.inner.fetch_add(3, Ordering::Release);
+    }
+
+    // Helper functions
+
+    /// Wait until the lock bit is not set, and return the lock word as it was seen then.
+    fn await_node_unlocked(&self) -> u64 {
+        let mut version = self.inner.load(Ordering::Acquire);
+        while is_locked(version) {
+            // spinlock
+            std::thread::yield_now();
+            version = self.inner.load(Ordering::Acquire)
+        }
+        version
+    }
+}
+
+/// Return `version` with the lock bit (bit 1) set. `version` must not have the bit set yet:
+/// the bit is set by adding to the word.
+fn set_locked_bit(version: u64) -> u64 {
+    const LOCKED: u64 = 0b10;
+    version + LOCKED
+}
+
+/// Is the obsolete bit (bit 0) set in `version`?
+fn is_obsolete(version: u64) -> bool {
+    const OBSOLETE: u64 = 0b01;
+    version & OBSOLETE != 0
+}
+
+/// Is the lock bit (bit 1) set in `version`?
+fn is_locked(version: u64) -> bool {
+    const LOCKED: u64 = 0b10;
+    version & LOCKED != 0
+}
--- a/libs/neonart/src/algorithm/node_ptr.rs
+++ b/libs/neonart/src/algorithm/node_ptr.rs
--- a/libs/neonart/src/algorithm/node_ref.rs
+++ b/libs/neonart/src/algorithm/node_ref.rs
@@ -0,0 +1,349 @@
+use std::fmt::Debug;
+use std::marker::PhantomData;
+
+use super::node_ptr;
+use super::node_ptr::NodePtr;
+use crate::EpochPin;
+use crate::Value;
+use crate::algorithm::lock_and_version::AtomicLockAndVersion;
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::allocator::ArtAllocator;
+use crate::allocator::OutOfMemoryError;
+
+/// An unlocked handle to a tree node. Lock it with `read_lock_or_restart` or
+/// `write_lock_or_restart` to obtain a handle that exposes the node's contents.
+pub struct NodeRef<'e, V> {
+    ptr: NodePtr<V>,
+
+    // Ties this handle to the lifetime `'e` of an `EpochPin` without storing one.
+    // NOTE(review): presumably the pin keeps the node from being reclaimed while
+    // the handle is alive -- confirm against the epoch implementation.
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V> Debug for NodeRef<'e, V> {
+    /// Prints the `Debug` representation of the wrapped node pointer.
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        fmt.write_fmt(format_args!("{:?}", self.ptr))
+    }
+}
+
+impl<'e, V: Value> NodeRef<'e, V> {
+    /// Wrap the given root pointer in an unlocked node reference.
+    pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
+        let ptr = root_ptr;
+        NodeRef {
+            ptr,
+            phantom: PhantomData,
+        }
+    }
+
+    /// Take an optimistic read lock on the node, remembering the version that
+    /// the lock word reported. Propagates the lock word's error otherwise.
+    pub(crate) fn read_lock_or_restart(
+        &self,
+    ) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        let version = self.lockword().read_lock_or_restart()?;
+        let locked = ReadLockedNodeRef {
+            ptr: self.ptr,
+            version,
+            phantom: self.phantom,
+        };
+        Ok(locked)
+    }
+
+    /// Take the write lock on the node. The returned guard is only constructed
+    /// once the lock word has actually been locked.
+    pub(crate) fn write_lock_or_restart(
+        &self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        self.lockword().write_lock_or_restart()?;
+        let locked = WriteLockedNodeRef {
+            ptr: self.ptr,
+            phantom: self.phantom,
+        };
+        Ok(locked)
+    }
+
+    /// The lock word of the node this reference points to.
+    fn lockword(&self) -> &AtomicLockAndVersion {
+        self.ptr.lockword()
+    }
+}
+
+/// A reference to a node that has been optimistically read-locked.
+///
+/// The `*_or_restart` accessors and `get_leaf_value_ptr` re-check the lock word
+/// against `version` after reading. The plain accessors (`is_leaf`, `is_full`,
+/// `get_prefix`, `prefix_matches`) do not, so callers must validate afterwards
+/// with `check_or_restart` or `read_unlock_or_restart`.
+pub struct ReadLockedNodeRef<'e, V> {
+    ptr: NodePtr<V>,
+    // Lock word value observed when the read lock was taken.
+    version: u64,
+
+    // Ties this handle to the lifetime `'e` of an `EpochPin` without storing one.
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
+    /// Unvalidated read of whether the node is a leaf.
+    pub(crate) fn is_leaf(&self) -> bool {
+        self.ptr.is_leaf()
+    }
+
+    /// Unvalidated read of whether the node is full.
+    pub(crate) fn is_full(&self) -> bool {
+        self.ptr.is_full()
+    }
+
+    /// Unvalidated read of the node's prefix.
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    /// Compare the node's prefix against `key`.
+    ///
+    /// Only a read lock is held, so the prefix may be modified concurrently; the
+    /// caller must be ready to restart if read_unlock() later reports an error.
+    ///
+    /// Returns the length of the prefix, or None if it's not a match
+    pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
+        self.ptr.prefix_matches(key)
+    }
+
+    /// Look up the child for `key_byte`, then validate the version. The lookup
+    /// result is only returned if the node was unchanged.
+    pub(crate) fn find_child_or_restart(
+        &self,
+        key_byte: u8,
+    ) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
+        let found = self.ptr.find_child(key_byte);
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        Ok(found.map(|child_ptr| NodeRef {
+            ptr: child_ptr,
+            phantom: self.phantom,
+        }))
+    }
+
+    /// Look up the next child starting from `min_key_byte`, then validate the
+    /// version. Returns the child's key byte together with its reference.
+    pub(crate) fn find_next_child_or_restart(
+        &self,
+        min_key_byte: u8,
+    ) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
+        let found = self.ptr.find_next_child(min_key_byte);
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        Ok(found.map(|(k, child_ptr)| {
+            let child = NodeRef {
+                ptr: child_ptr,
+                phantom: self.phantom,
+            };
+            (k, child)
+        }))
+    }
+
+    /// Fetch the leaf value, validate the version, and return the value as a
+    /// raw pointer.
+    pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
+        let value_ref = self.ptr.get_leaf_value();
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        // Extend the lifetime.
+        Ok(std::ptr::from_ref(value_ref))
+    }
+
+    /// Trade the read lock for a write lock, provided the lock word still
+    /// carries the version recorded by this read lock.
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        let lockword = self.ptr.lockword();
+        lockword.upgrade_to_write_lock_or_restart(self.version)?;
+
+        let locked = WriteLockedNodeRef {
+            ptr: self.ptr,
+            phantom: self.phantom,
+        };
+        Ok(locked)
+    }
+
+    /// Finish the optimistic read: consume the handle and report whether the
+    /// version is still the one recorded when the lock was taken.
+    pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
+        self.ptr.lockword().check_or_restart(self.version)?;
+        Ok(())
+    }
+
+    /// Validate the recorded version without giving up the handle.
+    pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        self.ptr.lockword().check_or_restart(self.version)?;
+        Ok(())
+    }
+}
+
+/// A reference to a node that is write-locked.
+///
+/// The lock is released when the reference is dropped, unless it was already
+/// released explicitly with `write_unlock()` or `write_unlock_obsolete()`, which
+/// reset `ptr` to null so that the `Drop` impl does nothing.
+pub struct WriteLockedNodeRef<'e, V> {
+    ptr: NodePtr<V>,
+    // Ties this handle to the lifetime `'e` of an `EpochPin` without storing one.
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
+    /// Forwards to `NodePtr::can_shrink` for the locked node.
+    pub(crate) fn can_shrink(&self) -> bool {
+        self.ptr.can_shrink()
+    }
+
+    /// Forwards to `NodePtr::num_children` for the locked node.
+    pub(crate) fn num_children(&self) -> usize {
+        self.ptr.num_children()
+    }
+
+    /// Release the write lock, consuming the reference.
+    pub(crate) fn write_unlock(mut self) {
+        self.ptr.lockword().write_unlock();
+        // Null the pointer so that Drop does not unlock a second time.
+        self.ptr = NodePtr::null();
+    }
+
+    /// Release the write lock and mark the node obsolete, consuming the reference.
+    pub(crate) fn write_unlock_obsolete(mut self) {
+        self.ptr.lockword().write_unlock_obsolete();
+        // Null the pointer so that Drop does not unlock a second time.
+        self.ptr = NodePtr::null();
+    }
+
+    /// Returns the node's prefix.
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    /// Forwards to `NodePtr::truncate_prefix` with the new prefix length.
+    pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
+        self.ptr.truncate_prefix(new_prefix_len)
+    }
+
+    /// Forwards to `NodePtr::prepend_prefix`.
+    // NOTE(review): presumably the new prefix becomes `prefix`, `prefix_byte`, then
+    // the old prefix -- confirm against NodePtr::prepend_prefix.
+    pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
+        self.ptr.prepend_prefix(prefix, prefix_byte)
+    }
+
+    /// Add `child` under `key_byte` in the locked node.
+    pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
+        self.ptr.insert_child(key_byte, child)
+    }
+
+    /// Mutable access to the value stored in the locked leaf node.
+    pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
+        self.ptr.get_leaf_value_mut()
+    }
+
+    /// Allocate a replacement node via `NodePtr::grow` and wrap it in a
+    /// `NewNodeRef`, which deallocates it again unless `into_ptr()` is called.
+    /// The locked node itself is not modified here.
+    // NOTE(review): presumably the new node is the next larger node kind holding
+    // the same children -- confirm against NodePtr::grow.
+    pub(crate) fn grow<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        let new_node = self.ptr.grow(allocator)?;
+        Ok(NewNodeRef {
+            ptr: new_node,
+            allocator,
+            extra_nodes: Vec::new(),
+        })
+    }
+
+    /// Allocate a replacement node via `NodePtr::shrink` and wrap it in a
+    /// `NewNodeRef`, which deallocates it again unless `into_ptr()` is called.
+    /// The locked node itself is not modified here.
+    // NOTE(review): presumably the new node is the next smaller node kind holding
+    // the same children -- confirm against NodePtr::shrink.
+    pub(crate) fn shrink<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        let new_node = self.ptr.shrink(allocator)?;
+        Ok(NewNodeRef {
+            ptr: new_node,
+            allocator,
+            extra_nodes: Vec::new(),
+        })
+    }
+
+    /// Returns a copy of the raw node pointer. The lock stays with `self`.
+    pub(crate) fn as_ptr(&self) -> NodePtr<V> {
+        self.ptr
+    }
+
+    /// Replace the child stored under `key_byte` with `replacement`.
+    pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
+        self.ptr.replace_child(key_byte, replacement);
+    }
+
+    /// Remove the child stored under `key_byte`.
+    pub(crate) fn delete_child(&mut self, key_byte: u8) {
+        self.ptr.delete_child(key_byte);
+    }
+
+    /// Returns the key byte and an unlocked reference for the node's only child.
+    ///
+    /// Panics if the node does not have exactly one child.
+    pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
+        assert_eq!(self.num_children(), 1);
+        // Searching from key byte 0 finds the single child whatever its key is.
+        let child_or_value = self.ptr.find_next_child(0);
+
+        match child_or_value {
+            None => panic!("could not find only child in node"),
+            Some((k, child_ptr)) => (
+                k,
+                NodeRef {
+                    ptr: child_ptr,
+                    phantom: self.phantom,
+                },
+            ),
+        }
+    }
+}
+
+impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
+    /// Releases the write lock if it has not been released explicitly. A null
+    /// `ptr` means `write_unlock()` or `write_unlock_obsolete()` already ran.
+    fn drop(&mut self) {
+        if !self.ptr.is_null() {
+            self.ptr.lockword().write_unlock();
+        }
+    }
+}
+
+/// Owning handle for a freshly allocated node that is not yet handed over to
+/// the caller as a raw pointer. If the handle is dropped before `into_ptr()` is
+/// called, the node and all nodes in `extra_nodes` are deallocated.
+pub(crate) struct NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    // The new node; reset to null by `into_ptr()` to disarm the Drop impl.
+    ptr: NodePtr<V>,
+    // Allocator used to deallocate the nodes on drop.
+    allocator: &'a A,
+
+    // Other newly allocated nodes that were inserted as children of `ptr` and
+    // must be deallocated together with it.
+    extra_nodes: Vec<NodePtr<V>>,
+}
+
+impl<'a, V, A> NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// Insert an existing, write-locked node as a child under `key_byte`.
+    ///
+    /// The child is not recorded in `extra_nodes`, so it is left alone if this
+    /// new node is dropped before `into_ptr()` is called.
+    pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
+        self.ptr.insert_child(key_byte, child.as_ptr())
+    }
+
+    /// Give up ownership of the new node and return its raw pointer.
+    ///
+    /// After this call the Drop impl deallocates nothing: neither the node itself
+    /// nor any node recorded in `extra_nodes`.
+    pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
+        let ptr = self.ptr;
+        // Null the pointer to disarm the Drop impl.
+        self.ptr = NodePtr::null();
+        ptr
+    }
+
+    /// Insert another newly allocated node as a child under `key_byte` and take
+    /// over responsibility for deallocating it if this node is dropped before
+    /// `into_ptr()` is called.
+    ///
+    /// The child's own `extra_nodes` are taken over as well. Calling `into_ptr()`
+    /// on the child disarms its Drop impl, so without this the nodes below the
+    /// child would never be deallocated on the failure path.
+    pub(crate) fn insert_new_child(&mut self, key_byte: u8, mut child: NewNodeRef<'a, V, A>) {
+        // Must happen before into_ptr(), which consumes `child`.
+        let inherited = std::mem::take(&mut child.extra_nodes);
+        let child_ptr = child.into_ptr();
+        self.ptr.insert_child(key_byte, child_ptr);
+        self.extra_nodes.push(child_ptr);
+        self.extra_nodes.extend(inherited);
+    }
+}
+
+impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// Frees the newly allocated node and every node recorded in `extra_nodes`,
+    /// unless ownership was already handed out through into_ptr().
+    fn drop(&mut self) {
+        // into_ptr() leaves a null pointer behind: nothing to clean up then.
+        if self.ptr.is_null() {
+            return;
+        }
+
+        self.ptr.deallocate(self.allocator);
+        for extra in &self.extra_nodes {
+            extra.deallocate(self.allocator);
+        }
+    }
+}
+
+/// Allocate a new internal node with the given prefix and wrap it in a
+/// `NewNodeRef` that has no extra nodes attached yet.
+pub(crate) fn new_internal<'a, V, A>(
+    prefix: &[u8],
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    let ptr = node_ptr::new_internal(prefix, allocator)?;
+    let node = NewNodeRef {
+        ptr,
+        allocator,
+        extra_nodes: Vec::new(),
+    };
+    Ok(node)
+}
+
+/// Allocate a new leaf node holding `value` with the given prefix and wrap it
+/// in a `NewNodeRef` that has no extra nodes attached yet.
+pub(crate) fn new_leaf<'a, V, A>(
+    prefix: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    let ptr = node_ptr::new_leaf(prefix, value, allocator)?;
+    let node = NewNodeRef {
+        ptr,
+        allocator,
+        extra_nodes: Vec::new(),
+    };
+    Ok(node)
+}
--- a/libs/neonart/src/allocator.rs
+++ b/libs/neonart/src/allocator.rs
@@ -0,0 +1,156 @@
+pub mod block;
+mod multislab;
+mod slab;
+pub mod r#static;
+
+use std::alloc::Layout;
+use std::marker::PhantomData;
+use std::mem::MaybeUninit;
+use std::sync::atomic::Ordering;
+
+use crate::allocator::multislab::MultiSlabAllocator;
+use crate::allocator::r#static::alloc_from_slice;
+
+use spin;
+
+use crate::Tree;
+pub use crate::algorithm::node_ptr::{
+    NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
+};
+
+/// Error type returned by node-allocating functions when no memory is available.
+#[derive(Debug)]
+pub struct OutOfMemoryError();
+
+/// Allocator interface used by the tree: one allocation function for the
+/// `Tree` itself, plus an alloc/dealloc pair for each node type.
+// NOTE(review): the trait does not say how an implementation reports exhaustion
+// (null pointer vs. panic) -- confirm with the callers in node_ptr.
+pub trait ArtAllocator<V: crate::Value> {
+    /// Returns memory for the `Tree` struct.
+    fn alloc_tree(&self) -> *mut Tree<V>;
+
+    // One allocation function per node type.
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
+
+    // Matching deallocation function for each node type.
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
+}
+
+/// `ArtAllocator` implementation backed by a `MultiSlabAllocator` with one slab
+/// per node type, all carved out of a single caller-provided memory area.
+pub struct ArtMultiSlabAllocator<'t, V>
+where
+    V: crate::Value,
+{
+    // Space reserved for the one and only Tree. `alloc_tree()` takes it out of
+    // the Option; a second call panics.
+    tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>,
+
+    // Five slabs, indexed in the order of `LAYOUTS`.
+    pub(crate) inner: MultiSlabAllocator<'t, 5>,
+
+    phantom_val: PhantomData<V>,
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    /// Layout of each slab's chunks. The position in this array is the slab index
+    /// that the `alloc_node_*` / `dealloc_node_*` functions pass to the inner
+    /// allocator: 0 = internal4, 1 = internal16, 2 = internal48, 3 = internal256,
+    /// 4 = leaf.
+    const LAYOUTS: [Layout; 5] = [
+        Layout::new::<NodeInternal4<V>>(),
+        Layout::new::<NodeInternal16<V>>(),
+        Layout::new::<NodeInternal48<V>>(),
+        Layout::new::<NodeInternal256<V>>(),
+        Layout::new::<NodeLeaf<V>>(),
+    ];
+
+    /// Create the allocator inside `area` itself: space for the allocator struct
+    /// is carved off first, then space for the `Tree`, and whatever remains is
+    /// handed to the inner multi-slab allocator.
+    // NOTE(review): how `alloc_from_slice` behaves when `area` is too small is not
+    // visible here -- confirm in allocator/static.rs.
+    pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
+        let (allocator_area, remain) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
+        let (tree_area, remain) = alloc_from_slice::<Tree<V>>(remain);
+
+        allocator_area.write(ArtMultiSlabAllocator {
+            tree_area: spin::Mutex::new(Some(tree_area)),
+            inner: MultiSlabAllocator::new(remain, &Self::LAYOUTS),
+            phantom_val: PhantomData,
+        })
+    }
+}
+
+impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
+    /// Hands out the pre-reserved tree area. Only the first call succeeds; any
+    /// later call panics.
+    fn alloc_tree(&self) -> *mut Tree<V> {
+        match self.tree_area.lock().take() {
+            Some(area) => area.as_mut_ptr().cast::<Tree<V>>(),
+            None => panic!("cannot allocate more than one tree"),
+        }
+    }
+
+    // Node allocation: each node type has its own slab, indexed as in LAYOUTS.
+
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
+        let chunk = self.inner.alloc_slab(0);
+        chunk.cast()
+    }
+
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
+        let chunk = self.inner.alloc_slab(1);
+        chunk.cast()
+    }
+
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
+        let chunk = self.inner.alloc_slab(2);
+        chunk.cast()
+    }
+
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
+        let chunk = self.inner.alloc_slab(3);
+        chunk.cast()
+    }
+
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
+        let chunk = self.inner.alloc_slab(4);
+        chunk.cast()
+    }
+
+    // Node deallocation: return the chunk to the slab it was allocated from.
+
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(0, chunk)
+    }
+
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(1, chunk)
+    }
+
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(2, chunk)
+    }
+
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(3, chunk)
+    }
+
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(4, chunk)
+    }
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    /// Snapshot the per-slab counters. Each counter is read with a separate
+    /// relaxed load, so the snapshot is not atomic as a whole.
+    pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
+        let descs = &self.inner.slab_descs;
+        let allocated = |slab: usize| descs[slab].num_allocated.load(Ordering::Relaxed);
+        let blocks = |slab: usize| descs[slab].num_blocks.load(Ordering::Relaxed);
+
+        ArtMultiSlabStats {
+            num_internal4: allocated(0),
+            num_internal16: allocated(1),
+            num_internal48: allocated(2),
+            num_internal256: allocated(3),
+            num_leaf: allocated(4),
+
+            num_blocks_internal4: blocks(0),
+            num_blocks_internal16: blocks(1),
+            num_blocks_internal48: blocks(2),
+            num_blocks_internal256: blocks(3),
+            num_blocks_leaf: blocks(4),
+        }
+    }
+}
+
+/// Counters reported by `ArtMultiSlabAllocator::get_statistics()`.
+#[derive(Clone, Debug)]
+pub struct ArtMultiSlabStats {
+    // Currently allocated chunks (nodes), per node type.
+    pub num_internal4: u64,
+    pub num_internal16: u64,
+    pub num_internal48: u64,
+    pub num_internal256: u64,
+    pub num_leaf: u64,
+
+    // Blocks obtained from the block allocator, per node type.
+    pub num_blocks_internal4: u64,
+    pub num_blocks_internal16: u64,
+    pub num_blocks_internal48: u64,
+    pub num_blocks_internal256: u64,
+    pub num_blocks_leaf: u64,
+}
--- a/libs/neonart/src/allocator/block.rs
+++ b/libs/neonart/src/allocator/block.rs
@@ -0,0 +1,191 @@
+//! Simple allocator of fixed-size blocks
+
+use std::mem::MaybeUninit;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use spin;
+
+/// Size of every block in bytes. `BlockAllocator::new` also aligns the first
+/// block to this value.
+pub const BLOCK_SIZE: usize = 16 * 1024;
+
+/// Sentinel block number meaning "no block": end of the free list, or allocation failure.
+const INVALID_BLOCK: u64 = u64::MAX;
+
+/// Allocator of fixed-size (`BLOCK_SIZE`) blocks from one contiguous memory area.
+pub(crate) struct BlockAllocator<'t> {
+    // The memory area, starting at the first BLOCK_SIZE-aligned address.
+    blocks_ptr: &'t [MaybeUninit<u8>],
+    // Number of whole blocks that fit in `blocks_ptr`.
+    num_blocks: u64,
+    // Number of blocks handed out so far from the never-used part of the area.
+    // Blocks with a number at or above this value have never been allocated.
+    num_initialized: AtomicU64,
+
+    // Block number of the first free-list block, or INVALID_BLOCK if there is none.
+    freelist_head: spin::Mutex<u64>,
+}
+
+/// Contents of a released block that has been repurposed as a free-list node.
+struct FreeListBlock {
+    inner: spin::Mutex<FreeListBlockInner>,
+}
+
+/// Payload of a free-list node: a link to the next node plus a small stack of
+/// free block numbers.
+struct FreeListBlockInner {
+    // Block number of the next free-list block, or INVALID_BLOCK.
+    next: u64,
+
+    // Number of valid entries at the start of `free_blocks`.
+    num_free_blocks: u64,
+    free_blocks: [u64; 100], // FIXME: fill the rest of the block
+}
+
+impl<'t> BlockAllocator<'t> {
+    /// Create a block allocator over `area`. The start of the area is skipped up
+    /// to the next BLOCK_SIZE-aligned address; the rest is divided into
+    /// `remain.len() / BLOCK_SIZE` blocks, none of which is touched yet.
+    ///
+    /// Panics if the required padding is larger than `area` (the slice index
+    /// below goes out of range).
+    pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
+        // Use all the space for the blocks
+        let padding = area.as_ptr().align_offset(BLOCK_SIZE);
+        let remain = &mut area[padding..];
+
+        let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
+
+        BlockAllocator {
+            blocks_ptr: remain,
+            num_blocks,
+            num_initialized: AtomicU64::new(0),
+            freelist_head: spin::Mutex::new(INVALID_BLOCK),
+        }
+    }
+
+    /// Interpret block `blkno` as a `FreeListBlock` and return a reference to it.
+    ///
+    /// Safety: you must hold a lock on the pointer to this block, otherwise it might get
+    /// reused for another kind of block. The function is not marked `unsafe`, so the
+    /// compiler does not enforce this; the callers in this file hold the lock on
+    /// `freelist_head` or on the preceding free-list block.
+    fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
+        let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
+        unsafe { ptr.as_ref().unwrap() }
+    }
+
+    /// Returns a pointer to the first byte of block `blkno`.
+    ///
+    /// Panics if `blkno` is not smaller than the number of blocks.
+    fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
+        assert!(blkno < self.num_blocks);
+        unsafe {
+            self.blocks_ptr
+                .as_ptr()
+                .byte_offset(blkno as isize * BLOCK_SIZE as isize)
+        }
+        // The area is stored as a shared slice; hand out a mutable byte pointer.
+        .cast_mut()
+        .cast()
+    }
+
+    /// Allocate one block and return it as a BLOCK_SIZE-long slice of
+    /// uninitialized bytes. Panics when no block is available.
+    #[allow(clippy::mut_from_ref)]
+    pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
+        // FIXME: handle OOM
+        let blkno = self.alloc_block_internal();
+        assert!(blkno != INVALID_BLOCK, "out of memory");
+
+        let start = self.get_block_ptr(blkno).cast::<MaybeUninit<u8>>();
+        unsafe { std::slice::from_raw_parts_mut(start, BLOCK_SIZE) }
+    }
+
+    /// Pick a block number to hand out, or return INVALID_BLOCK if none is left.
+    ///
+    /// The free list is consulted first. Only when it is empty is a block taken
+    /// from the part of the area that has never been used.
+    fn alloc_block_internal(&self) -> u64 {
+        // First choice: the free list.
+        {
+            let mut head = self.freelist_head.lock();
+            if *head != INVALID_BLOCK {
+                let head_block = self.read_freelist_block(*head);
+
+                // Lock coupling: take the lock on the free-list block while the
+                // lock on the head is still held.
+                let mut entries = head_block.inner.lock();
+
+                if entries.num_free_blocks == 0 {
+                    // Nothing is recorded in this free-list block, so hand out
+                    // the free-list block itself and unlink it from the list.
+                    let blkno = *head;
+                    *head = entries.next;
+                    // From here on the block is unlinked and may be repurposed.
+                    drop(entries);
+                    return blkno;
+                }
+
+                // Pop the most recently recorded free block.
+                entries.num_free_blocks -= 1;
+                let blkno = entries.free_blocks[entries.num_free_blocks as usize];
+                return blkno;
+            }
+        }
+
+        // Second choice: claim the next never-used block, if any remain. On
+        // success fetch_update yields the previous counter value, which is the
+        // number of the block that was just claimed.
+        self.num_initialized
+            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |used| {
+                (used < self.num_blocks).then_some(used + 1)
+            })
+            // The counter already reached num_blocks: out of blocks.
+            .unwrap_or(INVALID_BLOCK)
+    }
+
+    /// Return the block starting at `block_ptr` to the free list. The block number
+    /// is derived from the pointer's byte offset from the start of the block area.
+    // TODO: this is currently unused. The slab allocator never releases blocks
+    #[allow(dead_code)]
+    pub(crate) fn release_block(&self, block_ptr: *mut u8) {
+        let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
+        self.release_block_internal(blockno as u64);
+    }
+
+    /// Put block `blockno` on the free list.
+    ///
+    /// If the free-list block at the head still has room, the block number is
+    /// recorded there. Otherwise (no free-list block yet, or the head one is full)
+    /// the released block itself is turned into a new free-list block and becomes
+    /// the new head.
+    fn release_block_internal(&self, blockno: u64) {
+        let mut freelist_head = self.freelist_head.lock();
+        if *freelist_head != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(*freelist_head);
+
+            // acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
+            let mut g = freelist_block.inner.lock();
+
+            let num_free_blocks = g.num_free_blocks;
+            if num_free_blocks < g.free_blocks.len() as u64 {
+                g.free_blocks[num_free_blocks as usize] = blockno;
+                g.num_free_blocks += 1;
+                return;
+            }
+        }
+
+        // Convert the block into a new freelist block
+        let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
+        let init = FreeListBlock {
+            inner: spin::Mutex::new(FreeListBlockInner {
+                next: *freelist_head,
+                num_free_blocks: 0,
+                free_blocks: [INVALID_BLOCK; 100],
+            }),
+        };
+        // The block holds arbitrary bytes left behind by its previous user, so it
+        // must be initialized with a raw write. A plain `*block_ptr = init`
+        // assignment would first drop those bytes as if they were a valid
+        // FreeListBlock.
+        unsafe { block_ptr.write(init) };
+        *freelist_head = blockno;
+    }
+
+    /// Walk the whole free list and return block counts.
+    ///
+    /// `num_free_blocks` is the sum of the entries recorded in the free-list
+    /// blocks; the free-list blocks themselves are not included in the sum.
+    // for debugging
+    pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
+        let mut num_free_blocks = 0;
+
+        // Hand-over-hand locking: `_prev_lock` and `_head_lock` exist only to keep
+        // the previous lock alive until the next free-list block has been locked.
+        let mut _prev_lock = None;
+        let head_lock = self.freelist_head.lock();
+        let mut next_blk = *head_lock;
+        let mut _head_lock = Some(head_lock);
+        while next_blk != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(next_blk);
+            let lock = freelist_block.inner.lock();
+            num_free_blocks += lock.num_free_blocks;
+            next_blk = lock.next;
+            _prev_lock = Some(lock); // hold the lock until we've read the next block
+            // The head lock is no longer needed once the first block is locked.
+            _head_lock = None;
+        }
+
+        BlockAllocatorStats {
+            num_blocks: self.num_blocks,
+            num_initialized: self.num_initialized.load(Ordering::Relaxed),
+            num_free_blocks,
+        }
+    }
+}
+
+/// Counters reported by `BlockAllocator::get_statistics()`.
+#[derive(Clone, Debug)]
+pub struct BlockAllocatorStats {
+    // Total number of blocks in the area.
+    pub num_blocks: u64,
+    // Number of blocks that have been handed out at least once.
+    pub num_initialized: u64,
+    // Number of block numbers recorded in the free-list blocks.
+    pub num_free_blocks: u64,
+}
--- a/libs/neonart/src/allocator/multislab.rs
+++ b/libs/neonart/src/allocator/multislab.rs
@@ -0,0 +1,33 @@
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+
+use crate::allocator::block::BlockAllocator;
+use crate::allocator::slab::SlabDesc;
+
+/// `N` slab allocators, one per chunk layout, that share a single block allocator.
+pub struct MultiSlabAllocator<'t, const N: usize> {
+    pub(crate) block_allocator: BlockAllocator<'t>,
+
+    // One descriptor per layout, in the order the layouts were passed to `new`.
+    pub(crate) slab_descs: [SlabDesc; N],
+}
+
+impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
+    /// Create a block allocator over `area` and one empty slab descriptor for
+    /// each entry of `layouts`.
+    pub(crate) fn new(
+        area: &'t mut [MaybeUninit<u8>],
+        layouts: &[Layout; N],
+    ) -> MultiSlabAllocator<'t, N> {
+        let block_allocator = BlockAllocator::new(area);
+        MultiSlabAllocator {
+            block_allocator,
+
+            slab_descs: std::array::from_fn(|i| SlabDesc::new(&layouts[i])),
+        }
+    }
+
+    /// Allocate one chunk from slab `slab_idx`. Panics if the index is out of range.
+    pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
+        self.slab_descs[slab_idx].alloc_chunk(&self.block_allocator)
+    }
+
+    /// Return the chunk at `ptr` to slab `slab_idx`. Panics if the index is out
+    /// of range.
+    pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
+        self.slab_descs[slab_idx].dealloc_chunk(ptr, &self.block_allocator)
+    }
+}
--- a/libs/neonart/src/allocator/slab.rs
+++ b/libs/neonart/src/allocator/slab.rs
@@ -0,0 +1,433 @@
+//! A slab allocator that carves out fixed-size chunks from larger blocks.
+//!
+//!
+
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+use std::ops::Deref;
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+
+use spin;
+
+use super::alloc_from_slice;
+use super::block::BlockAllocator;
+
+use crate::allocator::block::BLOCK_SIZE;
+
+/// State of one slab: an allocator for chunks of a single `layout`, carved out
+/// of blocks obtained from a `BlockAllocator`.
+pub(crate) struct SlabDesc {
+    // Size and alignment of every chunk in this slab.
+    pub(crate) layout: Layout,
+
+    // The slab's blocks, split into a "full" and a "nonfull" list.
+    block_lists: spin::RwLock<BlockLists>,
+
+    // Statistics: number of blocks obtained so far and number of chunks
+    // currently allocated.
+    pub(crate) num_blocks: AtomicU64,
+    pub(crate) num_allocated: AtomicU64,
+}
+
+// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
+// 'block_lists' contains pointers when it's not empty. In the current use as part of the
+// the art tree, SlabDescs are only moved during initialization.
+unsafe impl Sync for SlabDesc {}
+unsafe impl Send for SlabDesc {}
+
+/// The two block lists of a slab. Blocks are moved to `full_blocks` lazily, so a
+/// block in `nonfull_blocks` can turn out to have no free chunk left.
+#[derive(Default, Debug)]
+struct BlockLists {
+    full_blocks: BlockList,
+    nonfull_blocks: BlockList,
+}
+
+impl BlockLists {
+    /// Unlink a node. It must be in either one of the two lists.
+    ///
+    /// The caller does not have to know which list the node is in. That only
+    /// matters when the node is a list's head or tail, and in that case it is
+    /// found by comparing the node against the head/tail of `full_blocks`.
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        let list = unsafe {
+            if (*elem).next.is_null() {
+                // Last node of its list: identify the list by its tail pointer.
+                if self.full_blocks.tail == elem {
+                    Some(&mut self.full_blocks)
+                } else {
+                    Some(&mut self.nonfull_blocks)
+                }
+            } else if (*elem).prev.is_null() {
+                // First node of its list: identify the list by its head pointer.
+                if self.full_blocks.head == elem {
+                    Some(&mut self.full_blocks)
+                } else {
+                    Some(&mut self.nonfull_blocks)
+                }
+            } else {
+                // Interior node: only its neighbours need updating.
+                None
+            }
+        };
+        unsafe { unlink_slab_block(list, elem) };
+    }
+}
+
+/// Remove `elem` from the doubly-linked list it is in.
+///
+/// `list` must be `Some` with the list that contains `elem` whenever `elem` is
+/// that list's head or tail; it is not used for an interior node. The function
+/// panics if the links are inconsistent, or if `list` is `None` when it is needed.
+/// `elem`'s own `prev`/`next` fields are left unchanged.
+unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
+    unsafe {
+        // Fix up whatever pointed at `elem` from behind it.
+        if (*elem).next.is_null() {
+            assert_eq!(list.as_ref().unwrap().tail, elem);
+            list.as_mut().unwrap().tail = (*elem).prev;
+        } else {
+            assert_eq!((*(*elem).next).prev, elem);
+            (*(*elem).next).prev = (*elem).prev;
+        }
+        // Fix up whatever pointed at `elem` from in front of it.
+        if (*elem).prev.is_null() {
+            assert_eq!(list.as_ref().unwrap().head, elem);
+            list.as_mut().unwrap().head = (*elem).next;
+        } else {
+            assert_eq!((*(*elem).prev).next, elem);
+            (*(*elem).prev).next = (*elem).next;
+        }
+    }
+}
+
+/// Doubly-linked list of slab blocks, linked through the `prev`/`next` fields of
+/// `SlabBlockHeader`. Both pointers are null when the list is empty.
+#[derive(Debug)]
+struct BlockList {
+    head: *mut SlabBlockHeader,
+    tail: *mut SlabBlockHeader,
+}
+
+impl Default for BlockList {
+    /// An empty list: both ends are null.
+    fn default() -> Self {
+        let head = std::ptr::null_mut();
+        let tail = std::ptr::null_mut();
+        BlockList { head, tail }
+    }
+}
+
+impl BlockList {
+    /// Link `elem` in as the new first node of the list.
+    unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
+        unsafe {
+            let old_head = self.head;
+
+            // The new node has no predecessor; its successor is the previous
+            // head, which is null when the list was empty.
+            (*elem).prev = std::ptr::null_mut();
+            (*elem).next = old_head;
+
+            if old_head.is_null() {
+                // First node in the list: it is the tail as well.
+                self.tail = elem;
+            } else {
+                (*old_head).prev = elem;
+            }
+            self.head = elem;
+        }
+    }
+
+    /// True when the list has no nodes.
+    fn is_empty(&self) -> bool {
+        self.head.is_null()
+    }
+
+    /// Remove `elem`, which must be a node of this list.
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        unsafe { unlink_slab_block(Some(self), elem) }
+    }
+
+    /// Print every block in the list with its free/total chunk counts.
+    #[cfg(test)]
+    fn dump(&self) {
+        let mut cursor = self.head;
+
+        while !cursor.is_null() {
+            let header = unsafe { cursor.as_ref() }.unwrap();
+            let free = header.num_free_chunks.load(Ordering::Relaxed);
+            eprintln!("  blk {:?} (free {}/{})", cursor, free, header.num_chunks);
+            cursor = header.next;
+        }
+    }
+}
+
+impl SlabDesc {
+    /// Create an empty slab for chunks of the given layout: no blocks, empty
+    /// block lists, counters at zero.
+    pub(crate) fn new(layout: &Layout) -> SlabDesc {
+        SlabDesc {
+            layout: *layout,
+            block_lists: spin::RwLock::new(BlockLists::default()),
+            num_allocated: AtomicU64::new(0),
+            num_blocks: AtomicU64::new(0),
+        }
+    }
+}
+
+/// Header stored at the start of every block owned by a slab. The chunks follow
+/// it in the same block.
+#[derive(Debug)]
+struct SlabBlockHeader {
+    // Head of the singly-linked list of free chunks in this block (null if none).
+    free_chunks_head: spin::Mutex<*mut FreeChunk>,
+    // Number of chunks currently on the free list.
+    num_free_chunks: AtomicU32,
+    num_chunks: u32, // this is really a constant for a given Layout
+
+    // these fields are protected by the lock on the BlockLists
+    prev: *mut SlabBlockHeader,
+    next: *mut SlabBlockHeader,
+}
+
+/// Free-list link written into the memory of a chunk while the chunk is free.
+struct FreeChunk {
+    // Next free chunk in the same block, or null.
+    next: *mut FreeChunk,
+}
+
+/// Holds either a read guard or a write guard of a `spin::RwLock`, so that code
+/// can start with a read lock and retry with a write lock through one variable.
+enum ReadOrWriteGuard<'a, T> {
+    Read(spin::RwLockReadGuard<'a, T>),
+    Write(spin::RwLockWriteGuard<'a, T>),
+}
+
+impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
+    type Target = T;
+
+    /// Shared access to the protected value, whichever kind of guard is held.
+    fn deref(&self) -> &T {
+        match self {
+            ReadOrWriteGuard::Read(guard) => &**guard,
+            ReadOrWriteGuard::Write(guard) => &**guard,
+        }
+    }
+}
+
+impl SlabDesc {
+    /// Allocate one chunk from this slab and return a pointer to it.
+    ///
+    /// A free chunk in the block at the head of the nonfull list is used if there
+    /// is one. Head blocks that turn out to be full are moved to the full list
+    /// (which requires retrying with the write lock). When the nonfull list is
+    /// empty, a new block is obtained from `block_allocator`.
+    pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
+        // Are there any free chunks?
+        let mut acquire_write = false;
+        'outer: loop {
+            // The first pass takes only the read lock; a retry takes the write lock.
+            let mut block_lists_guard = if acquire_write {
+                ReadOrWriteGuard::Write(self.block_lists.write())
+            } else {
+                ReadOrWriteGuard::Read(self.block_lists.read())
+            };
+            'inner: loop {
+                let block_ptr = block_lists_guard.nonfull_blocks.head;
+                if block_ptr.is_null() {
+                    // No candidate block left: fall through to allocating a new one.
+                    break 'outer;
+                }
+                unsafe {
+                    let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+                    if !(*free_chunks_head).is_null() {
+                        // Pop the first chunk off the block's free list.
+                        let result = *free_chunks_head;
+                        (*free_chunks_head) = (*result).next;
+                        let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
+
+                        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+                        return result.cast();
+                    }
+                }
+
+                // The block at the head of the list was full. Grab write lock and retry
+                match block_lists_guard {
+                    ReadOrWriteGuard::Read(_) => {
+                        acquire_write = true;
+                        continue 'outer;
+                    }
+                    ReadOrWriteGuard::Write(ref mut g) => {
+                        // move the node to the list of full blocks
+                        unsafe {
+                            g.nonfull_blocks.unlink(block_ptr);
+                            g.full_blocks.push_head(block_ptr);
+                        };
+                        continue 'inner;
+                    }
+                }
+            }
+        }
+
+        // no free chunks. Allocate a new block (and the chunk from that)
+        let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
+        self.num_blocks.fetch_add(1, Ordering::Relaxed);
+
+        // Add the block to the list in the SlabDesc
+        unsafe {
+            let mut block_lists_guard = self.block_lists.write();
+            block_lists_guard.nonfull_blocks.push_head(new_block);
+        }
+        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+        new_chunk
+    }
+
+    /// Return the chunk at `chunk_ptr` to its block's free list.
+    ///
+    /// If this gives the block its first free chunk, the block is moved to the
+    /// head of the nonfull list. A block that becomes completely free is not
+    /// released yet (see the TODO below); `_block_allocator` is unused for now.
+    pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
+        // Find the block it belongs to. You can find the block from the address. (And knowing the
+        // layout, you could calculate the chunk number too.)
+        let block_ptr: *mut SlabBlockHeader = {
+            // Round the address down to a multiple of BLOCK_SIZE to get the block start.
+            let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE;
+            chunk_ptr.with_addr(block_addr).cast()
+        };
+        let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
+
+        // Mark the chunk as free in 'freechunks' list
+        let num_chunks;
+        let num_free_chunks;
+        unsafe {
+            let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+            (*chunk_ptr).next = *free_chunks_head;
+            *free_chunks_head = chunk_ptr;
+
+            // fetch_add returns the old count; +1 gives the count including this chunk.
+            num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
+            num_chunks = (*block_ptr).num_chunks;
+        }
+
+        if num_free_chunks == 1 {
+            // If the block was full previously, add it to the nonfull blocks list. Note that
+            // we're not holding the lock anymore, so it can immediately become full again.
+            // That's harmless, it will be moved back to the full list again when a call
+            // to alloc_chunk() sees it.
+            let mut block_lists = self.block_lists.write();
+            unsafe {
+                // The block may be in either list at this point, because blocks are
+                // moved to the full list lazily. BlockLists::unlink handles both.
+                block_lists.unlink(block_ptr);
+                block_lists.nonfull_blocks.push_head(block_ptr);
+            };
+        } else if num_free_chunks == num_chunks {
+            // If the block became completely empty, move it to the free list
+            // TODO
+            // FIXME: we're still holding the spinlock. It's not exactly safe to return it to
+            // the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
+            //block_allocator.release_block()
+        }
+
+        // update stats
+        self.num_allocated.fetch_sub(1, Ordering::Relaxed);
+    }
+
+    /// Allocate a fresh block from `block_allocator`, carve it into chunks of `self.layout`, and
+    /// reserve the first chunk for the caller.
+    ///
+    /// The block starts with a `SlabBlockHeader`, followed by alignment padding and then the
+    /// chunks. All chunks except the first are threaded onto the block's free-chunk list.
+    ///
+    /// Returns the initialized block header and a pointer to the reserved first chunk. The
+    /// returned block is not linked into any block list; the caller does that.
+    ///
+    /// # Panics
+    ///
+    /// Panics if not even one chunk fits in the block after the header and padding.
+    fn alloc_block_and_chunk(
+        &self,
+        block_allocator: &BlockAllocator,
+    ) -> (*mut SlabBlockHeader, *mut u8) {
+        // fixme: handle OOM
+        let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
+        let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
+
+        let padding = remain.as_ptr().align_offset(self.layout.align());
+        assert!(
+            padding <= remain.len(),
+            "slab block too small to align chunks"
+        );
+
+        let num_chunks = (remain.len() - padding) / self.layout.size();
+        // Without this check, `num_chunks - 1` below would wrap around in release builds and
+        // the linking loop would write far outside the block.
+        assert!(num_chunks > 0, "slab chunk does not fit in a block");
+
+        let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
+
+        unsafe {
+            // Link every chunk to its successor; the last one terminates the list.
+            let mut chunk_ptr = first_chunk_ptr;
+            for _ in 0..num_chunks - 1 {
+                let next_chunk_ptr = chunk_ptr.byte_add(self.layout.size());
+                (*chunk_ptr).next = next_chunk_ptr;
+                chunk_ptr = next_chunk_ptr;
+            }
+            (*chunk_ptr).next = std::ptr::null_mut();
+
+            let result_chunk = first_chunk_ptr;
+
+            // The first chunk is handed out right away, so the free list starts at the second
+            // chunk and the free count is one less than the total.
+            let block_header = block_header.write(SlabBlockHeader {
+                free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
+                prev: std::ptr::null_mut(),
+                next: std::ptr::null_mut(),
+                num_chunks: num_chunks as u32,
+                num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
+            });
+
+            (block_header, result_chunk.cast())
+        }
+    }
+
+    /// Test-only helper: print the slab's counters and the contents of both block lists to
+    /// stderr.
+    #[cfg(test)]
+    fn dump(&self) {
+        let block_count = self.num_blocks.load(Ordering::Relaxed);
+        let allocated_count = self.num_allocated.load(Ordering::Relaxed);
+        eprintln!(
+            "slab dump ({} blocks, {} allocated chunks)",
+            block_count, allocated_count
+        );
+
+        // Hold the read lock while walking both lists.
+        let guard = self.block_lists.read();
+        eprintln!("nonfull blocks:");
+        guard.nonfull_blocks.dump();
+        eprintln!("full blocks:");
+        guard.full_blocks.dump();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use rand::Rng;
+    use rand_distr::Zipf;
+
+    /// Dummy payload for the slab tests. The `_dummy` padding makes each object larger than a
+    /// quarter of `BLOCK_SIZE`, so at most three objects can fit in `BLOCK_SIZE` bytes.
+    struct TestObject {
+        val: usize,
+        _dummy: [u8; BLOCK_SIZE / 4],
+    }
+
+    /// A `SlabDesc` sized for `TestObject`, bundled with the block allocator that backs it.
+    struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>);
+    impl<'a> TestObjectSlab<'a> {
+        /// Create a slab for `TestObject`s that draws its blocks from `block_allocator`.
+        fn new(block_allocator: BlockAllocator) -> TestObjectSlab {
+            TestObjectSlab(SlabDesc::new(&Layout::new::<TestObject>()), block_allocator)
+        }
+
+        /// Allocate one object and set its `val` field. `_dummy` is left uninitialized.
+        fn alloc(&self, val: usize) -> *mut TestObject {
+            let obj: *mut TestObject = self.0.alloc_chunk(&self.1).cast();
+            unsafe { (*obj).val = val };
+            obj
+        }
+
+        /// Return an object previously obtained from `alloc` to the slab.
+        fn dealloc(&self, obj: *mut TestObject) {
+            self.0.dealloc_chunk(obj.cast(), &self.1)
+        }
+    }
+
+    /// Allocate a handful of objects, verify their contents, then randomly free and reallocate
+    /// them, checking on every free that the stored value is still intact.
+    #[test]
+    fn test_slab_alloc() {
+        const MEM_SIZE: usize = 100000000;
+        let mut area = Box::new_uninit_slice(MEM_SIZE);
+        let block_allocator = BlockAllocator::new(&mut area);
+
+        let slab = TestObjectSlab::new(block_allocator);
+
+        // `all[i]` holds the live object whose value is `i`, or null while that slot is freed.
+        let mut all: Vec<*mut TestObject> = Vec::new();
+        for i in 0..11 {
+            all.push(slab.alloc(i));
+        }
+        #[allow(clippy::needless_range_loop)]
+        for i in 0..11 {
+            assert!(unsafe { (*all[i]).val == i });
+        }
+
+        // NOTE(review): presumably Zipf::new(10.0, 1.1) yields values in 1..=10, which keeps
+        // `idx` within the 11 entries of `all` and means slot 0 is never toggled - confirm
+        // against the rand_distr documentation.
+        let distribution = Zipf::new(10.0, 1.1).unwrap();
+        let mut rng = rand::rng();
+        for _ in 0..100000 {
+            // NOTE(review): this dumps the whole slab to stderr on every one of the 100000
+            // iterations; consider dumping less often if the test turns out to be slow.
+            slab.0.dump();
+            let idx = rng.sample(distribution) as usize;
+            let ptr: *mut TestObject = all[idx];
+            if !ptr.is_null() {
+                // Slot is live: check its contents, then free it.
+                assert_eq!(unsafe { (*ptr).val }, idx);
+                slab.dealloc(ptr);
+                all[idx] = std::ptr::null_mut();
+            } else {
+                // Slot is empty: allocate it again.
+                all[idx] = slab.alloc(idx);
+            }
+        }
+    }
+
+    /// Heap-allocate a standalone block header for the linked-list test. `i` is stored in
+    /// `num_chunks` only to tell the blocks apart. The box is turned into a raw pointer and is
+    /// never freed.
+    fn new_test_blk(i: u32) -> *mut SlabBlockHeader {
+        Box::into_raw(Box::new(SlabBlockHeader {
+            free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
+            num_free_chunks: AtomicU32::new(0),
+            num_chunks: i,
+            prev: std::ptr::null_mut(),
+            next: std::ptr::null_mut(),
+        }))
+    }
+
+    /// Exercise `BlockList::push_head` and `BlockList::unlink` with one and with two blocks.
+    #[test]
+    fn test_block_linked_list() {
+        // note: these are leaked, but that's OK for tests
+        let a = new_test_blk(0);
+        let b = new_test_blk(1);
+
+        let mut list = BlockList::default();
+        assert!(list.is_empty());
+
+        // Single element: push, then unlink leaves the list empty again.
+        unsafe {
+            list.push_head(a);
+            assert!(!list.is_empty());
+            list.unlink(a);
+        }
+        assert!(list.is_empty());
+
+        // Two elements: the block pushed last becomes the head, and the links go both ways.
+        unsafe {
+            list.push_head(b);
+            list.push_head(a);
+            assert_eq!(list.head, a);
+            assert_eq!((*a).next, b);
+            assert_eq!((*b).prev, a);
+            assert_eq!(list.tail, b);
+
+            list.unlink(a);
+            list.unlink(b);
+            assert!(list.is_empty());
+        }
+    }
+}
--- a/libs/neonart/src/allocator/static.rs
+++ b/libs/neonart/src/allocator/static.rs
@@ -0,0 +1,44 @@
+use std::mem::MaybeUninit;
+
+/// Carve room for a single `T` off the front of `area`.
+///
+/// Leading bytes are skipped as needed so that the value is properly aligned. Returns the
+/// still-uninitialized slot for the value, and whatever is left of `area` after it.
+///
+/// # Panics
+///
+/// Panics with "out of memory" if `area` cannot hold the padding plus one `T`.
+pub fn alloc_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
+    let size = std::mem::size_of::<T>();
+    let align = std::mem::align_of::<T>();
+
+    // Number of leading bytes to skip so that the slot starts at an aligned address
+    let skip = area.as_mut_ptr().align_offset(align);
+    assert!(skip + size <= area.len(), "out of memory");
+
+    let (slot_bytes, rest) = area[skip..].split_at_mut(size);
+
+    // The slot is aligned and `size` bytes long, and a slice pointer is never null.
+    let slot_ptr = slot_bytes.as_mut_ptr().cast::<MaybeUninit<T>>();
+    let slot = unsafe { &mut *slot_ptr };
+
+    (slot, rest)
+}
+
+/// Carve room for an array of `len` elements of type `T` off the front of `area`.
+///
+/// Leading bytes are skipped as needed so that the array is properly aligned. Returns the
+/// still-uninitialized array, and whatever is left of `area` after it.
+///
+/// # Panics
+///
+/// Panics with "out of memory" if `area` cannot hold the padding plus the array, including
+/// when the requested size does not fit in a `usize`.
+pub fn alloc_array_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+    len: usize,
+) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+
+    let area_start = area.as_mut_ptr();
+
+    // pad to satisfy alignment requirements
+    let padding = area_start.align_offset(layout.align());
+
+    // Use checked arithmetic: a wrapped-around size could otherwise slip past the bounds
+    // check in release builds and produce a slice reaching far beyond `area`.
+    let array_size = layout.size().checked_mul(len).expect("out of memory");
+    let total_size = padding.checked_add(array_size).expect("out of memory");
+    if total_size > area.len() {
+        panic!("out of memory");
+    }
+    let area = &mut area[padding..];
+    let (result_area, remain) = area.split_at_mut(array_size);
+
+    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
+    // Build the slice straight from the raw pointer, which covers all of `result_area`,
+    // rather than from a reference to just the first element. The pointer is aligned and
+    // non-null, and `result_area` is exactly `len` elements long.
+    let result = unsafe { std::slice::from_raw_parts_mut(result_ptr, len) };
+
+    (result, remain)
+}
--- a/libs/neonart/src/epoch.rs
+++ b/libs/neonart/src/epoch.rs
@@ -0,0 +1,142 @@
+//! This is similar to crossbeam_epoch crate, but works in shared memory
+
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+
+use crossbeam_utils::CachePadded;
+
+/// Number of participant slots. Each slot can hold at most one pin at a time.
+const NUM_SLOTS: usize = 1000;
+
+/// This is the struct that is stored in shmem
+///
+/// Each entry in `participants` packs two things into one u64:
+///
+/// bit 0: is it pinned or not?
+/// rest of the bits are the epoch counter.
+pub struct EpochShared {
+    /// The current global epoch. It starts at 2 and is advanced in steps of 2, which keeps
+    /// bit 0 clear.
+    global_epoch: AtomicU64,
+    /// One slot per concurrently pinned participant, in the format described above.
+    participants: [CachePadded<AtomicU64>; NUM_SLOTS],
+
+    /// Taken (with try_lock) for the duration of a broadcast, so that only one runs at a time.
+    broadcast_lock: spin::Mutex<()>,
+}
+
+impl EpochShared {
+    /// Create the shared state with the global epoch and every slot at epoch 2, unpinned.
+    pub fn new() -> EpochShared {
+        EpochShared {
+            global_epoch: AtomicU64::new(2),
+            participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
+            broadcast_lock: spin::Mutex::new(()),
+        }
+    }
+
+    /// Create a handle through which a participant can pin epochs. The handle starts out
+    /// with slot 0 as its slot hint.
+    pub fn register(&self) -> LocalHandle {
+        LocalHandle {
+            global: self,
+            last_slot: AtomicUsize::new(0), // todo: choose more intelligently
+        }
+    }
+
+    /// Release the pin on `slot` by overwriting the slot with the current global epoch. The
+    /// global epoch has bit 0 clear, so this also clears the slot's pinned bit. `_epoch` is
+    /// not used.
+    fn release_pin(&self, slot: usize, _epoch: u64) {
+        let global_epoch = self.global_epoch.load(Ordering::Relaxed);
+        self.participants[slot].store(global_epoch, Ordering::Relaxed);
+    }
+
+    /// Find a free slot, starting the search at `slot_hint`, and mark it as pinned.
+    ///
+    /// Returns the slot number and the epoch value the slot held before it was pinned. Keeps
+    /// cycling through the slots until one is free, so this spins for as long as every slot
+    /// is pinned.
+    fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
+        // pick a slot
+        let mut slot = slot_hint;
+        let epoch = loop {
+            // Setting bit 0 on a slot that is already pinned changes nothing, so a failed
+            // attempt does not disturb the slot's current owner.
+            let old = self.participants[slot].fetch_or(1, Ordering::Relaxed);
+            if old & 1 == 0 {
+                // Got this slot
+                break old;
+            }
+
+            // the slot was busy by another thread / process. try a different slot
+            slot += 1;
+            if slot == NUM_SLOTS {
+                slot = 0;
+            }
+            continue;
+        };
+        (slot, epoch)
+    }
+
+    /// Advance the global epoch by one step (+2) and return the new epoch.
+    pub(crate) fn advance(&self) -> u64 {
+        // Advance the global epoch
+        let old_epoch = self.global_epoch.fetch_add(2, Ordering::Relaxed);
+        // Anyone who releases their pin after this will update their slot.
+        old_epoch + 2
+    }
+
+    /// Bring unpinned slots that are exactly one step behind up to the current global epoch.
+    ///
+    /// Does nothing if another broadcast is already in progress. Pinned slots have bit 0 set
+    /// and therefore never match the compare-exchange; slots that are more than one step
+    /// behind do not match either and are left as they are.
+    pub(crate) fn broadcast(&self) {
+        let Some(_guard) = self.broadcast_lock.try_lock() else {
+            return;
+        };
+
+        let epoch = self.global_epoch.load(Ordering::Relaxed);
+        let old_epoch = epoch.wrapping_sub(2);
+
+        // Update all free slots.
+        for i in 0..NUM_SLOTS {
+            // TODO: check result, as a sanity check. It should either be the old epoch, or pinned
+            let _ = self.participants[i].compare_exchange(
+                old_epoch,
+                epoch,
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            );
+        }
+
+        // FIXME: memory fence here, since we used Relaxed?
+    }
+
+    /// Return the slot value that is furthest behind the current global epoch, or the global
+    /// epoch itself if no slot is behind it.
+    ///
+    /// NOTE(review): the returned value is the raw slot content, so it has bit 0 set when the
+    /// oldest slot is pinned. A caller comparing with `<` against an even epoch E will then
+    /// see E as older than a pin at E - confirm that this is intended.
+    pub(crate) fn get_oldest(&self) -> u64 {
+        // Read all slots.
+        let now = self.global_epoch.load(Ordering::Relaxed);
+        let mut oldest = now;
+        for i in 0..NUM_SLOTS {
+            let this_epoch = self.participants[i].load(Ordering::Relaxed);
+            // Wrapping distance from the slot to `now`. A huge distance means the slot is
+            // actually ahead of the `now` we loaded.
+            let delta = now.wrapping_sub(this_epoch);
+            if delta > u64::MAX / 2 {
+                // this is very recent
+            } else if delta > now.wrapping_sub(oldest) {
+                oldest = this_epoch;
+            }
+        }
+        oldest
+    }
+
+    /// Return the current global epoch.
+    pub(crate) fn get_current(&self) -> u64 {
+        self.global_epoch.load(Ordering::Relaxed)
+    }
+}
+
+/// A pin on one participant slot. The slot is released when the pin is dropped.
+pub(crate) struct EpochPin<'e> {
+    /// Index of the pinned slot in `EpochShared::participants`
+    slot: usize,
+    /// The value the slot held just before it was pinned
+    pub(crate) epoch: u64,
+
+    /// The handle this pin was created from
+    handle: &'e LocalHandle<'e>,
+}
+
+impl<'e> Drop for EpochPin<'e> {
+    /// Release the slot, making it available for pinning again.
+    fn drop(&mut self) {
+        self.handle.global.release_pin(self.slot, self.epoch);
+    }
+}
+
+/// A participant's handle to the shared epoch state, used to create pins.
+pub struct LocalHandle<'g> {
+    /// The shared epoch state this handle belongs to
+    global: &'g EpochShared,
+
+    /// The slot that the most recent pin ended up in; used as the starting point when
+    /// looking for a free slot the next time.
+    last_slot: AtomicUsize,
+}
+
+impl<'g> LocalHandle<'g> {
+    /// Pin a participant slot and return a guard that releases it again when dropped.
+    ///
+    /// The search for a free slot starts at the slot used by the previous pin, and the slot
+    /// that was actually obtained is remembered for the next call.
+    ///
+    /// The returned pin borrows this handle; the lifetime is written out as `'_` so that the
+    /// borrow is visible in the signature.
+    pub fn pin(&self) -> EpochPin<'_> {
+        let (slot, epoch) = self
+            .global
+            .pin_internal(self.last_slot.load(Ordering::Relaxed));
+        self.last_slot.store(slot, Ordering::Relaxed);
+        EpochPin {
+            handle: self,
+            epoch,
+            slot,
+        }
+    }
+}
--- a/libs/neonart/src/lib.rs
+++ b/libs/neonart/src/lib.rs
@@ -0,0 +1,583 @@
+//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
+//!
+//! The data structure is described in these two papers:
+//!
+//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
+//!     The adaptive radix tree: ARTful indexing for main-memory databases.
+//!     Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
+//!     https://db.in.tum.de/~leis/papers/ART.pdf
+//!
+//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
+//!     The ART of practical synchronization.
+//!     1-8. 10.1145/2933349.2933352.
+//!     https://db.in.tum.de/~leis/papers/artsync.pdf
+//!
+//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
+//! use.
+//!
+//! The papers mention a few different variants. We have made the following choices in this
+//! implementation:
+//!
+//! - All keys have the same length
+//!
+//! - Single-value leaves.
+//!
+//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
+//!   variable length "prefix", which stores the keys of all the one-way nodes which have been
+//!   removed. However, similar to the "hybrid" approach described in the paper, each node only has
+//!   space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
+//!   create one-way nodes to store them. (There was no particular reason for this choice,
+//!   the "hybrid" approach described in the paper might be better.)
+//!
+//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
+//!   ROWEX, which generally performs better when there is contention, but that is not important
+//!   for us, and Optimistic Lock Coupling is simpler to implement.
+//!
+//! ## Requirements
+//!
+//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
+//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
+//! requirements, which is why we had to write our own. Namely:
+//!
+//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
+//!   built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
+//!   feature, which is still nightly-only and experimental as of this writing).
+//!
+//! - The data structure is accessed from multiple processes. Only one process updates the data
+//!   structure, but other processes perform reads. That rules out using built-in Rust locking
+//!   primitives like Mutex and RwLock, and most crates too.
+//!
+//! - Within the one process with write-access, multiple threads can perform updates concurrently.
+//!   That rules out using PostgreSQL LWLocks for the locking.
+//!
+//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
+//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
+//!
+//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
+//!   locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
+//!   read / write the same page at the same time. (Prefetching can conflict with actual reads,
+//!   however.)
+//!
+//! - The keys in the integrated cache are 17 bytes long.
+//!
+//! ## Usage
+//!
+//! Because this is designed to be used as a Postgres shared memory data structure, initialization
+//! happens in three stages:
+//!
+//! 0. A fixed area of shared memory is allocated at postmaster startup.
+//!
+//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
+//!    other process or thread is running. It returns a TreeInitStruct, which is inherited by all
+//!    the processes through fork().
+//!
+//! 2. One process may have write-access to the struct, by calling
+//!    [TreeInitStruct::attach_writer]. (That process is the communicator process.)
+//!
+//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
+//!
+//! "Write access" means that you can insert / update / delete values in the tree.
+//!
+//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
+//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
+//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
+//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
+//! problem, the version check could be passed up to the caller, so that the caller could detect the
+//! lost updates and retry the operation.
+//!
+//! ## Implementation
+//!
+//! node_ptr.rs: Provides low-level implementations of the four different node types (eight actually,
+//! since there is an Internal and Leaf variant of each)
+//!
+//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
+//! node.
+//!
+//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
+//!   abstractions on top.
+//!
+//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
+//!
+//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
+//!   own abstraction for that because we need the data structure to live in a pre-allocated shared
+//!   memory segment).
+//!
+//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
+//!   immediately deallocated, but stays around for as long as concurrent readers might still have
+//!   pointers to it. This is enforced by an epoch system. This is similar to
+//!   e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
+//!   communicating over the shared memory segment.
+//!
+//! ## See also
+//!
+//! There are some existing Rust ART implementations out there, but none of them filled all
+//! the requirements:
+//!
+//! - https://github.com/XiangpengHao/congee
+//! - https://github.com/declanvk/blart
+//!
+//! ## TODO
+//!
+//! - Removing values has not been implemented
+
+mod algorithm;
+pub mod allocator;
+mod epoch;
+
+use algorithm::RootPtr;
+use algorithm::node_ptr::NodePtr;
+
+use std::collections::VecDeque;
+use std::fmt::Debug;
+use std::marker::PhantomData;
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use crate::epoch::EpochPin;
+
+#[cfg(test)]
+mod tests;
+
+use allocator::ArtAllocator;
+pub use allocator::ArtMultiSlabAllocator;
+pub use allocator::OutOfMemoryError;
+
+/// Fixed-length key type.
+///
+/// `as_bytes` is expected to return exactly `KEY_LEN` bytes; `TreeIterator::new` asserts this
+/// for the range bounds it is given. The iterator compares keys as byte strings.
+pub trait Key: Debug {
+    /// Length of every key of this type, in bytes
+    const KEY_LEN: usize;
+
+    /// The key's byte representation
+    fn as_bytes(&self) -> &[u8];
+}
+
+/// Values stored in the tree
+///
+/// This is a marker trait with no methods. It does not require `Clone`; only
+/// `TreeWriteGuard::remove_and_return` asks for `V: Clone`, because it hands back a copy of
+/// the removed value.
+pub trait Value {}
+
+/// Initial capacity of the garbage queue. It is only used to pre-size the queue in
+/// `GarbageQueue::new`; nothing in this file enforces it as an upper bound.
+const MAX_GARBAGE: usize = 1024;
+
+/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
+pub struct Tree<V: Value> {
+    /// For simplicity, so that we never need to grow or shrink the root, the root node is always an
+    /// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
+    /// indirection to every lookup)
+    root: RootPtr<V>,
+
+    /// Set by `TreeInitStruct::attach_writer`, which panics if it was already set.
+    writer_attached: AtomicBool,
+
+    /// Shared epoch state, used to decide when obsolete nodes can be recycled.
+    epoch: epoch::EpochShared,
+}
+
+// NOTE(review): these impls assert that sharing a Tree across threads is sound whenever V is;
+// presumably that rests on the node-level locking in the algorithm module - confirm.
+unsafe impl<V: Value + Sync> Sync for Tree<V> {}
+unsafe impl<V: Value + Send> Send for Tree<V> {}
+
+/// FIFO queue of nodes that have been replaced, each tagged with the epoch of the write that
+/// made it obsolete. New entries go in at the front and the oldest entry sits at the back.
+struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
+
+unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
+unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
+
+impl<V> GarbageQueue<V> {
+    /// Create an empty queue with room for `MAX_GARBAGE` entries.
+    fn new() -> GarbageQueue<V> {
+        let entries = VecDeque::with_capacity(MAX_GARBAGE);
+        GarbageQueue(entries)
+    }
+
+    /// Queue up `ptr`, which became obsolete at `epoch`.
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
+        let entry = (ptr, epoch);
+        self.0.push_front(entry);
+    }
+
+    /// Pop the oldest entry if its epoch is before `cutoff_epoch`. Only the oldest entry is
+    /// looked at; returns None if the queue is empty or the oldest entry is not old enough.
+    fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
+        let oldest_epoch = self.0.back()?.1;
+        if oldest_epoch < cutoff_epoch {
+            self.0.pop_back().map(|(ptr, _)| ptr)
+        } else {
+            None
+        }
+    }
+}
+
+/// Struct created at postmaster startup
+///
+/// It is turned into a `TreeWriteAccess` or a `TreeReadAccess` by calling `attach_writer` or
+/// `attach_reader`, both of which consume it.
+pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
+    /// The tree itself
+    tree: &'t Tree<V>,
+
+    /// The allocator that the tree was allocated from
+    allocator: &'t A,
+
+    phantom_key: PhantomData<K>,
+}
+
+/// The worker process has a reference to this. The write operations are only safe
+/// from the worker process
+///
+/// Use `start_write` or `start_read` to begin an operation on the tree.
+pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
+where
+    K: Key,
+    V: Value,
+{
+    /// The tree itself
+    tree: &'t Tree<V>,
+
+    /// Allocator used for tree nodes; obsolete nodes are also returned to it.
+    pub allocator: &'t A,
+
+    /// Handle for pinning epochs while an operation is in progress
+    epoch_handle: epoch::LocalHandle<'t>,
+
+    phantom_key: PhantomData<K>,
+
+    /// Obsolete nodes that cannot be recycled until their epoch expires.
+    garbage: spin::Mutex<GarbageQueue<V>>,
+}
+
+/// The backends have a reference to this. It cannot be used to modify the tree
+///
+/// Use `start_read` to begin a read operation.
+pub struct TreeReadAccess<'t, K: Key, V: Value>
+where
+    K: Key,
+    V: Value,
+{
+    /// The tree itself
+    tree: &'t Tree<V>,
+
+    /// Handle for pinning epochs while a read is in progress
+    epoch_handle: epoch::LocalHandle<'t>,
+
+    phantom_key: PhantomData<K>,
+}
+
+impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
+    /// Allocate and initialize an empty tree in memory obtained from `allocator`.
+    ///
+    /// # Panics
+    ///
+    /// Panics with "out of memory" if the tree struct or the root node cannot be allocated.
+    pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
+        let raw_tree = allocator.alloc_tree();
+        let tree_ptr = NonNull::new(raw_tree).expect("out of memory");
+
+        let root = algorithm::new_root(allocator).expect("out of memory");
+        let writer_attached = AtomicBool::new(false);
+        let epoch = epoch::EpochShared::new();
+        unsafe {
+            tree_ptr.write(Tree {
+                root,
+                writer_attached,
+                epoch,
+            })
+        };
+
+        // The tree has just been initialized above, so it is fine to hand out a reference.
+        let tree: &'t Tree<V> = unsafe { tree_ptr.as_ref() };
+        TreeInitStruct {
+            tree,
+            allocator,
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Attach to the tree as the writer.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a writer has already been attached to this tree.
+    pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
+        let TreeInitStruct {
+            tree, allocator, ..
+        } = self;
+
+        let already_attached = tree.writer_attached.swap(true, Ordering::Relaxed);
+        assert!(!already_attached, "writer already attached");
+
+        TreeWriteAccess {
+            tree,
+            allocator,
+            phantom_key: PhantomData,
+            epoch_handle: tree.epoch.register(),
+            garbage: spin::Mutex::new(GarbageQueue::new()),
+        }
+    }
+
+    /// Attach to the tree as a reader.
+    pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
+        let tree = self.tree;
+        let epoch_handle = tree.epoch.register();
+        TreeReadAccess {
+            tree,
+            phantom_key: PhantomData,
+            epoch_handle,
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
+    /// Begin a write operation. The returned guard keeps an epoch pinned until it is dropped
+    /// or consumed by one of its modifying methods.
+    pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
+    where
+        't: 'g,
+    {
+        let epoch_pin = self.epoch_handle.pin();
+        TreeWriteGuard {
+            tree_writer: self,
+            epoch_pin,
+            phantom_key: PhantomData,
+            created_garbage: false,
+        }
+    }
+
+    /// Begin a read operation. The returned guard keeps an epoch pinned until it is dropped.
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        let epoch_pin = self.epoch_handle.pin();
+        TreeReadGuard {
+            tree: self.tree,
+            epoch_pin,
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
+    /// Begin a read operation. The returned guard keeps an epoch pinned until it is dropped.
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        let epoch_pin = self.epoch_handle.pin();
+        TreeReadGuard {
+            tree: self.tree,
+            epoch_pin,
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+/// Guard for one read operation on the tree. It holds an epoch pin for as long as it lives.
+pub struct TreeReadGuard<'e, K, V>
+where
+    K: Key,
+    V: Value,
+{
+    /// The tree being read
+    tree: &'e Tree<V>,
+
+    /// Epoch pin held for the duration of the read
+    epoch_pin: EpochPin<'e>,
+    phantom_key: PhantomData<K>,
+}
+
+impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
+    /// Look up `key`. The returned reference is tied to the lifetime of this guard.
+    pub fn get(&'e self, key: &K) -> Option<&'e V> {
+        algorithm::search(key, self.tree.root, &self.epoch_pin)
+    }
+}
+
+/// Guard for one write operation on the tree. It holds an epoch pin for as long as it lives.
+pub struct TreeWriteGuard<'e, K, V, A>
+where
+    K: Key,
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// The writer this guard was created from
+    tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
+
+    /// Epoch pin held for the duration of the operation
+    epoch_pin: EpochPin<'e>,
+    phantom_key: PhantomData<K>,
+
+    /// Set when this operation has put a node on the garbage queue. `update_with_fn` runs
+    /// garbage collection afterwards if this is set.
+    created_garbage: bool,
+}
+
+/// What the callback passed to `TreeWriteGuard::update_with_fn` wants done with the key.
+pub enum UpdateAction<V> {
+    /// Leave the tree as it is
+    Nothing,
+    /// Store this value for the key
+    Insert(V),
+    /// Remove the key's value
+    Remove,
+}
+
+impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    /// Get a value
+    pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
+        algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
+    }
+
+    /// Insert a value
+    ///
+    /// Returns Ok(true) if the value was inserted, or Ok(false) if the key already had a
+    /// value, in which case the tree is left unchanged.
+    pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
+        let mut success = None;
+
+        self.update_with_fn(key, |existing| {
+            if existing.is_some() {
+                success = Some(false);
+                UpdateAction::Nothing
+            } else {
+                success = Some(true);
+                UpdateAction::Insert(value)
+            }
+        })?;
+        Ok(success.expect("value_fn not called"))
+    }
+
+    /// Remove value. Returns true if it existed
+    pub fn remove(self, key: &K) -> bool {
+        let mut result = false;
+        // FIXME: It's not clear if OOM is expected while removing. It seems
+        // not nice, but shrinking a node can OOM. Then again, we could opt
+        // to not shrink a node if we cannot allocate, to live a little longer.
+        self.update_with_fn(key, |existing| match existing {
+            Some(_) => {
+                result = true;
+                UpdateAction::Remove
+            }
+            None => UpdateAction::Nothing,
+        })
+        .expect("out of memory while removing");
+        result
+    }
+
+    /// Try to remove value and return the old value.
+    ///
+    /// Returns None, and leaves the tree unchanged, if the key had no value.
+    pub fn remove_and_return(self, key: &K) -> Option<V>
+    where
+        V: Clone,
+    {
+        let mut old = None;
+        self.update_with_fn(key, |existing| match existing {
+            Some(value) => {
+                old = Some(value.clone());
+                UpdateAction::Remove
+            }
+            // Nothing to remove. Ask for no change, the same way `remove` does, instead of
+            // requesting the removal of a key that is not there.
+            None => UpdateAction::Nothing,
+        })
+        .expect("out of memory while removing");
+        old
+    }
+
+    /// Update key using the given function. All the other modifying operations are based on this.
+    ///
+    /// The function is passed a reference to the existing value, if any, and returns the
+    /// action to take:
+    ///
+    /// - `UpdateAction::Nothing`: leave the tree as it is
+    /// - `UpdateAction::Insert(value)`: store `value` for the key
+    /// - `UpdateAction::Remove`: remove the key's value
+    ///
+    /// The actions are carried out by `algorithm::update_fn`. If the operation made any nodes
+    /// obsolete, garbage collection is run before returning. On error, the garbage is left
+    /// in the queue for a later operation to collect.
+    pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self)?;
+
+        if self.created_garbage {
+            let _ = self.collect_garbage();
+        }
+        Ok(())
+    }
+
+    /// Put `ptr` on the writer's garbage queue, tagged with the epoch of this guard's pin.
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
+        self.tree_writer
+            .garbage
+            .lock()
+            .remember_obsolete_node(ptr, self.epoch_pin.epoch);
+        self.created_garbage = true;
+    }
+
+    /// Advance and broadcast the global epoch, then deallocate the queued nodes whose epoch
+    /// is older than the oldest epoch still in use.
+    // returns number of nodes recycled
+    fn collect_garbage(&self) -> usize {
+        self.tree_writer.tree.epoch.advance();
+        self.tree_writer.tree.epoch.broadcast();
+
+        let cutoff_epoch = self.tree_writer.tree.epoch.get_oldest();
+
+        let mut result = 0;
+        let mut garbage_queue = self.tree_writer.garbage.lock();
+        while let Some(ptr) = garbage_queue.next_obsolete(cutoff_epoch) {
+            ptr.deallocate(self.tree_writer.allocator);
+            result += 1;
+        }
+        result
+    }
+}
+
+/// Cursor for scanning the tree in key order.
+///
+/// The iterator does not hold a read guard itself; one is passed to each call of `next`.
+pub struct TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    /// Set once a range scan has finished; `next` returns None from then on.
+    done: bool,
+    /// The key at which the next call to `next` starts searching
+    pub next_key: Vec<u8>,
+    /// Exclusive upper bound of a range scan, or None for a wrapping scan
+    max_key: Option<Vec<u8>>,
+
+    phantom_key: PhantomData<K>,
+}
+
+impl<K> TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    /// Create an iterator with no upper bound. It starts at the all-zeros key, and when it
+    /// runs off the end of the tree it starts over from the beginning.
+    pub fn new_wrapping() -> TreeIterator<K> {
+        let first_key = vec![0; K::KEY_LEN];
+        TreeIterator {
+            done: false,
+            next_key: first_key,
+            max_key: None,
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Create an iterator over the keys in `range`. The end of the range is exclusive.
+    ///
+    /// # Panics
+    ///
+    /// Panics if either bound is not exactly `K::KEY_LEN` bytes long.
+    pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
+        let first_key = range.start.as_bytes().to_vec();
+        let end_key = range.end.as_bytes().to_vec();
+        assert_eq!(first_key.len(), K::KEY_LEN);
+        assert_eq!(end_key.len(), K::KEY_LEN);
+
+        TreeIterator {
+            done: false,
+            next_key: first_key,
+            max_key: Some(end_key),
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Return the next key-value pair, or None when there is nothing more to return.
+    ///
+    /// A range iterator is finished for good once it reaches the end of its range. A wrapping
+    /// iterator restarts from the all-zeros key when it runs off the end of the tree, and
+    /// returns None only if it still finds nothing after that.
+    pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
+    where
+        V: Value,
+    {
+        if self.done {
+            return None;
+        }
+
+        let mut wrapped_around = false;
+        loop {
+            assert_eq!(self.next_key.len(), K::KEY_LEN);
+            let found =
+                algorithm::iter_next(&self.next_key, read_guard.tree.root, &read_guard.epoch_pin);
+
+            let Some((k, v)) = found else {
+                // Nothing at or after next_key.
+                if self.max_key.is_some() {
+                    self.done = true;
+                    return None;
+                }
+                if wrapped_around {
+                    // The tree is completely empty
+                    // FIXME: perhaps we should remember the starting point instead.
+                    // Currently this will scan some ranges twice.
+                    return None;
+                }
+                // Start from beginning
+                self.next_key.fill(0);
+                wrapped_around = true;
+                continue;
+            };
+            assert_eq!(k.len(), K::KEY_LEN);
+            assert_eq!(self.next_key.len(), K::KEY_LEN);
+
+            // Check if we reached the end of the range
+            let past_end = self
+                .max_key
+                .as_ref()
+                .is_some_and(|max_key| k.as_slice() >= max_key.as_slice());
+            if past_end {
+                self.done = true;
+                return None;
+            }
+
+            // The next search starts just after the key we are about to return.
+            self.next_key = k.clone();
+            increment_key(self.next_key.as_mut_slice());
+
+            let found_key: K = k.as_slice().into();
+            return Some((found_key, v));
+        }
+    }
+}
+
+/// Add one to `key`, treating it as a big-endian unsigned integer.
+///
+/// Returns true if the key overflowed and wrapped around to all zeros, which includes the
+/// case of an empty key.
+fn increment_key(key: &mut [u8]) -> bool {
+    // Start at the least significant byte. A byte that rolls over to zero carries into the
+    // next more significant one.
+    for byte in key.iter_mut().rev() {
+        *byte = byte.wrapping_add(1);
+        if *byte != 0 {
+            return false;
+        }
+    }
+    true
+}
+
+// Debugging functions
+impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    /// Write a dump of the whole tree to `dst`, for debugging.
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
+    /// Write a dump of the whole tree to `dst`, for debugging.
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
+    /// Gather statistics on the block allocator, the slabs, the epochs and the garbage queue.
+    ///
+    /// The individual figures are read one after another, so they are not a consistent
+    /// snapshot if the tree is being modified at the same time.
+    pub fn get_statistics(&self) -> ArtTreeStatistics {
+        ArtTreeStatistics {
+            blocks: self.allocator.inner.block_allocator.get_statistics(),
+            slabs: self.allocator.get_statistics(),
+            epoch: self.tree.epoch.get_current(),
+            oldest_epoch: self.tree.epoch.get_oldest(),
+            num_garbage: self.garbage.lock().0.len() as u64,
+        }
+    }
+}
+
+/// Statistics returned by `TreeWriteAccess::get_statistics`
+#[derive(Clone, Debug)]
+pub struct ArtTreeStatistics {
+    /// Statistics from the block allocator
+    pub blocks: allocator::block::BlockAllocatorStats,
+    /// Statistics from the slab allocator
+    pub slabs: allocator::ArtMultiSlabStats,
+
+    /// The current global epoch
+    pub epoch: u64,
+    /// The oldest epoch found in any participant slot
+    pub oldest_epoch: u64,
+    /// Number of obsolete nodes waiting in the garbage queue
+    pub num_garbage: u64,
+}
--- a/libs/neonart/src/tests.rs
+++ b/libs/neonart/src/tests.rs
@@ -0,0 +1,236 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::{Debug, Formatter};
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::ArtAllocator;
+use crate::ArtMultiSlabAllocator;
+use crate::TreeInitStruct;
+use crate::TreeIterator;
+use crate::TreeWriteAccess;
+use crate::UpdateAction;
+
+use crate::{Key, Value};
+
+use rand::Rng;
+use rand::seq::SliceRandom;
+use rand_distr::Zipf;
+
+const TEST_KEY_LEN: usize = 16;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+impl TestKey {
+    const MIN: TestKey = TestKey([0; TEST_KEY_LEN]);
+    const MAX: TestKey = TestKey([u8::MAX; TEST_KEY_LEN]);
+}
+
+impl Key for TestKey {
+    const KEY_LEN: usize = TEST_KEY_LEN;
+    fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+impl From<&TestKey> for u128 {
+    fn from(val: &TestKey) -> u128 {
+        u128::from_be_bytes(val.0)
+    }
+}
+
+impl From<u128> for TestKey {
+    fn from(val: u128) -> TestKey {
+        TestKey(val.to_be_bytes())
+    }
+}
+
+impl<'a> From<&'a [u8]> for TestKey {
+    fn from(bytes: &'a [u8]) -> TestKey {
+        TestKey(bytes.try_into().unwrap())
+    }
+}
+
+impl Value for usize {}
+
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+    // Build a fresh tree on top of `area`; each key is stored with its index in `keys`.
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+
+    let init_struct = TreeInitStruct::<TestKey, usize, _>::new(allocator);
+    let tree_writer = init_struct.attach_writer();
+    // Insert every key, calling `start_write()` anew for each insertion.
+    for (idx, k) in keys.iter().enumerate() {
+        let w = tree_writer.start_write();
+        let res = w.insert(&(*k).into(), idx);
+        assert!(res.is_ok());
+    }
+    // Read every key back and check that it maps to the index it was inserted with.
+    for (idx, k) in keys.iter().enumerate() {
+        let r = tree_writer.start_read();
+        let value = r.get(&(*k).into());
+        assert_eq!(value, Some(idx).as_ref());
+    }
+
+    eprintln!("stats: {:?}", tree_writer.get_statistics());
+}
+
+#[test]
+fn dense() {
+    // This exercises splitting a node with prefix
+    let keys: &[u128] = &[0, 1, 2, 3, 256];
+    test_inserts(keys);
+
+    // Dense keys
+    let mut keys: Vec<u128> = (0..10000).collect();
+    test_inserts(&keys);
+
+    // Do the same in random orders
+    for _ in 1..10 {
+        keys.shuffle(&mut rand::rng());
+        test_inserts(&keys);
+    }
+}
+
+#[test]
+fn sparse() {
+    // Sparse keys: 10000 distinct values drawn at random from the whole
+    // u128 key space.
+    let mut seen = HashSet::new();
+    let mut keys: Vec<TestKey> = Vec::new();
+
+    while keys.len() < 10000 {
+        let candidate = rand::random::<u128>();
+
+        // `insert` returns false when the value is already present, so a
+        // duplicate draw is simply retried on the next iteration.
+        if seen.insert(candidate) {
+            keys.push(candidate.into());
+        }
+    }
+    test_inserts(&keys);
+}
+
+struct TestValue(AtomicUsize);
+
+impl TestValue {
+    fn new(val: usize) -> TestValue {
+        TestValue(AtomicUsize::new(val))
+    }
+
+    fn load(&self) -> usize {
+        self.0.load(Ordering::Relaxed)
+    }
+}
+
+impl Value for TestValue {}
+
+impl Clone for TestValue {
+    fn clone(&self) -> TestValue {
+        TestValue::new(self.load())
+    }
+}
+
+impl Debug for TestValue {
+    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(fmt, "{:?}", self.load())
+    }
+}
+
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+fn apply_op<A: ArtAllocator<TestValue>>(
+    op: &TestOp,
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    eprintln!("applying op: {op:?}");
+
+    // apply the change to the shadow tree first, keeping the value it replaced (if any)
+    let shadow_existing = if let Some(v) = op.1 {
+        shadow.insert(op.0, v)
+    } else {
+        shadow.remove(&op.0)
+    };
+
+    // apply to Art tree
+    let w = tree.start_write();
+    w.update_with_fn(&op.0, |existing| {
+        assert_eq!(existing.map(TestValue::load), shadow_existing);
+        // Pick the action from (value currently in the tree, requested new value).
+        match (existing, op.1) {
+            (None, None) => UpdateAction::Nothing,
+            (None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
+            (Some(_old_val), None) => UpdateAction::Remove,
+            (Some(old_val), Some(new_val)) => {
+                old_val.0.store(new_val, Ordering::Relaxed);
+                UpdateAction::Nothing // the value was overwritten in place just above
+            }
+        }
+    })
+    .expect("out of memory");
+}
+
+fn test_iter<A: ArtAllocator<TestValue>>(
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &BTreeMap<TestKey, usize>,
+) {
+    let mut shadow_iter = shadow.iter();
+    let mut iter = TreeIterator::new(&(TestKey::MIN..TestKey::MAX));
+    // NOTE(review): if the range is half-open, a TestKey::MAX key would be skipped - confirm.
+    loop {
+        let shadow_item = shadow_iter.next().map(|(k, v)| (*k, *v));
+        let r = tree.start_read();
+        let item = iter.next(&r);
+        // Step both iterators together; each (key, value) pair must match the shadow map.
+        if shadow_item != item.map(|(k, v)| (k, v.load())) {
+            eprintln!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
+            tree.start_read().dump(&mut std::io::stderr());
+
+            eprintln!("SHADOW:");
+            for si in shadow {
+                eprintln!("key: {:?}, val: {}", si.0, si.1);
+            }
+            panic!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
+        }
+        if item.is_none() {
+            break; // the shadow iterator is exhausted too, since the items compared equal
+        }
+    }
+}
+
+#[test]
+fn random_ops() {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+
+    let init_struct = TreeInitStruct::<TestKey, TestValue, _>::new(allocator);
+    let tree_writer = init_struct.attach_writer();
+
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    // Keys are sampled from a Zipf distribution (exponent 1.1) and cast to u128.
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let mut key: TestKey = (rng.sample(distribution) as u128).into();
+        // With probability 0.10, set the low 32 bits of the key to all ones.
+        if rng.random_bool(0.10) {
+            key = TestKey::from(u128::from(&key) | 0xffffffff);
+        }
+        // With probability 0.75 the op sets the key to `i`; otherwise (`None`) it removes it.
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &tree_writer, &mut shadow);
+        // Every 1000 ops, print stats and compare a full iteration with the shadow map.
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+            eprintln!("stats: {:?}", tree_writer.get_statistics());
+            test_iter(&tree_writer, &shadow);
+        }
+    }
+}
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -17,7 +17,9 @@ anyhow.workspace = true
 bytes.workspace = true
 byteorder.workspace = true
 utils.workspace = true
-postgres_ffi.workspace = true
+postgres_ffi_types.workspace = true
+postgres_versioninfo.workspace = true
+posthog_client_lite.workspace = true
 enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
@@ -28,12 +30,13 @@ humantime-serde.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 itertools.workspace = true
 storage_broker.workspace = true
-camino = {workspace = true, features = ["serde1"]}
+camino = { workspace = true, features = ["serde1"] }
 remote_storage.workspace = true
 postgres_backend.workspace = true
-nix = {workspace = true, optional = true}
+nix = { workspace = true, optional = true }
 reqwest.workspace = true
 rand.workspace = true
+tracing.workspace = true
 tracing-utils.workspace = true
 once_cell.workspace = true

--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -4,6 +4,7 @@ use camino::Utf8PathBuf;
 mod tests;

 use const_format::formatcp;
+use posthog_client_lite::PostHogClientConfig;
 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
 pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
@@ -12,6 +13,7 @@ pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LI
 pub const DEFAULT_GRPC_LISTEN_PORT: u16 = 51051; // storage-broker already uses 50051

 use std::collections::HashMap;
+use std::fmt::Display;
 use std::num::{NonZeroU64, NonZeroUsize};
 use std::str::FromStr;
 use std::time::Duration;
@@ -24,16 +26,17 @@ use utils::logging::LogFormat;
 use crate::models::{ImageCompressionAlgorithm, LsnLease};

 // Certain metadata (e.g. externally-addressable name, AZ) is delivered
-// as a separate structure.  This information is not neeed by the pageserver
+// as a separate structure.  This information is not needed by the pageserver
 // itself, it is only used for registering the pageserver with the control
 // plane and/or storage controller.
-//
 #[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
 pub struct NodeMetadata {
    #[serde(rename = "host")]
    pub postgres_host: String,
    #[serde(rename = "port")]
    pub postgres_port: u16,
+    pub grpc_host: Option<String>,
+    pub grpc_port: Option<u16>,
    pub http_host: String,
    pub http_port: u16,
    pub https_port: Option<u16>,
@@ -44,19 +47,81 @@ pub struct NodeMetadata {
    pub other: HashMap<String, serde_json::Value>,
 }

-/// PostHog integration config.
+impl Display for NodeMetadata {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "postgresql://{}:{} ",
+            self.postgres_host, self.postgres_port
+        )?;
+        if let Some(grpc_host) = &self.grpc_host {
+            let grpc_port = self.grpc_port.unwrap_or_default();
+            write!(f, "grpc://{grpc_host}:{grpc_port} ")?;
+        }
+        write!(f, "http://{}:{} ", self.http_host, self.http_port)?;
+        write!(f, "other:{:?}", self.other)?;
+        Ok(())
+    }
+}
+
+/// PostHog integration config. This is used in pageserver, storcon, and neon_local.
+/// Ensure backward compatibility when adding new fields.
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct PostHogConfig {
    /// PostHog project ID
-    pub project_id: String,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub project_id: Option<String>,
    /// Server-side (private) API key
-    pub server_api_key: String,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub server_api_key: Option<String>,
    /// Client-side (public) API key
-    pub client_api_key: String,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub client_api_key: Option<String>,
    /// Private API URL
-    pub private_api_url: String,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub private_api_url: Option<String>,
    /// Public API URL
-    pub public_api_url: String,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub public_api_url: Option<String>,
+    /// Refresh interval for the feature flag spec.
+    /// The storcon will push the feature flag spec to the pageserver. If the pageserver does not receive
+    /// the spec for `refresh_interval`, it will fetch the spec from the PostHog API.
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    pub refresh_interval: Option<Duration>,
+}
+
+impl PostHogConfig {
+    pub fn try_into_posthog_config(self) -> Result<PostHogClientConfig, &'static str> {
+        let Some(project_id) = self.project_id else {
+            return Err("project_id is required");
+        };
+        let Some(server_api_key) = self.server_api_key else {
+            return Err("server_api_key is required");
+        };
+        let Some(client_api_key) = self.client_api_key else {
+            return Err("client_api_key is required");
+        };
+        let Some(private_api_url) = self.private_api_url else {
+            return Err("private_api_url is required");
+        };
+        let Some(public_api_url) = self.public_api_url else {
+            return Err("public_api_url is required");
+        };
+        Ok(PostHogClientConfig {
+            project_id,
+            server_api_key,
+            client_api_key,
+            private_api_url,
+            public_api_url,
+        })
+    }
 }

 /// `pageserver.toml`
@@ -337,17 +402,26 @@ pub struct TimelineImportConfig {
 pub struct BasebackupCacheConfig {
    #[serde(with = "humantime_serde")]
    pub cleanup_period: Duration,
-    // FIXME: Support max_size_bytes.
-    // pub max_size_bytes: usize,
-    pub max_size_entries: i64,
+    /// Maximum total size of basebackup cache entries on disk in bytes.
+    /// The cache may slightly exceed this limit because we do not know
+    /// the exact size of the cache entry until it's written to disk.
+    pub max_total_size_bytes: u64,
+    // TODO(diko): support max_entry_size_bytes.
+    // pub max_entry_size_bytes: u64,
+    pub max_size_entries: usize,
+    /// Size of the channel used to send prepare requests to the basebackup cache worker.
+    /// If exceeded, new prepare requests will be dropped.
+    pub prepare_channel_size: usize,
 }

 impl Default for BasebackupCacheConfig {
    fn default() -> Self {
        Self {
            cleanup_period: Duration::from_secs(60),
-            // max_size_bytes: 1024 * 1024 * 1024, // 1 GiB
-            max_size_entries: 1000,
+            max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB
+            // max_entry_size_bytes: 16 * 1024 * 1024,   // 16 MiB
+            max_size_entries: 10000,
+            prepare_channel_size: 100,
        }
    }
 }
@@ -792,7 +866,7 @@ pub mod tenant_conf_defaults {
    // By default ingest enough WAL for two new L0 layers before checking if new image
    // image layers should be created.
    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
-    pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
+    pub const DEFAULT_GC_COMPACTION_ENABLED: bool = true;
    pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
    pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
    pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
--- a/libs/pageserver_api/src/config/tests.rs
+++ b/libs/pageserver_api/src/config/tests.rs
@@ -14,6 +14,8 @@ fn test_node_metadata_v1_backward_compatibilty() {
        NodeMetadata {
            postgres_host: "localhost".to_string(),
            postgres_port: 23,
+            grpc_host: None,
+            grpc_port: None,
            http_host: "localhost".to_string(),
            http_port: 42,
            https_port: None,
@@ -37,6 +39,35 @@ fn test_node_metadata_v2_backward_compatibilty() {
        NodeMetadata {
            postgres_host: "localhost".to_string(),
            postgres_port: 23,
+            grpc_host: None,
+            grpc_port: None,
+            http_host: "localhost".to_string(),
+            http_port: 42,
+            https_port: Some(123),
+            other: HashMap::new(),
+        }
+    )
+}
+
+#[test]
+fn test_node_metadata_v3_backward_compatibilty() {
+    let v3 = serde_json::to_vec(&serde_json::json!({
+        "host": "localhost",
+        "port": 23,
+        "grpc_host": "localhost",
+        "grpc_port": 51,
+        "http_host": "localhost",
+        "http_port": 42,
+        "https_port": 123,
+    }));
+
+    assert_eq!(
+        serde_json::from_slice::<NodeMetadata>(&v3.unwrap()).unwrap(),
+        NodeMetadata {
+            postgres_host: "localhost".to_string(),
+            postgres_port: 23,
+            grpc_host: Some("localhost".to_string()),
+            grpc_port: Some(51),
            http_host: "localhost".to_string(),
            http_port: 42,
            https_port: Some(123),
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -52,6 +52,8 @@ pub struct NodeRegisterRequest {

    pub listen_pg_addr: String,
    pub listen_pg_port: u16,
+    pub listen_grpc_addr: Option<String>,
+    pub listen_grpc_port: Option<u16>,

    pub listen_http_addr: String,
    pub listen_http_port: u16,
@@ -101,6 +103,8 @@ pub struct TenantLocateResponseShard {

    pub listen_pg_addr: String,
    pub listen_pg_port: u16,
+    pub listen_grpc_addr: Option<String>,
+    pub listen_grpc_port: Option<u16>,

    pub listen_http_addr: String,
    pub listen_http_port: u16,
@@ -152,6 +156,8 @@ pub struct NodeDescribeResponse {

    pub listen_pg_addr: String,
    pub listen_pg_port: u16,
+    pub listen_grpc_addr: Option<String>,
+    pub listen_grpc_port: Option<u16>,
 }

 #[derive(Serialize, Deserialize, Debug)]
@@ -540,6 +546,11 @@ pub struct TimelineImportRequest {
    pub sk_set: Vec<NodeId>,
 }

+#[derive(serde::Serialize, serde::Deserialize, Clone)]
+pub struct TimelineSafekeeperMigrateRequest {
+    pub new_sk_set: Vec<NodeId>,
+}
+
 #[cfg(test)]
 mod test {
    use serde_json;
@@ -571,8 +582,7 @@ mod test {
        let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
        assert!(
            err.to_string().contains("unknown field `unknown_field`"),
-            "expect unknown field `unknown_field` error, got: {}",
-            err
+            "expect unknown field `unknown_field` error, got: {err}"
        );
    }

--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -4,8 +4,8 @@ use std::ops::Range;
 use anyhow::{Result, bail};
 use byteorder::{BE, ByteOrder};
 use bytes::Bytes;
-use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi::{Oid, RepOriginId};
+use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi_types::{Oid, RepOriginId};
 use serde::{Deserialize, Serialize};
 use utils::const_assert;

@@ -194,7 +194,7 @@ impl Key {
    /// will be rejected on the write path.
    #[allow(dead_code)]
    pub fn is_valid_key_on_write_path_strong(&self) -> bool {
-        use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
+        use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
        if !self.is_i128_representable() {
            return false;
        }
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -1,7 +1,6 @@
 use std::ops::Range;

 use itertools::Itertools;
-use postgres_ffi::BLCKSZ;

 use crate::key::Key;
 use crate::shard::{ShardCount, ShardIdentity};
@@ -269,9 +268,13 @@ impl KeySpace {
    /// Partition a key space into roughly chunks of roughly 'target_size' bytes
    /// in each partition.
    ///
-    pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning {
-        // Assume that each value is 8k in size.
-        let target_nblocks = (target_size / BLCKSZ as u64) as u32;
+    pub fn partition(
+        &self,
+        shard_identity: &ShardIdentity,
+        target_size: u64,
+        block_size: u64,
+    ) -> KeyPartitioning {
+        let target_nblocks = (target_size / block_size) as u32;

        let mut parts = Vec::new();
        let mut current_part = Vec::new();
@@ -331,8 +334,7 @@ impl KeySpace {
                    std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end);
                assert!(
                    !overlap,
-                    "Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}",
-                    prev, range
+                    "Attempt to merge ovelapping keyspaces: {prev:?} overlaps {range:?}"
                );
            }

@@ -1101,7 +1103,7 @@ mod tests {
            // total range contains at least one shard-local page
            let all_nonzero = fragments.iter().all(|f| f.0 > 0);
            if !all_nonzero {
-                eprintln!("Found a zero-length fragment: {:?}", fragments);
+                eprintln!("Found a zero-length fragment: {fragments:?}");
            }
            assert!(all_nonzero);
        } else {
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -5,11 +5,10 @@ pub mod controller_api;
 pub mod key;
 pub mod keyspace;
 pub mod models;
-pub mod record;
+pub mod pagestream_api;
 pub mod reltag;
 pub mod shard;
 /// Public API types
 pub mod upcall_api;
-pub mod value;

 pub mod config;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -5,16 +5,13 @@ pub mod utilization;
 use core::ops::Range;
 use std::collections::HashMap;
 use std::fmt::Display;
-use std::io::{BufRead, Read};
 use std::num::{NonZeroU32, NonZeroU64, NonZeroUsize};
 use std::str::FromStr;
 use std::time::{Duration, SystemTime};

-use byteorder::{BigEndian, ReadBytesExt};
-use bytes::{Buf, BufMut, Bytes, BytesMut};
 #[cfg(feature = "testing")]
 use camino::Utf8PathBuf;
-use postgres_ffi::BLCKSZ;
+use postgres_versioninfo::PgMajorVersion;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use serde_with::serde_as;
 pub use utilization::PageserverUtilization;
@@ -24,8 +21,9 @@ use utils::{completion, serde_system_time};

 use crate::config::Ratio;
 use crate::key::{CompactKey, Key};
-use crate::reltag::RelTag;
-use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
+use crate::shard::{
+    DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardStripeSize, TenantShardId,
+};

 /// The state of a tenant in this pageserver.
 ///
@@ -403,7 +401,7 @@ pub enum TimelineCreateRequestMode {
        // inherits the ancestor's pg_version. Earlier code wasn't
        // using a flattened enum, so, it was an accepted field, and
        // we continue to accept it by having it here.
-        pg_version: Option<u32>,
+        pg_version: Option<PgMajorVersion>,
        #[serde(default, skip_serializing_if = "std::ops::Not::not")]
        read_only: bool,
    },
@@ -415,7 +413,7 @@ pub enum TimelineCreateRequestMode {
    Bootstrap {
        #[serde(default)]
        existing_initdb_timeline_id: Option<TimelineId>,
-        pg_version: Option<u32>,
+        pg_version: Option<PgMajorVersion>,
    },
 }

@@ -479,7 +477,7 @@ pub struct TenantShardSplitResponse {
 }

 /// Parameters that apply to all shards in a tenant.  Used during tenant creation.
-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Clone, Copy, Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct ShardParameters {
    pub count: ShardCount,
@@ -501,6 +499,15 @@ impl Default for ShardParameters {
    }
 }

+impl From<ShardIdentity> for ShardParameters {
+    fn from(identity: ShardIdentity) -> Self {
+        Self {
+            count: identity.count,
+            stripe_size: identity.stripe_size,
+        }
+    }
+}
+
 #[derive(Debug, Default, Clone, Eq, PartialEq)]
 pub enum FieldPatch<T> {
    Upsert(T),
@@ -1187,7 +1194,7 @@ impl Display for ImageCompressionAlgorithm {
            ImageCompressionAlgorithm::Disabled => write!(f, "disabled"),
            ImageCompressionAlgorithm::Zstd { level } => {
                if let Some(level) = level {
-                    write!(f, "zstd({})", level)
+                    write!(f, "zstd({level})")
                } else {
                    write!(f, "zstd")
                }
@@ -1578,7 +1585,7 @@ pub struct TimelineInfo {
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
-    pub pg_version: u32,
+    pub pg_version: PgMajorVersion,

    pub state: TimelineState,

@@ -1907,219 +1914,6 @@ pub struct ScanDisposableKeysResponse {
    pub not_disposable_count: usize,
 }

-// Wrapped in libpq CopyData
-#[derive(PartialEq, Eq, Debug)]
-pub enum PagestreamFeMessage {
-    Exists(PagestreamExistsRequest),
-    Nblocks(PagestreamNblocksRequest),
-    GetPage(PagestreamGetPageRequest),
-    DbSize(PagestreamDbSizeRequest),
-    GetSlruSegment(PagestreamGetSlruSegmentRequest),
-    #[cfg(feature = "testing")]
-    Test(PagestreamTestRequest),
-}
-
-// Wrapped in libpq CopyData
-#[derive(Debug, strum_macros::EnumProperty)]
-pub enum PagestreamBeMessage {
-    Exists(PagestreamExistsResponse),
-    Nblocks(PagestreamNblocksResponse),
-    GetPage(PagestreamGetPageResponse),
-    Error(PagestreamErrorResponse),
-    DbSize(PagestreamDbSizeResponse),
-    GetSlruSegment(PagestreamGetSlruSegmentResponse),
-    #[cfg(feature = "testing")]
-    Test(PagestreamTestResponse),
-}
-
-// Keep in sync with `pagestore_client.h`
-#[repr(u8)]
-enum PagestreamFeMessageTag {
-    Exists = 0,
-    Nblocks = 1,
-    GetPage = 2,
-    DbSize = 3,
-    GetSlruSegment = 4,
-    /* future tags above this line */
-    /// For testing purposes, not available in production.
-    #[cfg(feature = "testing")]
-    Test = 99,
-}
-
-// Keep in sync with `pagestore_client.h`
-#[repr(u8)]
-enum PagestreamBeMessageTag {
-    Exists = 100,
-    Nblocks = 101,
-    GetPage = 102,
-    Error = 103,
-    DbSize = 104,
-    GetSlruSegment = 105,
-    /* future tags above this line */
-    /// For testing purposes, not available in production.
-    #[cfg(feature = "testing")]
-    Test = 199,
-}
-
-impl TryFrom<u8> for PagestreamFeMessageTag {
-    type Error = u8;
-    fn try_from(value: u8) -> Result<Self, u8> {
-        match value {
-            0 => Ok(PagestreamFeMessageTag::Exists),
-            1 => Ok(PagestreamFeMessageTag::Nblocks),
-            2 => Ok(PagestreamFeMessageTag::GetPage),
-            3 => Ok(PagestreamFeMessageTag::DbSize),
-            4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
-            #[cfg(feature = "testing")]
-            99 => Ok(PagestreamFeMessageTag::Test),
-            _ => Err(value),
-        }
-    }
-}
-
-impl TryFrom<u8> for PagestreamBeMessageTag {
-    type Error = u8;
-    fn try_from(value: u8) -> Result<Self, u8> {
-        match value {
-            100 => Ok(PagestreamBeMessageTag::Exists),
-            101 => Ok(PagestreamBeMessageTag::Nblocks),
-            102 => Ok(PagestreamBeMessageTag::GetPage),
-            103 => Ok(PagestreamBeMessageTag::Error),
-            104 => Ok(PagestreamBeMessageTag::DbSize),
-            105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
-            #[cfg(feature = "testing")]
-            199 => Ok(PagestreamBeMessageTag::Test),
-            _ => Err(value),
-        }
-    }
-}
-
-// A GetPage request contains two LSN values:
-//
-// request_lsn: Get the page version at this point in time.  Lsn::Max is a special value that means
-// "get the latest version present". It's used by the primary server, which knows that no one else
-// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
-// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
-//
-// not_modified_since: Hint to the pageserver that the client knows that the page has not been
-// modified between 'not_modified_since' and the request LSN. It's always correct to set
-// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
-// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
-// request without waiting for 'request_lsn' to arrive.
-//
-// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
-// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
-// 'latest' was set to true. The V2 interface was added because there was no correct way for a
-// standby to request a page at a particular non-latest LSN, and also include the
-// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
-// request, if the standby knows that the page hasn't been modified since, and risk getting an error
-// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
-// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
-// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
-// difference in the responses between V1 and V2.
-//
-// V3 version of protocol adds request ID to all requests. This request ID is also included in response
-// as well as other fields from requests, which allows to verify that we receive response for our request.
-// We copy fields from request to response to make checking more reliable: request ID is formed from process ID
-// and local counter, so in principle there can be duplicated requests IDs if process PID is reused.
-//
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub enum PagestreamProtocolVersion {
-    V2,
-    V3,
-}
-
-pub type RequestId = u64;
-
-#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamRequest {
-    pub reqid: RequestId,
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
-}
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamExistsRequest {
-    pub hdr: PagestreamRequest,
-    pub rel: RelTag,
-}
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamNblocksRequest {
-    pub hdr: PagestreamRequest,
-    pub rel: RelTag,
-}
-
-#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamGetPageRequest {
-    pub hdr: PagestreamRequest,
-    pub rel: RelTag,
-    pub blkno: u32,
-}
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamDbSizeRequest {
-    pub hdr: PagestreamRequest,
-    pub dbnode: u32,
-}
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamGetSlruSegmentRequest {
-    pub hdr: PagestreamRequest,
-    pub kind: u8,
-    pub segno: u32,
-}
-
-#[derive(Debug)]
-pub struct PagestreamExistsResponse {
-    pub req: PagestreamExistsRequest,
-    pub exists: bool,
-}
-
-#[derive(Debug)]
-pub struct PagestreamNblocksResponse {
-    pub req: PagestreamNblocksRequest,
-    pub n_blocks: u32,
-}
-
-#[derive(Debug)]
-pub struct PagestreamGetPageResponse {
-    pub req: PagestreamGetPageRequest,
-    pub page: Bytes,
-}
-
-#[derive(Debug)]
-pub struct PagestreamGetSlruSegmentResponse {
-    pub req: PagestreamGetSlruSegmentRequest,
-    pub segment: Bytes,
-}
-
-#[derive(Debug)]
-pub struct PagestreamErrorResponse {
-    pub req: PagestreamRequest,
-    pub message: String,
-}
-
-#[derive(Debug)]
-pub struct PagestreamDbSizeResponse {
-    pub req: PagestreamDbSizeRequest,
-    pub db_size: i64,
-}
-
-#[cfg(feature = "testing")]
-#[derive(Debug, PartialEq, Eq, Clone)]
-pub struct PagestreamTestRequest {
-    pub hdr: PagestreamRequest,
-    pub batch_key: u64,
-    pub message: String,
-}
-
-#[cfg(feature = "testing")]
-#[derive(Debug)]
-pub struct PagestreamTestResponse {
-    pub req: PagestreamTestRequest,
-}
-
 // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
 // that require pageserver-internal types.  It is sufficient to get the total size.
 #[derive(Serialize, Deserialize, Debug)]
@@ -2131,506 +1925,6 @@ pub struct TenantHistorySize {
    pub size: Option<u64>,
 }

-impl PagestreamFeMessage {
-    /// Serialize a compute -> pageserver message. This is currently only used in testing
-    /// tools. Always uses protocol version 3.
-    pub fn serialize(&self) -> Bytes {
-        let mut bytes = BytesMut::new();
-
-        match self {
-            Self::Exists(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::Exists as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u32(req.rel.spcnode);
-                bytes.put_u32(req.rel.dbnode);
-                bytes.put_u32(req.rel.relnode);
-                bytes.put_u8(req.rel.forknum);
-            }
-
-            Self::Nblocks(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u32(req.rel.spcnode);
-                bytes.put_u32(req.rel.dbnode);
-                bytes.put_u32(req.rel.relnode);
-                bytes.put_u8(req.rel.forknum);
-            }
-
-            Self::GetPage(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::GetPage as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u32(req.rel.spcnode);
-                bytes.put_u32(req.rel.dbnode);
-                bytes.put_u32(req.rel.relnode);
-                bytes.put_u8(req.rel.forknum);
-                bytes.put_u32(req.blkno);
-            }
-
-            Self::DbSize(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::DbSize as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u32(req.dbnode);
-            }
-
-            Self::GetSlruSegment(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u8(req.kind);
-                bytes.put_u32(req.segno);
-            }
-            #[cfg(feature = "testing")]
-            Self::Test(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::Test as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u64(req.batch_key);
-                let message = req.message.as_bytes();
-                bytes.put_u64(message.len() as u64);
-                bytes.put_slice(message);
-            }
-        }
-
-        bytes.into()
-    }
-
-    pub fn parse<R: std::io::Read>(
-        body: &mut R,
-        protocol_version: PagestreamProtocolVersion,
-    ) -> anyhow::Result<PagestreamFeMessage> {
-        // these correspond to the NeonMessageTag enum in pagestore_client.h
-        //
-        // TODO: consider using protobuf or serde bincode for less error prone
-        // serialization.
-        let msg_tag = body.read_u8()?;
-        let (reqid, request_lsn, not_modified_since) = match protocol_version {
-            PagestreamProtocolVersion::V2 => (
-                0,
-                Lsn::from(body.read_u64::<BigEndian>()?),
-                Lsn::from(body.read_u64::<BigEndian>()?),
-            ),
-            PagestreamProtocolVersion::V3 => (
-                body.read_u64::<BigEndian>()?,
-                Lsn::from(body.read_u64::<BigEndian>()?),
-                Lsn::from(body.read_u64::<BigEndian>()?),
-            ),
-        };
-
-        match PagestreamFeMessageTag::try_from(msg_tag)
-            .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?
-        {
-            PagestreamFeMessageTag::Exists => {
-                Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                    hdr: PagestreamRequest {
-                        reqid,
-                        request_lsn,
-                        not_modified_since,
-                    },
-                    rel: RelTag {
-                        spcnode: body.read_u32::<BigEndian>()?,
-                        dbnode: body.read_u32::<BigEndian>()?,
-                        relnode: body.read_u32::<BigEndian>()?,
-                        forknum: body.read_u8()?,
-                    },
-                }))
-            }
-            PagestreamFeMessageTag::Nblocks => {
-                Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                    hdr: PagestreamRequest {
-                        reqid,
-                        request_lsn,
-                        not_modified_since,
-                    },
-                    rel: RelTag {
-                        spcnode: body.read_u32::<BigEndian>()?,
-                        dbnode: body.read_u32::<BigEndian>()?,
-                        relnode: body.read_u32::<BigEndian>()?,
-                        forknum: body.read_u8()?,
-                    },
-                }))
-            }
-            PagestreamFeMessageTag::GetPage => {
-                Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                    hdr: PagestreamRequest {
-                        reqid,
-                        request_lsn,
-                        not_modified_since,
-                    },
-                    rel: RelTag {
-                        spcnode: body.read_u32::<BigEndian>()?,
-                        dbnode: body.read_u32::<BigEndian>()?,
-                        relnode: body.read_u32::<BigEndian>()?,
-                        forknum: body.read_u8()?,
-                    },
-                    blkno: body.read_u32::<BigEndian>()?,
-                }))
-            }
-            PagestreamFeMessageTag::DbSize => {
-                Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                    hdr: PagestreamRequest {
-                        reqid,
-                        request_lsn,
-                        not_modified_since,
-                    },
-                    dbnode: body.read_u32::<BigEndian>()?,
-                }))
-            }
-            PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment(
-                PagestreamGetSlruSegmentRequest {
-                    hdr: PagestreamRequest {
-                        reqid,
-                        request_lsn,
-                        not_modified_since,
-                    },
-                    kind: body.read_u8()?,
-                    segno: body.read_u32::<BigEndian>()?,
-                },
-            )),
-            #[cfg(feature = "testing")]
-            PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
-                hdr: PagestreamRequest {
-                    reqid,
-                    request_lsn,
-                    not_modified_since,
-                },
-                batch_key: body.read_u64::<BigEndian>()?,
-                message: {
-                    let len = body.read_u64::<BigEndian>()?;
-                    let mut buf = vec![0; len as usize];
-                    body.read_exact(&mut buf)?;
-                    String::from_utf8(buf)?
-                },
-            })),
-        }
-    }
-}
-
-impl PagestreamBeMessage {
-    pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes {
-        let mut bytes = BytesMut::new();
-
-        use PagestreamBeMessageTag as Tag;
-        match protocol_version {
-            PagestreamProtocolVersion::V2 => {
-                match self {
-                    Self::Exists(resp) => {
-                        bytes.put_u8(Tag::Exists as u8);
-                        bytes.put_u8(resp.exists as u8);
-                    }
-
-                    Self::Nblocks(resp) => {
-                        bytes.put_u8(Tag::Nblocks as u8);
-                        bytes.put_u32(resp.n_blocks);
-                    }
-
-                    Self::GetPage(resp) => {
-                        bytes.put_u8(Tag::GetPage as u8);
-                        bytes.put(&resp.page[..])
-                    }
-
-                    Self::Error(resp) => {
-                        bytes.put_u8(Tag::Error as u8);
-                        bytes.put(resp.message.as_bytes());
-                        bytes.put_u8(0); // null terminator
-                    }
-                    Self::DbSize(resp) => {
-                        bytes.put_u8(Tag::DbSize as u8);
-                        bytes.put_i64(resp.db_size);
-                    }
-
-                    Self::GetSlruSegment(resp) => {
-                        bytes.put_u8(Tag::GetSlruSegment as u8);
-                        bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
-                        bytes.put(&resp.segment[..]);
-                    }
-
-                    #[cfg(feature = "testing")]
-                    Self::Test(resp) => {
-                        bytes.put_u8(Tag::Test as u8);
-                        bytes.put_u64(resp.req.batch_key);
-                        let message = resp.req.message.as_bytes();
-                        bytes.put_u64(message.len() as u64);
-                        bytes.put_slice(message);
-                    }
-                }
-            }
-            PagestreamProtocolVersion::V3 => {
-                match self {
-                    Self::Exists(resp) => {
-                        bytes.put_u8(Tag::Exists as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.rel.spcnode);
-                        bytes.put_u32(resp.req.rel.dbnode);
-                        bytes.put_u32(resp.req.rel.relnode);
-                        bytes.put_u8(resp.req.rel.forknum);
-                        bytes.put_u8(resp.exists as u8);
-                    }
-
-                    Self::Nblocks(resp) => {
-                        bytes.put_u8(Tag::Nblocks as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.rel.spcnode);
-                        bytes.put_u32(resp.req.rel.dbnode);
-                        bytes.put_u32(resp.req.rel.relnode);
-                        bytes.put_u8(resp.req.rel.forknum);
-                        bytes.put_u32(resp.n_blocks);
-                    }
-
-                    Self::GetPage(resp) => {
-                        bytes.put_u8(Tag::GetPage as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.rel.spcnode);
-                        bytes.put_u32(resp.req.rel.dbnode);
-                        bytes.put_u32(resp.req.rel.relnode);
-                        bytes.put_u8(resp.req.rel.forknum);
-                        bytes.put_u32(resp.req.blkno);
-                        bytes.put(&resp.page[..])
-                    }
-
-                    Self::Error(resp) => {
-                        bytes.put_u8(Tag::Error as u8);
-                        bytes.put_u64(resp.req.reqid);
-                        bytes.put_u64(resp.req.request_lsn.0);
-                        bytes.put_u64(resp.req.not_modified_since.0);
-                        bytes.put(resp.message.as_bytes());
-                        bytes.put_u8(0); // null terminator
-                    }
-                    Self::DbSize(resp) => {
-                        bytes.put_u8(Tag::DbSize as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.dbnode);
-                        bytes.put_i64(resp.db_size);
-                    }
-
-                    Self::GetSlruSegment(resp) => {
-                        bytes.put_u8(Tag::GetSlruSegment as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u8(resp.req.kind);
-                        bytes.put_u32(resp.req.segno);
-                        bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
-                        bytes.put(&resp.segment[..]);
-                    }
-
-                    #[cfg(feature = "testing")]
-                    Self::Test(resp) => {
-                        bytes.put_u8(Tag::Test as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u64(resp.req.batch_key);
-                        let message = resp.req.message.as_bytes();
-                        bytes.put_u64(message.len() as u64);
-                        bytes.put_slice(message);
-                    }
-                }
-            }
-        }
-        bytes.into()
-    }
-
-    pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
-        let mut buf = buf.reader();
-        let msg_tag = buf.read_u8()?;
-
-        use PagestreamBeMessageTag as Tag;
-        let ok =
-            match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
-                Tag::Exists => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let rel = RelTag {
-                        spcnode: buf.read_u32::<BigEndian>()?,
-                        dbnode: buf.read_u32::<BigEndian>()?,
-                        relnode: buf.read_u32::<BigEndian>()?,
-                        forknum: buf.read_u8()?,
-                    };
-                    let exists = buf.read_u8()? != 0;
-                    Self::Exists(PagestreamExistsResponse {
-                        req: PagestreamExistsRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            rel,
-                        },
-                        exists,
-                    })
-                }
-                Tag::Nblocks => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let rel = RelTag {
-                        spcnode: buf.read_u32::<BigEndian>()?,
-                        dbnode: buf.read_u32::<BigEndian>()?,
-                        relnode: buf.read_u32::<BigEndian>()?,
-                        forknum: buf.read_u8()?,
-                    };
-                    let n_blocks = buf.read_u32::<BigEndian>()?;
-                    Self::Nblocks(PagestreamNblocksResponse {
-                        req: PagestreamNblocksRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            rel,
-                        },
-                        n_blocks,
-                    })
-                }
-                Tag::GetPage => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let rel = RelTag {
-                        spcnode: buf.read_u32::<BigEndian>()?,
-                        dbnode: buf.read_u32::<BigEndian>()?,
-                        relnode: buf.read_u32::<BigEndian>()?,
-                        forknum: buf.read_u8()?,
-                    };
-                    let blkno = buf.read_u32::<BigEndian>()?;
-                    let mut page = vec![0; 8192]; // TODO: use MaybeUninit
-                    buf.read_exact(&mut page)?;
-                    Self::GetPage(PagestreamGetPageResponse {
-                        req: PagestreamGetPageRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            rel,
-                            blkno,
-                        },
-                        page: page.into(),
-                    })
-                }
-                Tag::Error => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let mut msg = Vec::new();
-                    buf.read_until(0, &mut msg)?;
-                    let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
-                    let rust_str = cstring.to_str()?;
-                    Self::Error(PagestreamErrorResponse {
-                        req: PagestreamRequest {
-                            reqid,
-                            request_lsn,
-                            not_modified_since,
-                        },
-                        message: rust_str.to_owned(),
-                    })
-                }
-                Tag::DbSize => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let dbnode = buf.read_u32::<BigEndian>()?;
-                    let db_size = buf.read_i64::<BigEndian>()?;
-                    Self::DbSize(PagestreamDbSizeResponse {
-                        req: PagestreamDbSizeRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            dbnode,
-                        },
-                        db_size,
-                    })
-                }
-                Tag::GetSlruSegment => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let kind = buf.read_u8()?;
-                    let segno = buf.read_u32::<BigEndian>()?;
-                    let n_blocks = buf.read_u32::<BigEndian>()?;
-                    let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
-                    buf.read_exact(&mut segment)?;
-                    Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
-                        req: PagestreamGetSlruSegmentRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            kind,
-                            segno,
-                        },
-                        segment: segment.into(),
-                    })
-                }
-                #[cfg(feature = "testing")]
-                Tag::Test => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let batch_key = buf.read_u64::<BigEndian>()?;
-                    let len = buf.read_u64::<BigEndian>()?;
-                    let mut msg = vec![0; len as usize];
-                    buf.read_exact(&mut msg)?;
-                    let message = String::from_utf8(msg)?;
-                    Self::Test(PagestreamTestResponse {
-                        req: PagestreamTestRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            batch_key,
-                            message,
-                        },
-                    })
-                }
-            };
-        let remaining = buf.into_inner();
-        if !remaining.is_empty() {
-            anyhow::bail!(
-                "remaining bytes in msg with tag={msg_tag}: {}",
-                remaining.len()
-            );
-        }
-        Ok(ok)
-    }
-
-    pub fn kind(&self) -> &'static str {
-        match self {
-            Self::Exists(_) => "Exists",
-            Self::Nblocks(_) => "Nblocks",
-            Self::GetPage(_) => "GetPage",
-            Self::Error(_) => "Error",
-            Self::DbSize(_) => "DbSize",
-            Self::GetSlruSegment(_) => "GetSlruSegment",
-            #[cfg(feature = "testing")]
-            Self::Test(_) => "Test",
-        }
-    }
-}
-
 #[derive(Debug, Serialize, Deserialize)]
 pub struct PageTraceEvent {
    pub key: CompactKey,
@@ -2656,68 +1950,6 @@ mod tests {

    use super::*;

-    #[test]
-    fn test_pagestream() {
-        // Test serialization/deserialization of PagestreamFeMessage
-        let messages = vec![
-            PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(3),
-                },
-                rel: RelTag {
-                    forknum: 1,
-                    spcnode: 2,
-                    dbnode: 3,
-                    relnode: 4,
-                },
-            }),
-            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(4),
-                },
-                rel: RelTag {
-                    forknum: 1,
-                    spcnode: 2,
-                    dbnode: 3,
-                    relnode: 4,
-                },
-            }),
-            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(3),
-                },
-                rel: RelTag {
-                    forknum: 1,
-                    spcnode: 2,
-                    dbnode: 3,
-                    relnode: 4,
-                },
-                blkno: 7,
-            }),
-            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(3),
-                },
-                dbnode: 7,
-            }),
-        ];
-        for msg in messages {
-            let bytes = msg.serialize();
-            let reconstructed =
-                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3)
-                    .unwrap();
-            assert!(msg == reconstructed);
-        }
-    }
-
    #[test]
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
@@ -2791,8 +2023,7 @@ mod tests {
        let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
        assert!(
            err.to_string().contains("unknown field `unknown_field`"),
-            "expect unknown field `unknown_field` error, got: {}",
-            err
+            "expect unknown field `unknown_field` error, got: {err}"
        );
    }

--- a/libs/pageserver_api/src/pagestream_api.rs
+++ b/libs/pageserver_api/src/pagestream_api.rs
@@ -0,0 +1,798 @@
+//! Rust definitions of the libpq-based pagestream API
+//!
+//! See also the C implementation of the same API in pgxn/neon/pagestore_client.h
+
+use std::io::{BufRead, Read};
+
+use crate::reltag::RelTag;
+
+use byteorder::{BigEndian, ReadBytesExt};
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use utils::lsn::Lsn;
+
+/// Block size.
+///
+/// XXX: We assume 8k block size in the SLRU fetch API. It's not great to hardcode
+/// that in the protocol, because Postgres supports different block sizes as a compile
+/// time option.
+const BLCKSZ: usize = 8192;
+
+// Wrapped in libpq CopyData
+#[derive(PartialEq, Eq, Debug)]
+pub enum PagestreamFeMessage {
+    Exists(PagestreamExistsRequest),
+    Nblocks(PagestreamNblocksRequest),
+    GetPage(PagestreamGetPageRequest),
+    DbSize(PagestreamDbSizeRequest),
+    GetSlruSegment(PagestreamGetSlruSegmentRequest),
+    #[cfg(feature = "testing")]
+    Test(PagestreamTestRequest),
+}
+
+// Wrapped in libpq CopyData
+#[derive(Debug, strum_macros::EnumProperty)]
+pub enum PagestreamBeMessage {
+    Exists(PagestreamExistsResponse),
+    Nblocks(PagestreamNblocksResponse),
+    GetPage(PagestreamGetPageResponse),
+    Error(PagestreamErrorResponse),
+    DbSize(PagestreamDbSizeResponse),
+    GetSlruSegment(PagestreamGetSlruSegmentResponse),
+    #[cfg(feature = "testing")]
+    Test(PagestreamTestResponse),
+}
+
+// Keep in sync with `pagestore_client.h`
+#[repr(u8)]
+enum PagestreamFeMessageTag {
+    Exists = 0,
+    Nblocks = 1,
+    GetPage = 2,
+    DbSize = 3,
+    GetSlruSegment = 4,
+    /* future tags above this line */
+    /// For testing purposes, not available in production.
+    #[cfg(feature = "testing")]
+    Test = 99,
+}
+
+// Keep in sync with `pagestore_client.h`
+#[repr(u8)]
+enum PagestreamBeMessageTag {
+    Exists = 100,
+    Nblocks = 101,
+    GetPage = 102,
+    Error = 103,
+    DbSize = 104,
+    GetSlruSegment = 105,
+    /* future tags above this line */
+    /// For testing purposes, not available in production.
+    #[cfg(feature = "testing")]
+    Test = 199,
+}
+
+impl TryFrom<u8> for PagestreamFeMessageTag {
+    type Error = u8;
+    fn try_from(value: u8) -> Result<Self, u8> {
+        match value {
+            0 => Ok(PagestreamFeMessageTag::Exists),
+            1 => Ok(PagestreamFeMessageTag::Nblocks),
+            2 => Ok(PagestreamFeMessageTag::GetPage),
+            3 => Ok(PagestreamFeMessageTag::DbSize),
+            4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
+            #[cfg(feature = "testing")]
+            99 => Ok(PagestreamFeMessageTag::Test),
+            _ => Err(value),
+        }
+    }
+}
+
+impl TryFrom<u8> for PagestreamBeMessageTag {
+    type Error = u8;
+    fn try_from(value: u8) -> Result<Self, u8> {
+        match value {
+            100 => Ok(PagestreamBeMessageTag::Exists),
+            101 => Ok(PagestreamBeMessageTag::Nblocks),
+            102 => Ok(PagestreamBeMessageTag::GetPage),
+            103 => Ok(PagestreamBeMessageTag::Error),
+            104 => Ok(PagestreamBeMessageTag::DbSize),
+            105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
+            #[cfg(feature = "testing")]
+            199 => Ok(PagestreamBeMessageTag::Test),
+            _ => Err(value),
+        }
+    }
+}
+
+// A GetPage request contains two LSN values:
+//
+// request_lsn: Get the page version at this point in time.  Lsn::Max is a special value that means
+// "get the latest version present". It's used by the primary server, which knows that no one else
+// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
+// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
+//
+// not_modified_since: Hint to the pageserver that the client knows that the page has not been
+// modified between 'not_modified_since' and the request LSN. It's always correct to set
+// 'not_modified_since' equal to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
+// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
+// request without waiting for 'request_lsn' to arrive.
+//
+// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
+// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
+// 'latest' was set to true. The V2 interface was added because there was no correct way for a
+// standby to request a page at a particular non-latest LSN, and also include the
+// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
+// request, if the standby knows that the page hasn't been modified since, and risk getting an error
+// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
+// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
+// interface allows sending both LSNs, and lets the pageserver do the right thing. There was no
+// difference in the responses between V1 and V2.
+//
+// The V3 version of the protocol adds a request ID to all requests. This request ID is also included in the
+// response, along with the other fields from the request, which allows us to verify that the response we
+// receive is for our request. We copy the request fields into the response to make this check more reliable:
+// the request ID is formed from the process ID and a local counter, so in principle request IDs can be duplicated if a PID is reused.
+//
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub enum PagestreamProtocolVersion {
+    V2,
+    V3,
+}
+
+pub type RequestId = u64;
+
+#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamRequest {
+    pub reqid: RequestId,
+    pub request_lsn: Lsn,
+    pub not_modified_since: Lsn,
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamExistsRequest {
+    pub hdr: PagestreamRequest,
+    pub rel: RelTag,
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamNblocksRequest {
+    pub hdr: PagestreamRequest,
+    pub rel: RelTag,
+}
+
+#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamGetPageRequest {
+    pub hdr: PagestreamRequest,
+    pub rel: RelTag,
+    pub blkno: u32,
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamDbSizeRequest {
+    pub hdr: PagestreamRequest,
+    pub dbnode: u32,
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamGetSlruSegmentRequest {
+    pub hdr: PagestreamRequest,
+    pub kind: u8,
+    pub segno: u32,
+}
+
+#[derive(Debug)]
+pub struct PagestreamExistsResponse {
+    pub req: PagestreamExistsRequest,
+    pub exists: bool,
+}
+
+/// Response to an `Nblocks` request. `req` is the echoed request; it is only put on the
+/// wire in protocol V3.
+#[derive(Debug)]
+pub struct PagestreamNblocksResponse {
+    pub req: PagestreamNblocksRequest,
+    pub n_blocks: u32,
+}
+
+/// Response to a `GetPage` request. `req` is the echoed request (protocol V3 only).
+/// `page` is written without a length prefix; `PagestreamBeMessage::deserialize` reads
+/// exactly 8192 bytes for it.
+#[derive(Debug)]
+pub struct PagestreamGetPageResponse {
+    pub req: PagestreamGetPageRequest,
+    pub page: Bytes,
+}
+
+/// Response to a `GetSlruSegment` request. `req` is the echoed request (protocol V3 only).
+/// `segment` is preceded on the wire by its length in blocks (`segment.len() / BLCKSZ`).
+#[derive(Debug)]
+pub struct PagestreamGetSlruSegmentResponse {
+    pub req: PagestreamGetSlruSegmentRequest,
+    pub segment: Bytes,
+}
+
+/// Error response. Only the request header is echoed (protocol V3 only), not the full
+/// request. `message` is sent as a NUL-terminated string.
+#[derive(Debug)]
+pub struct PagestreamErrorResponse {
+    pub req: PagestreamRequest,
+    pub message: String,
+}
+
+/// Response to a `DbSize` request. `req` is the echoed request (protocol V3 only).
+/// `db_size` is serialized as a big-endian `i64`.
+#[derive(Debug)]
+pub struct PagestreamDbSizeResponse {
+    pub req: PagestreamDbSizeRequest,
+    pub db_size: i64,
+}
+
+/// Test request, only compiled with the `testing` feature. After the common header it
+/// carries `batch_key` and `message`; the message is sent as a `u64` length followed by
+/// its UTF-8 bytes.
+#[cfg(feature = "testing")]
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct PagestreamTestRequest {
+    pub hdr: PagestreamRequest,
+    pub batch_key: u64,
+    pub message: String,
+}
+
+/// Test response, only compiled with the `testing` feature. It consists solely of the
+/// echoed request.
+#[cfg(feature = "testing")]
+#[derive(Debug)]
+pub struct PagestreamTestResponse {
+    pub req: PagestreamTestRequest,
+}
+
+impl PagestreamFeMessage {
+    /// Serialize a compute -> pageserver message. This is currently only used in testing
+    /// tools. Always uses protocol version 3.
+    ///
+    /// Layout: one tag byte, the request header (`reqid`, `request_lsn`,
+    /// `not_modified_since`), then the request-specific fields in declaration order.
+    /// All integers are written big-endian.
+    pub fn serialize(&self) -> Bytes {
+        /// Writes the tag byte followed by the V3 request header.
+        fn put_prefix(out: &mut BytesMut, tag: PagestreamFeMessageTag, hdr: &PagestreamRequest) {
+            out.put_u8(tag as u8);
+            out.put_u64(hdr.reqid);
+            out.put_u64(hdr.request_lsn.0);
+            out.put_u64(hdr.not_modified_since.0);
+        }
+
+        /// Writes a relation tag as spcnode, dbnode, relnode, forknum.
+        fn put_rel(out: &mut BytesMut, rel: &RelTag) {
+            out.put_u32(rel.spcnode);
+            out.put_u32(rel.dbnode);
+            out.put_u32(rel.relnode);
+            out.put_u8(rel.forknum);
+        }
+
+        use PagestreamFeMessageTag as Tag;
+        let mut bytes = BytesMut::new();
+
+        match self {
+            Self::Exists(req) => {
+                put_prefix(&mut bytes, Tag::Exists, &req.hdr);
+                put_rel(&mut bytes, &req.rel);
+            }
+
+            Self::Nblocks(req) => {
+                put_prefix(&mut bytes, Tag::Nblocks, &req.hdr);
+                put_rel(&mut bytes, &req.rel);
+            }
+
+            Self::GetPage(req) => {
+                put_prefix(&mut bytes, Tag::GetPage, &req.hdr);
+                put_rel(&mut bytes, &req.rel);
+                bytes.put_u32(req.blkno);
+            }
+
+            Self::DbSize(req) => {
+                put_prefix(&mut bytes, Tag::DbSize, &req.hdr);
+                bytes.put_u32(req.dbnode);
+            }
+
+            Self::GetSlruSegment(req) => {
+                put_prefix(&mut bytes, Tag::GetSlruSegment, &req.hdr);
+                bytes.put_u8(req.kind);
+                bytes.put_u32(req.segno);
+            }
+            #[cfg(feature = "testing")]
+            Self::Test(req) => {
+                put_prefix(&mut bytes, Tag::Test, &req.hdr);
+                bytes.put_u64(req.batch_key);
+                // The message is length-prefixed rather than NUL-terminated.
+                let message = req.message.as_bytes();
+                bytes.put_u64(message.len() as u64);
+                bytes.put_slice(message);
+            }
+        }
+
+        bytes.into()
+    }
+
+    /// Parse one compute -> pageserver message from `body`.
+    ///
+    /// With `PagestreamProtocolVersion::V2` the wire format has no request ID and `reqid`
+    /// is set to 0; with `V3` the request ID is read right after the tag byte.
+    ///
+    /// Returns an error if `body` ends early, if the tag byte is not a known
+    /// `PagestreamFeMessageTag`, or (test messages only) if the message is not valid UTF-8.
+    /// Note that the header is read before the tag is validated, so a short message with
+    /// a bad tag reports the I/O error.
+    pub fn parse<R: std::io::Read>(
+        body: &mut R,
+        protocol_version: PagestreamProtocolVersion,
+    ) -> anyhow::Result<PagestreamFeMessage> {
+        /// Reads a relation tag in wire order: spcnode, dbnode, relnode, forknum.
+        fn read_rel<S: std::io::Read>(src: &mut S) -> std::io::Result<RelTag> {
+            Ok(RelTag {
+                spcnode: src.read_u32::<BigEndian>()?,
+                dbnode: src.read_u32::<BigEndian>()?,
+                relnode: src.read_u32::<BigEndian>()?,
+                forknum: src.read_u8()?,
+            })
+        }
+
+        // these correspond to the NeonMessageTag enum in pagestore_client.h
+        //
+        // TODO: consider using protobuf or serde bincode for less error prone
+        // serialization.
+        let msg_tag = body.read_u8()?;
+        let reqid = match protocol_version {
+            PagestreamProtocolVersion::V2 => 0,
+            PagestreamProtocolVersion::V3 => body.read_u64::<BigEndian>()?,
+        };
+        // Field initializers run in source order, which is also the wire order.
+        let hdr = PagestreamRequest {
+            reqid,
+            request_lsn: Lsn::from(body.read_u64::<BigEndian>()?),
+            not_modified_since: Lsn::from(body.read_u64::<BigEndian>()?),
+        };
+
+        use PagestreamFeMessageTag as Tag;
+        let tag = Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?;
+
+        let msg = match tag {
+            Tag::Exists => PagestreamFeMessage::Exists(PagestreamExistsRequest {
+                hdr,
+                rel: read_rel(body)?,
+            }),
+            Tag::Nblocks => PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
+                hdr,
+                rel: read_rel(body)?,
+            }),
+            Tag::GetPage => PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
+                hdr,
+                rel: read_rel(body)?,
+                blkno: body.read_u32::<BigEndian>()?,
+            }),
+            Tag::DbSize => PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
+                hdr,
+                dbnode: body.read_u32::<BigEndian>()?,
+            }),
+            Tag::GetSlruSegment => {
+                PagestreamFeMessage::GetSlruSegment(PagestreamGetSlruSegmentRequest {
+                    hdr,
+                    kind: body.read_u8()?,
+                    segno: body.read_u32::<BigEndian>()?,
+                })
+            }
+            #[cfg(feature = "testing")]
+            Tag::Test => {
+                let batch_key = body.read_u64::<BigEndian>()?;
+                let len = body.read_u64::<BigEndian>()?;
+                // Do not pre-allocate `len` bytes: the length comes straight off the wire,
+                // and a corrupt value would panic (capacity overflow) or abort on
+                // allocation. Reading through `take` grows the buffer only as data arrives.
+                let mut buf = Vec::new();
+                body.by_ref().take(len).read_to_end(&mut buf)?;
+                if buf.len() as u64 != len {
+                    anyhow::bail!(
+                        "test message truncated: expected {len} bytes, got {}",
+                        buf.len()
+                    );
+                }
+                PagestreamFeMessage::Test(PagestreamTestRequest {
+                    hdr,
+                    batch_key,
+                    message: String::from_utf8(buf)?,
+                })
+            }
+        };
+        Ok(msg)
+    }
+}
+
+impl PagestreamBeMessage {
+    /// Serialize a pageserver -> compute message for the given protocol version.
+    ///
+    /// Every message starts with its tag byte. In protocol V3 the tag is followed by an
+    /// echo of the request (header plus request-specific fields); in V2 nothing is echoed.
+    /// The response payload comes last and is identical in both versions. All integers
+    /// are written big-endian.
+    pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes {
+        /// Writes the request header: reqid, request_lsn, not_modified_since.
+        fn put_hdr(out: &mut BytesMut, hdr: &PagestreamRequest) {
+            out.put_u64(hdr.reqid);
+            out.put_u64(hdr.request_lsn.0);
+            out.put_u64(hdr.not_modified_since.0);
+        }
+
+        /// Writes a relation tag as spcnode, dbnode, relnode, forknum.
+        fn put_rel(out: &mut BytesMut, rel: &RelTag) {
+            out.put_u32(rel.spcnode);
+            out.put_u32(rel.dbnode);
+            out.put_u32(rel.relnode);
+            out.put_u8(rel.forknum);
+        }
+
+        use PagestreamBeMessageTag as Tag;
+        // Exhaustive on purpose: a new protocol version must decide this explicitly.
+        let echo_request = match protocol_version {
+            PagestreamProtocolVersion::V2 => false,
+            PagestreamProtocolVersion::V3 => true,
+        };
+
+        let mut bytes = BytesMut::new();
+        match self {
+            Self::Exists(resp) => {
+                bytes.put_u8(Tag::Exists as u8);
+                if echo_request {
+                    put_hdr(&mut bytes, &resp.req.hdr);
+                    put_rel(&mut bytes, &resp.req.rel);
+                }
+                bytes.put_u8(resp.exists as u8);
+            }
+
+            Self::Nblocks(resp) => {
+                bytes.put_u8(Tag::Nblocks as u8);
+                if echo_request {
+                    put_hdr(&mut bytes, &resp.req.hdr);
+                    put_rel(&mut bytes, &resp.req.rel);
+                }
+                bytes.put_u32(resp.n_blocks);
+            }
+
+            Self::GetPage(resp) => {
+                bytes.put_u8(Tag::GetPage as u8);
+                if echo_request {
+                    put_hdr(&mut bytes, &resp.req.hdr);
+                    put_rel(&mut bytes, &resp.req.rel);
+                    bytes.put_u32(resp.req.blkno);
+                }
+                // The page is written without a length prefix.
+                bytes.put(&resp.page[..]);
+            }
+
+            Self::Error(resp) => {
+                bytes.put_u8(Tag::Error as u8);
+                if echo_request {
+                    // Error responses echo only the request header.
+                    put_hdr(&mut bytes, &resp.req);
+                }
+                bytes.put(resp.message.as_bytes());
+                bytes.put_u8(0); // null terminator
+            }
+
+            Self::DbSize(resp) => {
+                bytes.put_u8(Tag::DbSize as u8);
+                if echo_request {
+                    put_hdr(&mut bytes, &resp.req.hdr);
+                    bytes.put_u32(resp.req.dbnode);
+                }
+                bytes.put_i64(resp.db_size);
+            }
+
+            Self::GetSlruSegment(resp) => {
+                bytes.put_u8(Tag::GetSlruSegment as u8);
+                if echo_request {
+                    put_hdr(&mut bytes, &resp.req.hdr);
+                    bytes.put_u8(resp.req.kind);
+                    bytes.put_u32(resp.req.segno);
+                }
+                // The segment is prefixed with its length in blocks, not bytes.
+                bytes.put_u32((resp.segment.len() / BLCKSZ) as u32);
+                bytes.put(&resp.segment[..]);
+            }
+
+            #[cfg(feature = "testing")]
+            Self::Test(resp) => {
+                bytes.put_u8(Tag::Test as u8);
+                if echo_request {
+                    put_hdr(&mut bytes, &resp.req.hdr);
+                }
+                // batch_key and message are sent in both protocol versions.
+                bytes.put_u64(resp.req.batch_key);
+                let message = resp.req.message.as_bytes();
+                bytes.put_u64(message.len() as u64);
+                bytes.put_slice(message);
+            }
+        }
+        bytes.into()
+    }
+
+    /// Deserialize a pageserver -> compute message.
+    ///
+    /// Only the protocol V3 layout is understood: the request header is always read
+    /// right after the tag byte. `buf` must contain exactly one message.
+    ///
+    /// Returns an error if the tag is unknown, if the buffer ends early, if a length
+    /// field claims more data than the buffer holds, if a string is not valid UTF-8, or
+    /// if bytes are left over after the message.
+    pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
+        /// Reads a relation tag in wire order: spcnode, dbnode, relnode, forknum.
+        fn read_rel<S: std::io::Read>(src: &mut S) -> std::io::Result<RelTag> {
+            Ok(RelTag {
+                spcnode: src.read_u32::<BigEndian>()?,
+                dbnode: src.read_u32::<BigEndian>()?,
+                relnode: src.read_u32::<BigEndian>()?,
+                forknum: src.read_u8()?,
+            })
+        }
+
+        let mut buf = buf.reader();
+        let msg_tag = buf.read_u8()?;
+
+        use PagestreamBeMessageTag as Tag;
+        // Validate the tag before touching the rest of the message.
+        let tag = Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?;
+
+        // Every message echoes the request header first. Field initializers run in
+        // source order, which is also the wire order.
+        let hdr = PagestreamRequest {
+            reqid: buf.read_u64::<BigEndian>()?,
+            request_lsn: Lsn(buf.read_u64::<BigEndian>()?),
+            not_modified_since: Lsn(buf.read_u64::<BigEndian>()?),
+        };
+
+        let ok = match tag {
+            Tag::Exists => {
+                let rel = read_rel(&mut buf)?;
+                let exists = buf.read_u8()? != 0;
+                Self::Exists(PagestreamExistsResponse {
+                    req: PagestreamExistsRequest { hdr, rel },
+                    exists,
+                })
+            }
+            Tag::Nblocks => {
+                let rel = read_rel(&mut buf)?;
+                let n_blocks = buf.read_u32::<BigEndian>()?;
+                Self::Nblocks(PagestreamNblocksResponse {
+                    req: PagestreamNblocksRequest { hdr, rel },
+                    n_blocks,
+                })
+            }
+            Tag::GetPage => {
+                let rel = read_rel(&mut buf)?;
+                let blkno = buf.read_u32::<BigEndian>()?;
+                // NOTE(review): literal page size; presumably the same value as BLCKSZ —
+                // confirm before replacing it with the constant.
+                let mut page = vec![0; 8192]; // TODO: use MaybeUninit
+                buf.read_exact(&mut page)?;
+                Self::GetPage(PagestreamGetPageResponse {
+                    req: PagestreamGetPageRequest { hdr, rel, blkno },
+                    page: page.into(),
+                })
+            }
+            Tag::Error => {
+                // The message is NUL-terminated; `read_until` keeps the terminator,
+                // which `from_vec_with_nul` requires.
+                let mut msg = Vec::new();
+                buf.read_until(0, &mut msg)?;
+                let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
+                let rust_str = cstring.to_str()?;
+                Self::Error(PagestreamErrorResponse {
+                    req: hdr,
+                    message: rust_str.to_owned(),
+                })
+            }
+            Tag::DbSize => {
+                let dbnode = buf.read_u32::<BigEndian>()?;
+                let db_size = buf.read_i64::<BigEndian>()?;
+                Self::DbSize(PagestreamDbSizeResponse {
+                    req: PagestreamDbSizeRequest { hdr, dbnode },
+                    db_size,
+                })
+            }
+            Tag::GetSlruSegment => {
+                let kind = buf.read_u8()?;
+                let segno = buf.read_u32::<BigEndian>()?;
+                let n_blocks = buf.read_u32::<BigEndian>()?;
+                // Check the claimed size against what is actually left before
+                // allocating: a corrupt block count could otherwise request terabytes.
+                let available = buf.get_ref().len();
+                let segment_len = (n_blocks as usize)
+                    .checked_mul(BLCKSZ)
+                    .filter(|len| *len <= available)
+                    .ok_or_else(|| {
+                        anyhow::anyhow!(
+                            "slru segment of {n_blocks} blocks exceeds the remaining {available} bytes"
+                        )
+                    })?;
+                let mut segment = vec![0; segment_len];
+                buf.read_exact(&mut segment)?;
+                Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
+                    req: PagestreamGetSlruSegmentRequest { hdr, kind, segno },
+                    segment: segment.into(),
+                })
+            }
+            #[cfg(feature = "testing")]
+            Tag::Test => {
+                let batch_key = buf.read_u64::<BigEndian>()?;
+                let wire_len = buf.read_u64::<BigEndian>()?;
+                // Same guard as for SLRU segments: never allocate more than the buffer
+                // can still provide.
+                let available = buf.get_ref().len();
+                let msg_len = usize::try_from(wire_len)
+                    .ok()
+                    .filter(|len| *len <= available)
+                    .ok_or_else(|| {
+                        anyhow::anyhow!(
+                            "test message length {wire_len} exceeds the remaining {available} bytes"
+                        )
+                    })?;
+                let mut msg = vec![0; msg_len];
+                buf.read_exact(&mut msg)?;
+                let message = String::from_utf8(msg)?;
+                Self::Test(PagestreamTestResponse {
+                    req: PagestreamTestRequest {
+                        hdr,
+                        batch_key,
+                        message,
+                    },
+                })
+            }
+        };
+
+        // A well-formed buffer holds exactly one message.
+        let remaining = buf.into_inner();
+        if !remaining.is_empty() {
+            anyhow::bail!(
+                "remaining bytes in msg with tag={msg_tag}: {}",
+                remaining.len()
+            );
+        }
+        Ok(ok)
+    }
+
+    /// Returns the variant name as a static string.
+    pub fn kind(&self) -> &'static str {
+        match self {
+            Self::Exists(_) => "Exists",
+            Self::Nblocks(_) => "Nblocks",
+            Self::GetPage(_) => "GetPage",
+            Self::Error(_) => "Error",
+            Self::DbSize(_) => "DbSize",
+            Self::GetSlruSegment(_) => "GetSlruSegment",
+            #[cfg(feature = "testing")]
+            Self::Test(_) => "Test",
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Header used by most test messages.
+    fn test_hdr() -> PagestreamRequest {
+        PagestreamRequest {
+            reqid: 0,
+            request_lsn: Lsn(4),
+            not_modified_since: Lsn(3),
+        }
+    }
+
+    /// Relation tag used by all test messages.
+    fn test_rel() -> RelTag {
+        RelTag {
+            forknum: 1,
+            spcnode: 2,
+            dbnode: 3,
+            relnode: 4,
+        }
+    }
+
+    /// Every non-test request variant must survive a V3 serialize/parse round trip, and
+    /// parsing must consume the whole serialized message.
+    #[test]
+    fn test_pagestream() {
+        let hdr = test_hdr();
+        let rel = test_rel();
+
+        // Test serialization/deserialization of PagestreamFeMessage
+        let messages = vec![
+            PagestreamFeMessage::Exists(PagestreamExistsRequest { hdr, rel }),
+            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
+                // Also cover the case where both LSNs are equal.
+                hdr: PagestreamRequest {
+                    not_modified_since: Lsn(4),
+                    ..hdr
+                },
+                rel,
+            }),
+            PagestreamFeMessage::GetPage(PagestreamGetPageRequest { hdr, rel, blkno: 7 }),
+            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { hdr, dbnode: 7 }),
+            PagestreamFeMessage::GetSlruSegment(PagestreamGetSlruSegmentRequest {
+                hdr,
+                kind: 0,
+                segno: 5,
+            }),
+        ];
+        for msg in messages {
+            let bytes = msg.serialize();
+            let mut reader = bytes.reader();
+            let reconstructed =
+                PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V3).unwrap();
+            assert!(msg == reconstructed);
+            assert!(reader.into_inner().is_empty());
+        }
+    }
+
+    /// Responses do not implement `PartialEq`, so compare the serialized form before and
+    /// after a V3 deserialize round trip.
+    #[test]
+    fn test_pagestream_be_roundtrip() {
+        let hdr = test_hdr();
+        let rel = test_rel();
+
+        let messages = vec![
+            PagestreamBeMessage::Exists(PagestreamExistsResponse {
+                req: PagestreamExistsRequest { hdr, rel },
+                exists: true,
+            }),
+            PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
+                req: PagestreamNblocksRequest { hdr, rel },
+                n_blocks: 42,
+            }),
+            PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
+                req: PagestreamGetPageRequest { hdr, rel, blkno: 7 },
+                // `deserialize` reads exactly 8192 bytes for a page.
+                page: Bytes::from(vec![0xAB; 8192]),
+            }),
+            PagestreamBeMessage::Error(PagestreamErrorResponse {
+                req: hdr,
+                message: "boom".to_string(),
+            }),
+            PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
+                req: PagestreamDbSizeRequest { hdr, dbnode: 7 },
+                db_size: 1234,
+            }),
+            PagestreamBeMessage::GetSlruSegment(PagestreamGetSlruSegmentResponse {
+                req: PagestreamGetSlruSegmentRequest {
+                    hdr,
+                    kind: 0,
+                    segno: 5,
+                },
+                // Must be a whole number of blocks to round-trip.
+                segment: Bytes::from(vec![0xCD; 2 * BLCKSZ]),
+            }),
+        ];
+        for msg in messages {
+            let bytes = msg.serialize(PagestreamProtocolVersion::V3);
+            let reconstructed = PagestreamBeMessage::deserialize(bytes.clone()).unwrap();
+            assert_eq!(msg.kind(), reconstructed.kind());
+            assert!(bytes == reconstructed.serialize(PagestreamProtocolVersion::V3));
+        }
+    }
+}
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -1,9 +1,9 @@
 use std::cmp::Ordering;
 use std::fmt;

-use postgres_ffi::Oid;
-use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
-use postgres_ffi::relfile_utils::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name};
+use postgres_ffi_types::Oid;
+use postgres_ffi_types::constants::GLOBALTABLESPACE_OID;
+use postgres_ffi_types::forknum::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name};
 use serde::{Deserialize, Serialize};

 ///
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -35,8 +35,9 @@ use std::hash::{Hash, Hasher};

 #[doc(inline)]
 pub use ::utils::shard::*;
-use postgres_ffi::relfile_utils::INIT_FORKNUM;
+use postgres_ffi_types::forknum::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
+use utils::critical;

 use crate::key::Key;
 use crate::models::ShardParameters;
@@ -179,7 +180,7 @@ impl ShardIdentity {

    /// For use when creating ShardIdentity instances for new shards, where a creation request
    /// specifies the ShardParameters that apply to all shards.
-    pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self {
+    pub fn from_params(number: ShardNumber, params: ShardParameters) -> Self {
        Self {
            number,
            count: params.count,
@@ -188,6 +189,17 @@ impl ShardIdentity {
        }
    }

+    /// Asserts that the given shard identities are equal. Changes to shard parameters will likely
+    /// result in data corruption.
+    pub fn assert_equal(&self, other: ShardIdentity) {
+        if self != &other {
+            // TODO: for now, we're conservative and just log errors in production. Turn this into a
+            // real assertion when we're confident it doesn't misfire, and also reject requests that
+            // attempt to change it with an error response.
+            critical!("shard identity mismatch: {self:?} != {other:?}");
+        }
+    }
+
    fn is_broken(&self) -> bool {
        self.layout == LAYOUT_BROKEN
    }
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -23,22 +23,12 @@ pub struct ReAttachRequest {
    pub register: Option<NodeRegisterRequest>,
 }

-fn default_mode() -> LocationConfigMode {
-    LocationConfigMode::AttachedSingle
-}
-
 #[derive(Serialize, Deserialize, Debug)]
 pub struct ReAttachResponseTenant {
    pub id: TenantShardId,
    /// Mandatory if LocationConfigMode is None or set to an Attached* mode
    pub r#gen: Option<u32>,
-
-    /// Default value only for backward compat: this field should be set
-    #[serde(default = "default_mode")]
    pub mode: LocationConfigMode,
-
-    // Default value only for backward compat: this field should be set
-    #[serde(default = "ShardStripeSize::default")]
    pub stripe_size: ShardStripeSize,
 }
 #[derive(Serialize, Deserialize)]
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -939,7 +939,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackendReader<IO> {
                FeMessage::CopyFail => Err(CopyStreamHandlerEnd::CopyFail),
                FeMessage::Terminate => Err(CopyStreamHandlerEnd::Terminate),
                _ => Err(CopyStreamHandlerEnd::from(ConnectionError::Protocol(
-                    ProtocolError::Protocol(format!("unexpected message in COPY stream {:?}", msg)),
+                    ProtocolError::Protocol(format!("unexpected message in COPY stream {msg:?}")),
                ))),
            },
            None => Err(CopyStreamHandlerEnd::EOF),
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`pub const PGBOUNCER_PIDFILE: &str = "/tmp/pgbouncer.pid";`