code for local setup of auth-broker

use cargo-chef for compute-tools
fix: additional changes to terminate pgbouncer on compute suspend (#12153 ) (#12284 )
2026-05-17 13:10:38 +00:00 · 2025-06-19 10:34:09 +01:00 · 2025-06-19 09:24:53 +01:00 · 2025-06-18 19:31:22 +00:00 · 2025-06-18 17:45:20 +00:00 · 2025-06-18 15:51:49 +00:00
176 changed files with 5330 additions and 1673 deletions
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -38,6 +38,11 @@ on:
        required: false
        default: 1
        type: number
+      rerun-failed:
+        description: 'rerun failed tests to ignore flaky tests'
+        required: false
+        default: true
+        type: boolean

 defaults:
  run:
@@ -379,7 +384,7 @@ jobs:
      - name: Pytest regression tests
        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
        uses: ./.github/actions/run-python-test-set
-        timeout-minutes: ${{ inputs.sanitizers != 'enabled' && 75 || 180 }}
+        timeout-minutes: ${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 75 || 180 }}
        with:
          build_type: ${{ inputs.build-type }}
          test_selection: regress
@@ -387,14 +392,14 @@ jobs:
          run_with_real_s3: true
          real_s3_bucket: neon-github-ci-tests
          real_s3_region: eu-central-1
-          rerun_failed: ${{ inputs.test-run-count == 1 }}
+          rerun_failed: ${{ inputs.rerun-failed }}
          pg_version: ${{ matrix.pg_version }}
          sanitizers: ${{ inputs.sanitizers }}
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
          # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
          # Attempt to stop tests gracefully to generate test reports
          # until they are forcibly stopped by the stricter `timeout-minutes` limit.
-          extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }} --count=${{ inputs.test-run-count }}
+          extra_params: --session-timeout=${{ (inputs.build-type == 'release' && inputs.sanitizers != 'enabled') && 3000 || 10200 }} --count=${{ inputs.test-run-count }}
                        ${{ inputs.test-selection != '' && format('-k "{0}"', inputs.test-selection) || '' }}
        env:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
--- a/.github/workflows/build_and_run_selected_test.yml
+++ b/.github/workflows/build_and_run_selected_test.yml
@@ -58,6 +58,7 @@ jobs:
      test-cfg: ${{ inputs.pg-versions }}
      test-selection: ${{ inputs.test-selection }}
      test-run-count: ${{ fromJson(inputs.run-count) }}
+      rerun-failed: false
    secrets: inherit

  create-test-report:
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -199,6 +199,28 @@ jobs:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
    secrets: inherit

+  validate-compute-manifest:
+    runs-on: ubuntu-22.04
+    needs: [ meta, check-permissions ]
+    # We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Set up Node.js
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        with:
+          node-version: '24'
+
+      - name: Validate manifest against schema
+        run: |
+          make -C compute manifest-schema-validation
+
  build-and-test-locally:
    needs: [ meta, build-build-tools-image ]
    # We do need to run this in `.*-rc-pr` because of hotfixes.
--- a/.github/workflows/build_and_test_fully.yml
+++ b/.github/workflows/build_and_test_fully.yml
@@ -0,0 +1,151 @@
+name: Build and Test Fully
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:   '0 3 * * *' # run once a day, timezone is utc
+  workflow_dispatch:
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+  COPT: '-Werror'
+
+jobs:
+  tag:
+    runs-on: [ self-hosted, small ]
+    container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned
+    outputs:
+      build-tag: ${{steps.build-tag.outputs.tag}}
+
+    steps:
+      # Need `fetch-depth: 0` to count the number of commits in the branch
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+
+      - name: Get build tag
+        run: |
+          echo run:$GITHUB_RUN_ID
+          echo ref:$GITHUB_REF_NAME
+          echo rev:$(git rev-list --count HEAD)
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
+            echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
+            echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
+            echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
+          fi
+        shell: bash
+        id: build-tag
+
+  build-build-tools-image:
+    uses: ./.github/workflows/build-build-tools-image.yml
+    secrets: inherit
+
+  build-and-test-locally:
+    needs: [ tag, build-build-tools-image ]
+    strategy:
+      fail-fast: false
+      matrix:
+        arch: [ x64, arm64 ]
+        build-type: [ debug, release ]
+    uses: ./.github/workflows/_build-and-test-locally.yml
+    with:
+      arch: ${{ matrix.arch }}
+      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      build-tag: ${{ needs.tag.outputs.build-tag }}
+      build-type: ${{ matrix.build-type }}
+      rerun-failed: false
+      test-cfg: '[{"pg_version":"v14", "lfc_state": "with-lfc"},
+                  {"pg_version":"v15", "lfc_state": "with-lfc"},
+                  {"pg_version":"v16", "lfc_state": "with-lfc"},
+                  {"pg_version":"v17", "lfc_state": "with-lfc"},
+                  {"pg_version":"v14", "lfc_state": "without-lfc"},
+                  {"pg_version":"v15", "lfc_state": "without-lfc"},
+                  {"pg_version":"v16", "lfc_state": "without-lfc"},
+                  {"pg_version":"v17", "lfc_state": "withouts-lfc"}]'
+    secrets: inherit
+
+
+  create-test-report:
+    needs: [ build-and-test-locally, build-build-tools-image ]
+    if: ${{ !cancelled() }}
+    permissions:
+      id-token: write # aws-actions/configure-aws-credentials
+      statuses: write
+      contents: write
+      pull-requests: write
+    outputs:
+      report-url: ${{ steps.create-allure-report.outputs.report-url }}
+
+    runs-on: [ self-hosted, small ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }}
+        id: create-allure-report
+        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
+      - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+        if: ${{ !cancelled() }}
+        with:
+          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
+          retries: 5
+          script: |
+            const report = {
+              reportUrl:     "${{ steps.create-allure-report.outputs.report-url }}",
+              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
+            }
+
+            const coverage = {}
+
+            const script = require("./scripts/comment-test-report.js")
+            await script({
+              github,
+              context,
+              fetch,
+              report,
+              coverage,
+            })
--- a/.github/workflows/build_and_test_with_sanitizers.yml
+++ b/.github/workflows/build_and_test_with_sanitizers.yml
@@ -79,6 +79,7 @@ jobs:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
+      rerun-failed: false
      test-cfg: '[{"pg_version":"v17"}]'
      sanitizers: enabled
    secrets: inherit
--- a/.github/workflows/large_oltp_benchmark.yml
+++ b/.github/workflows/large_oltp_benchmark.yml
@@ -33,11 +33,19 @@ jobs:
      fail-fast: false # allow other variants to continue even if one fails
      matrix:
        include:
+          # test only read-only custom scripts in new branch without database maintenance
+          - target: new_branch
+            custom_scripts: select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3
+            test_maintenance: false
+          # test all custom scripts in new branch with database maintenance
          - target: new_branch
            custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
+            test_maintenance: true
+          # test all custom scripts in reuse branch with database maintenance
          - target: reuse_branch
            custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100
-      max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
+            test_maintenance: true
+      max-parallel: 1 # we want to run each benchmark sequentially to not have noisy neighbors on shared storage (PS, SK)
    permissions:
      contents: write
      statuses: write
@@ -145,6 +153,7 @@ jobs:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

    - name: Benchmark database maintenance
+      if: ${{ matrix.test_maintenance == 'true' }}
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
--- a/.github/workflows/large_oltp_growth.yml
+++ b/.github/workflows/large_oltp_growth.yml
@@ -0,0 +1,175 @@
+name: large oltp growth
+# workflow to grow the reuse branch of large oltp benchmark continuously (about 16 GB per run)
+
+on:
+  # uncomment to run on push for debugging your PR
+  # push:
+  #  branches: [ bodobolero/increase_large_oltp_workload ]
+
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #        ┌───────────── minute (0 - 59)
+    #        │ ┌───────────── hour (0 - 23)
+    #        │ │  ┌───────────── day of the month (1 - 31)
+    #        │ │  │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #        │ │  │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron: '0 6 * * *'   # 06:00 UTC
+    - cron: '0 8 * * *'   # 08:00 UTC
+    - cron: '0 10 * * *'  # 10:00 UTC
+    - cron: '0 12 * * *'  # 12:00 UTC
+    - cron: '0 14 * * *'  # 14:00 UTC
+    - cron: '0 16 * * *'  # 16:00 UTC
+  workflow_dispatch: # adds ability to run this manually
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow globally because we need dedicated resources which only exist once
+  group: large-oltp-growth
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  oltp:
+    strategy:
+      fail-fast: false # allow other variants to continue even if one fails
+      matrix:
+        include:
+          # for now only grow the reuse branch, not the other branches.
+          - target: reuse_branch
+            custom_scripts:
+            - grow_action_blocks.sql
+            - grow_action_kwargs.sql
+            - grow_device_fingerprint_event.sql
+            - grow_edges.sql
+            - grow_hotel_rate_mapping.sql
+            - grow_ocr_pipeline_results_version.sql
+            - grow_priceline_raw_response.sql
+            - grow_relabled_transactions.sql
+            - grow_state_values.sql
+            - grow_values.sql
+            - grow_vertices.sql
+            - update_accounting_coding_body_tracking_category_selection.sql
+            - update_action_blocks.sql
+            - update_action_kwargs.sql
+            - update_denormalized_approval_workflow.sql
+            - update_device_fingerprint_event.sql
+            - update_edges.sql
+            - update_heron_transaction_enriched_log.sql
+            - update_heron_transaction_enrichment_requests.sql
+            - update_hotel_rate_mapping.sql
+            - update_incoming_webhooks.sql
+            - update_manual_transaction.sql
+            - update_ml_receipt_matching_log.sql
+            - update_ocr_pipeine_results_version.sql
+            - update_orc_pipeline_step_results.sql
+            - update_orc_pipeline_step_results_version.sql
+            - update_priceline_raw_response.sql
+            - update_quickbooks_transactions.sql
+            - update_raw_finicity_transaction.sql
+            - update_relabeled_transactions.sql
+            - update_state_values.sql
+            - update_stripe_authorization_event_log.sql
+            - update_transaction.sql
+            - update_values.sql
+            - update_vertices.sql
+      max-parallel: 1 # we want to run each growth workload sequentially (for now there is just one)
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
+    env:
+      TEST_PG_BENCH_DURATIONS_MATRIX: "1h"
+      TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ join(matrix.custom_scripts, ' ') }}
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      PG_VERSION: 16 # pre-determined by pre-determined project
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      PLATFORM: ${{ matrix.target }}
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+
+    steps:
+    - name: Harden the runner (Audit all outbound calls)
+      uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+      with:
+        egress-policy: audit
+
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+    - name: Configure AWS credentials # necessary to download artefacts
+      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+    - name: Set up Connection String
+      id: set-up-connstr
+      run: |
+        case "${{ matrix.target }}" in
+          reuse_branch)
+          CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
+          ;;
+          *)
+          echo >&2 "Unknown target=${{ matrix.target }}"
+          exit 1
+          ;;
+        esac
+
+        CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}"
+
+        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+        echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT
+
+    - name: pgbench with custom-scripts
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: true
+        extra_params: -m remote_cluster --timeout 7200 -k test_perf_oltp_large_tenant_growth
+        pg_version: ${{ env.PG_VERSION }}
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Create Allure report
+      id: create-allure-report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+      with:
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1
+      with:
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
+        slack-message: |
+          Periodic large oltp tenant growth increase: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -903,12 +903,6 @@ version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"

-[[package]]
-name = "base64"
-version = "0.20.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
-
 [[package]]
 name = "base64"
 version = "0.21.7"
@@ -1300,7 +1294,7 @@ dependencies = [
 "aws-smithy-types",
 "axum",
 "axum-extra",
- "base64 0.13.1",
+ "base64 0.22.1",
 "bytes",
 "camino",
 "cfg-if",
@@ -1426,7 +1420,7 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "base64 0.13.1",
+ "base64 0.22.1",
 "camino",
 "clap",
 "comfy-table",
@@ -4471,11 +4465,14 @@ dependencies = [
 name = "pageserver_page_api"
 version = "0.1.0"
 dependencies = [
+ "anyhow",
 "bytes",
+ "futures",
 "pageserver_api",
 "postgres_ffi",
 "prost 0.13.5",
 "thiserror 1.0.69",
+ "tokio",
 "tonic 0.13.1",
 "tonic-build",
 "utils",
@@ -4818,7 +4815,7 @@ dependencies = [
 name = "postgres-protocol2"
 version = "0.1.0"
 dependencies = [
- "base64 0.20.0",
+ "base64 0.22.1",
 "byteorder",
 "bytes",
 "fallible-iterator",
@@ -5190,7 +5187,7 @@ dependencies = [
 "aws-config",
 "aws-sdk-iam",
 "aws-sigv4",
- "base64 0.13.1",
+ "base64 0.22.1",
 "bstr",
 "bytes",
 "camino",
@@ -6494,15 +6491,17 @@ dependencies = [

 [[package]]
 name = "serde_with"
-version = "2.3.3"
+version = "3.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe"
+checksum = "d6b6f7f2fcb69f747921f79f3926bd1e203fce4fef62c268dd3abfb6d86029aa"
 dependencies = [
- "base64 0.13.1",
+ "base64 0.22.1",
 "chrono",
 "hex",
 "indexmap 1.9.3",
+ "indexmap 2.9.0",
 "serde",
+ "serde_derive",
 "serde_json",
 "serde_with_macros",
 "time",
@@ -6510,9 +6509,9 @@ dependencies = [

 [[package]]
 name = "serde_with_macros"
-version = "2.3.3"
+version = "3.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "881b6f881b17d13214e5d494c939ebab463d01264ce1811e9d4ac3a882e7695f"
+checksum = "8d00caa5193a3c8362ac2b73be6b9e768aa5a4b2f721d8f4b339600c3cb51f8e"
 dependencies = [
 "darling",
 "proc-macro2",
@@ -8583,7 +8582,6 @@ dependencies = [
 "anyhow",
 "axum",
 "axum-core",
- "base64 0.13.1",
 "base64 0.21.7",
 "base64ct",
 "bytes",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -72,7 +72,7 @@ aws-sigv4 = { version = "1.2", features = ["sign-http"] }
 aws-types = "1.3"
 axum = { version = "0.8.1", features = ["ws"] }
 axum-extra = { version = "0.10.0", features = ["typed-header", "query"] }
-base64 = "0.13.0"
+base64 = "0.22"
 bincode = "1.3"
 bindgen = "0.71"
 bit_field = "0.10.2"
@@ -171,7 +171,7 @@ sentry = { version = "0.37", default-features = false, features = ["backtrace",
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_path_to_error = "0.1"
-serde_with = { version = "2.0", features = [ "base64" ] }
+serde_with = { version = "3", features = [ "base64" ] }
 serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
--- a/compute/.gitignore
+++ b/compute/.gitignore
@@ -3,3 +3,6 @@ etc/neon_collector.yml
 etc/neon_collector_autoscaling.yml
 etc/sql_exporter.yml
 etc/sql_exporter_autoscaling.yml
+
+# Node.js dependencies
+node_modules/
--- a/compute/Makefile
+++ b/compute/Makefile
@@ -48,3 +48,11 @@ jsonnetfmt-test:
 .PHONY: jsonnetfmt-format
 jsonnetfmt-format:
 	jsonnetfmt --in-place $(jsonnet_files)
+
+.PHONY: manifest-schema-validation
+manifest-schema-validation: node_modules
+	node_modules/.bin/jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml
+
+node_modules: package.json
+	npm install
+	touch node_modules
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -149,8 +149,10 @@ RUN case $DEBIAN_VERSION in \
    ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \
+    libclang-dev \
    $VERSION_INSTALLS \
-    && apt clean && rm -rf /var/lib/apt/lists/*
+    && apt clean && rm -rf /var/lib/apt/lists/* && \
+    useradd -ms /bin/bash nonroot -b /home

 #########################################################################################
 #
@@ -1057,17 +1059,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \

 #########################################################################################
 #
-# Layer "pg build with nonroot user and cargo installed"
-# This layer is base and common for layers with `pgrx`
+# Layer "build-deps with Rust toolchain installed"
 #
 #########################################################################################
-FROM pg-build AS pg-build-nonroot-with-cargo
-ARG PG_VERSION
-
-RUN apt update && \
-    apt install --no-install-recommends --no-install-suggests -y curl libclang-dev && \
-    apt clean && rm -rf /var/lib/apt/lists/* && \
-    useradd -ms /bin/bash nonroot -b /home
+FROM build-deps AS build-deps-with-cargo

 ENV HOME=/home/nonroot
 ENV PATH="/home/nonroot/.cargo/bin:$PATH"
@@ -1082,13 +1077,29 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init

+#########################################################################################
+#
+# Layer "pg-build with Rust toolchain installed"
+# This layer is base and common for layers with `pgrx`
+#
+#########################################################################################
+FROM pg-build AS pg-build-with-cargo
+ARG PG_VERSION
+
+ENV HOME=/home/nonroot
+ENV PATH="/home/nonroot/.cargo/bin:$PATH"
+USER nonroot
+WORKDIR /home/nonroot
+
+COPY --from=build-deps-with-cargo /home/nonroot /home/nonroot
+
 #########################################################################################
 #
 # Layer "rust extensions"
 # This layer is used to build `pgrx` deps
 #
 #########################################################################################
-FROM pg-build-nonroot-with-cargo AS rust-extensions-build
+FROM pg-build-with-cargo AS rust-extensions-build
 ARG PG_VERSION

 RUN case "${PG_VERSION:?}" in \
@@ -1110,7 +1121,7 @@ USER root
 # and eventually get merged with `rust-extensions-build`
 #
 #########################################################################################
-FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx12
+FROM pg-build-with-cargo AS rust-extensions-build-pgrx12
 ARG PG_VERSION

 RUN cargo install --locked --version 0.12.9 cargo-pgrx && \
@@ -1127,7 +1138,7 @@ USER root
 # and eventually get merged with `rust-extensions-build`
 #
 #########################################################################################
-FROM pg-build-nonroot-with-cargo AS rust-extensions-build-pgrx14
+FROM pg-build-with-cargo AS rust-extensions-build-pgrx14
 ARG PG_VERSION

 RUN cargo install --locked --version 0.14.1 cargo-pgrx && \
@@ -1144,10 +1155,12 @@ USER root

 FROM build-deps AS pgrag-src
 ARG PG_VERSION
-
 WORKDIR /ext-src
+COPY compute/patches/onnxruntime.patch .
+
 RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \
    mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
+    patch -p1 < /ext-src/onnxruntime.patch && \
    echo "#nothing to test here" > neon-test.sh

 RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz &&  \
@@ -1722,11 +1735,29 @@ FROM extensions-${EXTENSIONS} AS neon-pg-ext-build
 # Compile the Neon-specific `compute_ctl`, `fast_import`, and `local_proxy` binaries
 #
 #########################################################################################
+FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools-plan
+ARG BUILD_TAG
+ENV BUILD_TAG=$BUILD_TAG
+
+WORKDIR /home/nonroot
+USER nonroot
+
+# Copy entire project to get Cargo.* files with proper dependencies for the whole project
+COPY --chown=nonroot . .
+RUN cargo chef prepare --recipe-path recipe.json
+
 FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
 ARG BUILD_TAG
 ENV BUILD_TAG=$BUILD_TAG

 USER nonroot
+
+COPY --from=compute-tools-plan /home/nonroot/recipe.json recipe.json
+RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \
+    --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/git \
+    --mount=type=cache,uid=1000,target=/home/nonroot/target \
+    mold -run cargo chef cook --locked --profile release-line-debug-size-lto --recipe-path recipe.json
+
 # Copy entire project to get Cargo.* files with proper dependencies for the whole project
 COPY --chown=nonroot . .
 RUN --mount=type=cache,uid=1000,target=/home/nonroot/.cargo/registry \
--- a/compute/etc/pgbouncer.ini
+++ b/compute/etc/pgbouncer.ini
@@ -21,6 +21,8 @@ unix_socket_dir=/tmp/
 unix_socket_mode=0777
 ; required for pgbouncer_exporter
 ignore_startup_parameters=extra_float_digits
+; pidfile for graceful termination
+pidfile=/tmp/pgbouncer.pid

 ;; Disable connection logging. It produces a lot of logs that no one looks at,
 ;; and we can get similar log entries from the proxy too. We had incidents in
--- a/compute/manifest.schema.json
+++ b/compute/manifest.schema.json
@@ -0,0 +1,209 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "title": "Neon Compute Manifest Schema",
+  "description": "Schema for Neon compute node configuration manifest",
+  "type": "object",
+  "properties": {
+    "pg_settings": {
+      "type": "object",
+      "properties": {
+        "common": {
+          "type": "object",
+          "properties": {
+            "client_connection_check_interval": {
+              "type": "string",
+              "description": "Check for client disconnection interval in milliseconds"
+            },
+            "effective_io_concurrency": {
+              "type": "string",
+              "description": "Effective IO concurrency setting"
+            },
+            "fsync": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether to force fsync to disk"
+            },
+            "hot_standby": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether hot standby is enabled"
+            },
+            "idle_in_transaction_session_timeout": {
+              "type": "string",
+              "description": "Timeout for idle transactions in milliseconds"
+            },
+            "listen_addresses": {
+              "type": "string",
+              "description": "Addresses to listen on"
+            },
+            "log_connections": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether to log connections"
+            },
+            "log_disconnections": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether to log disconnections"
+            },
+            "log_temp_files": {
+              "type": "string",
+              "description": "Size threshold for logging temporary files in KB"
+            },
+            "log_error_verbosity": {
+              "type": "string",
+              "enum": ["terse", "verbose", "default"],
+              "description": "Error logging verbosity level"
+            },
+            "log_min_error_statement": {
+              "type": "string",
+              "description": "Minimum error level for statement logging"
+            },
+            "maintenance_io_concurrency": {
+              "type": "string",
+              "description": "Maintenance IO concurrency setting"
+            },
+            "max_connections": {
+              "type": "string",
+              "description": "Maximum number of connections"
+            },
+            "max_replication_flush_lag": {
+              "type": "string",
+              "description": "Maximum replication flush lag"
+            },
+            "max_replication_slots": {
+              "type": "string",
+              "description": "Maximum number of replication slots"
+            },
+            "max_replication_write_lag": {
+              "type": "string",
+              "description": "Maximum replication write lag"
+            },
+            "max_wal_senders": {
+              "type": "string",
+              "description": "Maximum number of WAL senders"
+            },
+            "max_wal_size": {
+              "type": "string",
+              "description": "Maximum WAL size"
+            },
+            "neon.unstable_extensions": {
+              "type": "string",
+              "description": "List of unstable extensions"
+            },
+            "neon.protocol_version": {
+              "type": "string",
+              "description": "Neon protocol version"
+            },
+            "password_encryption": {
+              "type": "string",
+              "description": "Password encryption method"
+            },
+            "restart_after_crash": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether to restart after crash"
+            },
+            "superuser_reserved_connections": {
+              "type": "string",
+              "description": "Number of reserved connections for superuser"
+            },
+            "synchronous_standby_names": {
+              "type": "string",
+              "description": "Names of synchronous standby servers"
+            },
+            "wal_keep_size": {
+              "type": "string",
+              "description": "WAL keep size"
+            },
+            "wal_level": {
+              "type": "string",
+              "description": "WAL level"
+            },
+            "wal_log_hints": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether to log hints in WAL"
+            },
+            "wal_sender_timeout": {
+              "type": "string",
+              "description": "WAL sender timeout in milliseconds"
+            }
+          },
+          "required": [
+            "client_connection_check_interval",
+            "effective_io_concurrency",
+            "fsync",
+            "hot_standby",
+            "idle_in_transaction_session_timeout",
+            "listen_addresses",
+            "log_connections",
+            "log_disconnections",
+            "log_temp_files",
+            "log_error_verbosity",
+            "log_min_error_statement",
+            "maintenance_io_concurrency",
+            "max_connections",
+            "max_replication_flush_lag",
+            "max_replication_slots",
+            "max_replication_write_lag",
+            "max_wal_senders",
+            "max_wal_size",
+            "neon.unstable_extensions",
+            "neon.protocol_version",
+            "password_encryption",
+            "restart_after_crash",
+            "superuser_reserved_connections",
+            "synchronous_standby_names",
+            "wal_keep_size",
+            "wal_level",
+            "wal_log_hints",
+            "wal_sender_timeout"
+          ]
+        },
+        "replica": {
+          "type": "object",
+          "properties": {
+            "hot_standby": {
+              "type": "string",
+              "enum": ["on", "off"],
+              "description": "Whether hot standby is enabled for replicas"
+            }
+          },
+          "required": ["hot_standby"]
+        },
+        "per_version": {
+          "type": "object",
+          "patternProperties": {
+            "^1[4-7]$": {
+              "type": "object",
+              "properties": {
+                "common": {
+                  "type": "object",
+                  "properties": {
+                    "io_combine_limit": {
+                      "type": "string",
+                      "description": "IO combine limit"
+                    }
+                  }
+                },
+                "replica": {
+                  "type": "object",
+                  "properties": {
+                    "recovery_prefetch": {
+                      "type": "string",
+                      "enum": ["on", "off"],
+                      "description": "Whether to enable recovery prefetch for PostgreSQL replicas"
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      },
+      "required": ["common", "replica", "per_version"]
+    }
+  },
+  "required": ["pg_settings"]
+} 
--- a/compute/manifest.yaml
+++ b/compute/manifest.yaml
@@ -105,17 +105,17 @@ pg_settings:
        # Neon hot standby ignores pages that are not in the shared_buffers
        recovery_prefetch: "off"
    16:
-      common:
+      common: {}
      replica:
        # prefetching of blocks referenced in WAL doesn't make sense for us
        # Neon hot standby ignores pages that are not in the shared_buffers
        recovery_prefetch: "off"
    15:
-      common:
+      common: {}
      replica:
        # prefetching of blocks referenced in WAL doesn't make sense for us
        # Neon hot standby ignores pages that are not in the shared_buffers
        recovery_prefetch: "off"
    14:
-      common:
-      replica:
+      common: {}
+      replica: {}
--- a/compute/package-lock.json
+++ b/compute/package-lock.json
@@ -0,0 +1,37 @@
+{
+  "name": "neon-compute",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "name": "neon-compute",
+      "dependencies": {
+        "@sourcemeta/jsonschema": "9.3.4"
+      }
+    },
+    "node_modules/@sourcemeta/jsonschema": {
+      "version": "9.3.4",
+      "resolved": "https://registry.npmjs.org/@sourcemeta/jsonschema/-/jsonschema-9.3.4.tgz",
+      "integrity": "sha512-hkujfkZAIGXUs4U//We9faZW8LZ4/H9LqagRYsFSulH/VLcKPNhZyCTGg7AhORuzm27zqENvKpnX4g2FzudYFw==",
+      "cpu": [
+        "x64",
+        "arm64"
+      ],
+      "license": "AGPL-3.0",
+      "os": [
+        "darwin",
+        "linux",
+        "win32"
+      ],
+      "bin": {
+        "jsonschema": "cli.js"
+      },
+      "engines": {
+        "node": ">=16"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sourcemeta"
+      }
+    }
+  }
+}
--- a/compute/package.json
+++ b/compute/package.json
@@ -0,0 +1,7 @@
+{
+  "name": "neon-compute",
+  "private": true,
+  "dependencies": {
+    "@sourcemeta/jsonschema": "9.3.4"
+  }
+} 
--- a/compute/patches/onnxruntime.patch
+++ b/compute/patches/onnxruntime.patch
@@ -0,0 +1,15 @@
+diff --git a/cmake/deps.txt b/cmake/deps.txt
+index d213b09034..229de2ebf0 100644
+--- a/cmake/deps.txt
+++ b/cmake/deps.txt
+@@ -22,7 +22,9 @@ dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b3132
+ # it contains changes on top of 3.4.0 which are required to fix build issues.
+ # Until the 3.4.1 release this is the best option we have.
+ # Issue link: https://gitlab.com/libeigen/eigen/-/issues/2744
+-eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;be8be39fdbc6e60e94fa7870b280707069b5b81a
+# Moved to github mirror to avoid gitlab issues.Add commentMore actions
+# Issue link: https://github.com/bazelbuild/bazel-central-registry/issues/4355
+eigen;https://github.com/eigen-mirror/eigen/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;61418a349000ba7744a3ad03cf5071f22ebf860a
+ flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip;59422c3b5e573dd192fead2834d25951f1c1670c
+ fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494
+ fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -124,6 +124,10 @@ struct Cli {
    /// Interval in seconds for collecting installed extensions statistics
    #[arg(long, default_value = "3600")]
    pub installed_extensions_collection_interval: u64,
+
+    /// Run in development mode, skipping VM-specific operations like process termination
+    #[arg(long, action = clap::ArgAction::SetTrue)]
+    pub dev: bool,
 }

 impl Cli {
@@ -159,7 +163,7 @@ fn main() -> Result<()> {
        .build()?;
    let _rt_guard = runtime.enter();

-    runtime.block_on(init())?;
+    runtime.block_on(init(cli.dev))?;

    // enable core dumping for all child processes
    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
@@ -198,13 +202,13 @@ fn main() -> Result<()> {
    deinit_and_exit(exit_code);
 }

-async fn init() -> Result<()> {
+async fn init(dev_mode: bool) -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
    thread::spawn(move || {
        for sig in signals.forever() {
-            handle_exit_signal(sig);
+            handle_exit_signal(sig, dev_mode);
        }
    });

@@ -263,9 +267,9 @@ fn deinit_and_exit(exit_code: Option<i32>) -> ! {
 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
 /// to prevent leakage. TODO: it is better to convert compute_ctl to async and
 /// wait for termination which would be easy then.
-fn handle_exit_signal(sig: i32) {
+fn handle_exit_signal(sig: i32, dev_mode: bool) {
    info!("received {sig} termination signal");
-    forward_termination_signal();
+    forward_termination_signal(dev_mode);
    exit(1);
 }

--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -35,6 +35,7 @@ use url::Url;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 use utils::measured_stream::MeasuredReader;
+use utils::pid_file;

 use crate::configurator::launch_configurator;
 use crate::disk_quota::set_disk_quota;
@@ -44,6 +45,7 @@ use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
 use crate::metrics::COMPUTE_CTL_UP;
 use crate::monitor::launch_monitor;
 use crate::pg_helpers::*;
+use crate::pgbouncer::*;
 use crate::rsyslog::{
    PostgresLogsRsyslogConfig, configure_audit_rsyslog, configure_postgres_logs_export,
    launch_pgaudit_gc,
@@ -161,6 +163,10 @@ pub struct ComputeState {
    pub lfc_prewarm_state: LfcPrewarmState,
    pub lfc_offload_state: LfcOffloadState,

+    /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
+    /// mode == ComputeMode::Primary. None otherwise
+    pub terminate_flush_lsn: Option<Lsn>,
+
    pub metrics: ComputeMetrics,
 }

@@ -176,6 +182,7 @@ impl ComputeState {
            metrics: ComputeMetrics::default(),
            lfc_prewarm_state: LfcPrewarmState::default(),
            lfc_offload_state: LfcOffloadState::default(),
+            terminate_flush_lsn: None,
        }
    }

@@ -215,6 +222,46 @@ pub struct ParsedSpec {
    pub endpoint_storage_token: Option<String>,
 }

+impl ParsedSpec {
+    pub fn validate(&self) -> Result<(), String> {
+        // Only Primary nodes are using safekeeper_connstrings, and at the moment
+        // this method only validates that part of the specs.
+        if self.spec.mode != ComputeMode::Primary {
+            return Ok(());
+        }
+
+        // While it seems like a good idea to check for an odd number of entries in
+        // the safekeepers connection string, changes to the list of safekeepers might
+        // incur appending a new server to a list of 3, in which case a list of 4
+        // entries is okay in production.
+        //
+        // Still we want unique entries, and at least one entry in the vector
+        if self.safekeeper_connstrings.is_empty() {
+            return Err(String::from("safekeeper_connstrings is empty"));
+        }
+
+        // check for uniqueness of the connection strings in the set
+        let mut connstrings = self.safekeeper_connstrings.clone();
+
+        connstrings.sort();
+        let mut previous = &connstrings[0];
+
+        for current in connstrings.iter().skip(1) {
+            // duplicate entry?
+            if current == previous {
+                return Err(format!(
+                    "duplicate entry in safekeeper_connstrings: {}!",
+                    current,
+                ));
+            }
+
+            previous = current;
+        }
+
+        Ok(())
+    }
+}
+
 impl TryFrom<ComputeSpec> for ParsedSpec {
    type Error = String;
    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
@@ -244,6 +291,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        } else {
            spec.safekeeper_connstrings.clone()
        };
+
        let storage_auth_token = spec.storage_auth_token.clone();
        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
            tenant_id
@@ -278,7 +326,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
            .clone()
            .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_token"));

-        Ok(ParsedSpec {
+        let res = ParsedSpec {
            spec,
            pageserver_connstr,
            safekeeper_connstrings,
@@ -287,7 +335,11 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
            timeline_id,
            endpoint_storage_addr,
            endpoint_storage_token,
-        })
+        };
+
+        // Now check validity of the parsed specification
+        res.validate()?;
+        Ok(res)
    }
 }

@@ -354,11 +406,6 @@ impl ComputeNode {
        // that can affect `compute_ctl` and prevent it from properly configuring the database schema.
        // Unset them via connection string options before connecting to the database.
        // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
-        //
-        // TODO(ololobus): we currently pass `-c default_transaction_read_only=off` from control plane
-        // as well. After rolling out this code, we can remove this parameter from control plane.
-        // In the meantime, double-passing is fine, the last value is applied.
-        // See: <https://github.com/neondatabase/cloud/blob/133dd8c4dbbba40edfbad475bf6a45073ca63faf/goapp/controlplane/internal/pkg/compute/provisioner/provisioner_common.go#L70>
        const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
        let options = match conn_conf.get_options() {
            Some(options) => format!("{} {}", options, EXTRA_OPTIONS),
@@ -489,12 +536,21 @@ impl ComputeNode {
        // Reap the postgres process
        delay_exit |= this.cleanup_after_postgres_exit()?;

+        // /terminate returns LSN. If we don't sleep at all, connection will break and we
+        // won't get result. If we sleep too much, tests will take significantly longer
+        // and Github Action run will error out
+        let sleep_duration = if delay_exit {
+            Duration::from_secs(30)
+        } else {
+            Duration::from_millis(300)
+        };
+
        // If launch failed, keep serving HTTP requests for a while, so the cloud
        // control plane can get the actual error.
        if delay_exit {
            info!("giving control plane 30s to collect the error before shutdown");
-            std::thread::sleep(Duration::from_secs(30));
        }
+        std::thread::sleep(sleep_duration);
        Ok(exit_code)
    }

@@ -866,20 +922,25 @@ impl ComputeNode {
        // Maybe sync safekeepers again, to speed up next startup
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-        if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
+        let lsn = if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
            info!("syncing safekeepers on shutdown");
            let storage_auth_token = pspec.storage_auth_token.clone();
            let lsn = self.sync_safekeepers(storage_auth_token)?;
-            info!("synced safekeepers at lsn {lsn}");
-        }
+            info!(%lsn, "synced safekeepers");
+            Some(lsn)
+        } else {
+            info!("not primary, not syncing safekeepers");
+            None
+        };

        let mut delay_exit = false;
        let mut state = self.state.lock().unwrap();
-        if state.status == ComputeStatus::TerminationPending {
+        state.terminate_flush_lsn = lsn;
+        if let ComputeStatus::TerminationPending { mode } = state.status {
            state.status = ComputeStatus::Terminated;
            self.state_changed.notify_all();
            // we were asked to terminate gracefully, don't exit to avoid restart
-            delay_exit = true
+            delay_exit = mode == compute_api::responses::TerminateMode::Fast
        }
        drop(state);

@@ -1750,7 +1811,7 @@ impl ComputeNode {

                            // exit loop
                            ComputeStatus::Failed
-                            | ComputeStatus::TerminationPending
+                            | ComputeStatus::TerminationPending { .. }
                            | ComputeStatus::Terminated => break 'cert_update,

                            // wait
@@ -2251,12 +2312,68 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
    Ok(())
 }

-pub fn forward_termination_signal() {
+pub fn forward_termination_signal(dev_mode: bool) {
    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
    if ss_pid != 0 {
        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
        kill(ss_pid, Signal::SIGTERM).ok();
    }
+
+    if !dev_mode {
+        //  Terminate pgbouncer with SIGKILL
+        match pid_file::read(PGBOUNCER_PIDFILE.into()) {
+            Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => {
+                info!("sending SIGKILL to pgbouncer process pid: {}", pid);
+                if let Err(e) = kill(pid, Signal::SIGKILL) {
+                    error!("failed to terminate pgbouncer: {}", e);
+                }
+            }
+            // pgbouncer does not lock the pid file, so we read and kill the process directly
+            Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => {
+                if let Ok(pid_str) = std::fs::read_to_string(PGBOUNCER_PIDFILE) {
+                    if let Ok(pid) = pid_str.trim().parse::<i32>() {
+                        info!(
+                            "sending SIGKILL to pgbouncer process pid: {} (from unlocked pid file)",
+                            pid
+                        );
+                        if let Err(e) = kill(Pid::from_raw(pid), Signal::SIGKILL) {
+                            error!("failed to terminate pgbouncer: {}", e);
+                        }
+                    }
+                } else {
+                    info!("pgbouncer pid file exists but process not running");
+                }
+            }
+            Ok(pid_file::PidFileRead::NotExist) => {
+                info!("pgbouncer pid file not found, process may not be running");
+            }
+            Err(e) => {
+                error!("error reading pgbouncer pid file: {}", e);
+            }
+        }
+
+        // Terminate local_proxy
+        match pid_file::read("/etc/local_proxy/pid".into()) {
+            Ok(pid_file::PidFileRead::LockedByOtherProcess(pid)) => {
+                info!("sending SIGTERM to local_proxy process pid: {}", pid);
+                if let Err(e) = kill(pid, Signal::SIGTERM) {
+                    error!("failed to terminate local_proxy: {}", e);
+                }
+            }
+            Ok(pid_file::PidFileRead::NotHeldByAnyProcess(_)) => {
+                info!("local_proxy PID file exists but process not running");
+            }
+            Ok(pid_file::PidFileRead::NotExist) => {
+                info!("local_proxy PID file not found, process may not be running");
+            }
+            Err(e) => {
+                error!("error reading local_proxy PID file: {}", e);
+            }
+        }
+    } else {
+        info!("Skipping pgbouncer and local_proxy termination because in dev mode");
+    }
+
    let pg_pid = PG_PID.load(Ordering::SeqCst);
    if pg_pid != 0 {
        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
@@ -2289,3 +2406,21 @@ impl<T: 'static> JoinSetExt<T> for tokio::task::JoinSet<T> {
        })
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::fs::File;
+
+    use super::*;
+
+    #[test]
+    fn duplicate_safekeeper_connstring() {
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+
+        match ParsedSpec::try_from(spec.clone()) {
+            Ok(_p) => panic!("Failed to detect duplicate entry"),
+            Err(e) => assert!(e.starts_with("duplicate entry in safekeeper_connstrings:")),
+        };
+    }
+}
--- a/compute_tools/src/http/routes/terminate.rs
+++ b/compute_tools/src/http/routes/terminate.rs
@@ -1,32 +1,42 @@
-use std::sync::Arc;
-
+use crate::compute::{ComputeNode, forward_termination_signal};
+use crate::http::JsonResponse;
 use axum::extract::State;
-use axum::response::{IntoResponse, Response};
-use compute_api::responses::ComputeStatus;
+use axum::response::Response;
+use axum_extra::extract::OptionalQuery;
+use compute_api::responses::{ComputeStatus, TerminateResponse};
 use http::StatusCode;
+use serde::Deserialize;
+use std::sync::Arc;
 use tokio::task;
 use tracing::info;

-use crate::compute::{ComputeNode, forward_termination_signal};
-use crate::http::JsonResponse;
+#[derive(Deserialize, Default)]
+pub struct TerminateQuery {
+    mode: compute_api::responses::TerminateMode,
+}

 /// Terminate the compute.
-pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>) -> Response {
+pub(in crate::http) async fn terminate(
+    State(compute): State<Arc<ComputeNode>>,
+    OptionalQuery(terminate): OptionalQuery<TerminateQuery>,
+) -> Response {
+    let mode = terminate.unwrap_or_default().mode;
    {
        let mut state = compute.state.lock().unwrap();
        if state.status == ComputeStatus::Terminated {
-            return StatusCode::CREATED.into_response();
+            return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn);
        }

        if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
            return JsonResponse::invalid_status(state.status);
        }
-
-        state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
-        drop(state);
+        state.set_status(
+            ComputeStatus::TerminationPending { mode },
+            &compute.state_changed,
+        );
    }

-    forward_termination_signal();
+    forward_termination_signal(false);
    info!("sent signal and notified waiters");

    // Spawn a blocking thread to wait for compute to become Terminated.
@@ -34,7 +44,7 @@ pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>)
    // be able to serve other requests while some particular request
    // is waiting for compute to finish configuration.
    let c = compute.clone();
-    task::spawn_blocking(move || {
+    let lsn = task::spawn_blocking(move || {
        let mut state = c.state.lock().unwrap();
        while state.status != ComputeStatus::Terminated {
            state = c.state_changed.wait(state).unwrap();
@@ -44,11 +54,10 @@ pub(in crate::http) async fn terminate(State(compute): State<Arc<ComputeNode>>)
                state.status
            );
        }
+        state.terminate_flush_lsn
    })
    .await
    .unwrap();
-
    info!("terminated Postgres");
-
-    StatusCode::OK.into_response()
+    JsonResponse::success(StatusCode::OK, TerminateResponse { lsn })
 }
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -22,6 +22,7 @@ mod migration;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
+pub mod pgbouncer;
 pub mod rsyslog;
 pub mod spec;
 mod spec_apply;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -83,7 +83,9 @@ impl ComputeMonitor {
        let compute_status = self.compute.get_status();
        if matches!(
            compute_status,
-            ComputeStatus::Terminated | ComputeStatus::TerminationPending | ComputeStatus::Failed
+            ComputeStatus::Terminated
+                | ComputeStatus::TerminationPending { .. }
+                | ComputeStatus::Failed
        ) {
            info!(
                "compute is in {} status, stopping compute monitor",
--- a/compute_tools/src/pgbouncer.rs
+++ b/compute_tools/src/pgbouncer.rs
@@ -0,0 +1 @@
+pub const PGBOUNCER_PIDFILE: &str = "/tmp/pgbouncer.pid";
--- a/compute_tools/tests/README.md
+++ b/compute_tools/tests/README.md
@@ -0,0 +1,6 @@
+### Test files
+
+The file `cluster_spec.json` has been copied over from libs/compute_api
+tests, with some edits:
+
+  - the neon.safekeepers setting contains a duplicate value
--- a/compute_tools/tests/cluster_spec.json
+++ b/compute_tools/tests/cluster_spec.json
@@ -0,0 +1,245 @@
+{
+  "format_version": 1.0,
+
+  "timestamp": "2021-05-23T18:25:43.511Z",
+  "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
+
+  "cluster": {
+    "cluster_id": "test-cluster-42",
+    "name": "Zenith Test",
+    "state": "restarted",
+    "roles": [
+      {
+        "name": "postgres",
+        "encrypted_password": "6b1d16b78004bbd51fa06af9eda75972",
+        "options": null
+      },
+      {
+        "name": "alexk",
+        "encrypted_password": null,
+        "options": null
+      },
+      {
+        "name": "zenith \"new\"",
+        "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972",
+        "options": null
+      },
+      {
+        "name": "zen",
+        "encrypted_password": "9b1d16b78004bbd51fa06af9eda75972"
+      },
+      {
+        "name": "\"name\";\\n select 1;",
+        "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972"
+      },
+      {
+        "name": "MyRole",
+        "encrypted_password": "5b1d16b78004bbd51fa06af9eda75972"
+      }
+    ],
+    "databases": [
+      {
+        "name": "DB2",
+        "owner": "alexk",
+        "options": [
+          {
+            "name": "LC_COLLATE",
+            "value": "C",
+            "vartype": "string"
+          },
+          {
+            "name": "LC_CTYPE",
+            "value": "C",
+            "vartype": "string"
+          },
+          {
+            "name": "TEMPLATE",
+            "value": "template0",
+            "vartype": "enum"
+          }
+        ]
+      },
+      {
+        "name": "zenith",
+        "owner": "MyRole"
+      },
+      {
+        "name": "zen",
+        "owner": "zen"
+      }
+    ],
+    "settings": [
+      {
+        "name": "fsync",
+        "value": "off",
+        "vartype": "bool"
+      },
+      {
+        "name": "wal_level",
+        "value": "logical",
+        "vartype": "enum"
+      },
+      {
+        "name": "hot_standby",
+        "value": "on",
+        "vartype": "bool"
+      },
+      {
+        "name": "prewarm_lfc_on_startup",
+        "value": "off",
+        "vartype": "bool"
+      },
+      {
+        "name": "neon.safekeepers",
+        "value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501,127.0.0.1:6502",
+        "vartype": "string"
+      },
+      {
+        "name": "wal_log_hints",
+        "value": "on",
+        "vartype": "bool"
+      },
+      {
+        "name": "log_connections",
+        "value": "on",
+        "vartype": "bool"
+      },
+      {
+        "name": "shared_buffers",
+        "value": "32768",
+        "vartype": "integer"
+      },
+      {
+        "name": "port",
+        "value": "55432",
+        "vartype": "integer"
+      },
+      {
+        "name": "max_connections",
+        "value": "100",
+        "vartype": "integer"
+      },
+      {
+        "name": "max_wal_senders",
+        "value": "10",
+        "vartype": "integer"
+      },
+      {
+        "name": "listen_addresses",
+        "value": "0.0.0.0",
+        "vartype": "string"
+      },
+      {
+        "name": "wal_sender_timeout",
+        "value": "0",
+        "vartype": "integer"
+      },
+      {
+        "name": "password_encryption",
+        "value": "md5",
+        "vartype": "enum"
+      },
+      {
+        "name": "maintenance_work_mem",
+        "value": "65536",
+        "vartype": "integer"
+      },
+      {
+        "name": "max_parallel_workers",
+        "value": "8",
+        "vartype": "integer"
+      },
+      {
+        "name": "max_worker_processes",
+        "value": "8",
+        "vartype": "integer"
+      },
+      {
+        "name": "neon.tenant_id",
+        "value": "b0554b632bd4d547a63b86c3630317e8",
+        "vartype": "string"
+      },
+      {
+        "name": "max_replication_slots",
+        "value": "10",
+        "vartype": "integer"
+      },
+      {
+        "name": "neon.timeline_id",
+        "value": "2414a61ffc94e428f14b5758fe308e13",
+        "vartype": "string"
+      },
+      {
+        "name": "shared_preload_libraries",
+        "value": "neon",
+        "vartype": "string"
+      },
+      {
+        "name": "synchronous_standby_names",
+        "value": "walproposer",
+        "vartype": "string"
+      },
+      {
+        "name": "neon.pageserver_connstring",
+        "value": "host=127.0.0.1 port=6400",
+        "vartype": "string"
+      },
+      {
+        "name": "test.escaping",
+        "value": "here's a backslash \\ and a quote ' and a double-quote \" hooray",
+        "vartype": "string"
+      }
+    ]
+  },
+  "delta_operations": [
+    {
+      "action": "delete_db",
+      "name": "zenith_test"
+    },
+    {
+      "action": "rename_db",
+      "name": "DB",
+      "new_name": "DB2"
+    },
+    {
+      "action": "delete_role",
+      "name": "zenith2"
+    },
+    {
+      "action": "rename_role",
+      "name": "zenith new",
+      "new_name": "zenith \"new\""
+    }
+  ],
+  "remote_extensions": {
+    "library_index": {
+      "postgis-3": "postgis",
+      "libpgrouting-3.4": "postgis",
+      "postgis_raster-3": "postgis",
+      "postgis_sfcgal-3": "postgis",
+      "postgis_topology-3": "postgis",
+      "address_standardizer-3": "postgis"
+    },
+    "extension_data": {
+      "postgis": {
+        "archive_path": "5834329303/v15/extensions/postgis.tar.zst",
+        "control_data": {
+          "postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n",
+          "pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n",
+          "postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n",
+          "postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n",
+          "postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n",
+          "address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n",
+          "postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n",
+          "address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n"
+        }
+      }
+    },
+    "custom_extensions": [],
+    "public_extensions": ["postgis"]
+  },
+  "pgbouncer_settings": {
+    "default_pool_size": "42",
+    "pool_mode": "session"
+  }
+}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -18,7 +18,7 @@ use clap::Parser;
 use compute_api::requests::ComputeClaimsScope;
 use compute_api::spec::ComputeMode;
 use control_plane::broker::StorageBroker;
-use control_plane::endpoint::ComputeControlPlane;
+use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode, PageserverProtocol};
 use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
 use control_plane::local_env;
 use control_plane::local_env::{
@@ -605,6 +605,14 @@ struct EndpointCreateCmdArgs {
    #[clap(long, help = "Postgres version")]
    pg_version: u32,

+    /// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
+    ///
+    /// Specified on creation such that it's retained across reconfiguration and restarts.
+    ///
+    /// NB: not yet supported by computes.
+    #[clap(long)]
+    grpc: bool,
+
    #[clap(
        long,
        help = "If set, the node will be a hot replica on the specified timeline",
@@ -664,6 +672,13 @@ struct EndpointStartCmdArgs {
    #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
    #[arg(default_value = "90s")]
    start_timeout: Duration,
+
+    #[clap(
+        long,
+        help = "Run in development mode, skipping VM-specific operations like process termination",
+        action = clap::ArgAction::SetTrue
+    )]
+    dev: bool,
 }

 #[derive(clap::Args)]
@@ -696,10 +711,9 @@ struct EndpointStopCmdArgs {
    )]
    destroy: bool,

-    #[clap(long, help = "Postgres shutdown mode, passed to \"pg_ctl -m <mode>\"")]
-    #[arg(value_parser(["smart", "fast", "immediate"]))]
-    #[arg(default_value = "fast")]
-    mode: String,
+    #[clap(long, help = "Postgres shutdown mode")]
+    #[clap(default_value = "fast")]
+    mode: EndpointTerminateMode,
 }

 #[derive(clap::Args)]
@@ -1451,6 +1465,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                args.internal_http_port,
                args.pg_version,
                mode,
+                args.grpc,
                !args.update_catalog,
                false,
            )?;
@@ -1491,13 +1506,20 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res

            let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
                let conf = env.get_pageserver_conf(pageserver_id).unwrap();
-                let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config");
-                (
-                    vec![(parsed.0, parsed.1.unwrap_or(5432))],
-                    // If caller is telling us what pageserver to use, this is not a tenant which is
-                    // full managed by storage controller, therefore not sharded.
-                    DEFAULT_STRIPE_SIZE,
-                )
+                // Use gRPC if requested.
+                let pageserver = if endpoint.grpc {
+                    let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    (PageserverProtocol::Grpc, host, port)
+                } else {
+                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
+                    let port = port.unwrap_or(5432);
+                    (PageserverProtocol::Libpq, host, port)
+                };
+                // If caller is telling us what pageserver to use, this is not a tenant which is
+                // fully managed by storage controller, therefore not sharded.
+                (vec![pageserver], DEFAULT_STRIPE_SIZE)
            } else {
                // Look up the currently attached location of the tenant, and its striping metadata,
                // to pass these on to postgres.
@@ -1516,11 +1538,20 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                                .await?;
                        }

-                        anyhow::Ok((
-                            Host::parse(&shard.listen_pg_addr)
-                                .expect("Storage controller reported bad hostname"),
-                            shard.listen_pg_port,
-                        ))
+                        let pageserver = if endpoint.grpc {
+                            (
+                                PageserverProtocol::Grpc,
+                                Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
+                                shard.listen_grpc_port.expect("no gRPC port"),
+                            )
+                        } else {
+                            (
+                                PageserverProtocol::Libpq,
+                                Host::parse(&shard.listen_pg_addr)?,
+                                shard.listen_pg_port,
+                            )
+                        };
+                        anyhow::Ok(pageserver)
                    }),
                )
                .await?;
@@ -1565,6 +1596,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    stripe_size.0 as usize,
                    args.create_test_user,
                    args.start_timeout,
+                    args.dev,
                )
                .await?;
        }
@@ -1575,11 +1607,19 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .get(endpoint_id.as_str())
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
            let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
-                let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
-                vec![(
-                    pageserver.pg_connection_config.host().clone(),
-                    pageserver.pg_connection_config.port(),
-                )]
+                let conf = env.get_pageserver_conf(ps_id)?;
+                // Use gRPC if requested.
+                let pageserver = if endpoint.grpc {
+                    let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    (PageserverProtocol::Grpc, host, port)
+                } else {
+                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
+                    let port = port.unwrap_or(5432);
+                    (PageserverProtocol::Libpq, host, port)
+                };
+                vec![pageserver]
            } else {
                let storage_controller = StorageController::from_env(env);
                storage_controller
@@ -1588,11 +1628,21 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    .shards
                    .into_iter()
                    .map(|shard| {
-                        (
-                            Host::parse(&shard.listen_pg_addr)
-                                .expect("Storage controller reported malformed host"),
-                            shard.listen_pg_port,
-                        )
+                        // Use gRPC if requested.
+                        if endpoint.grpc {
+                            (
+                                PageserverProtocol::Grpc,
+                                Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
+                                    .expect("bad hostname"),
+                                shard.listen_grpc_port.expect("no gRPC port"),
+                            )
+                        } else {
+                            (
+                                PageserverProtocol::Libpq,
+                                Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
+                                shard.listen_pg_port,
+                            )
+                        }
                    })
                    .collect::<Vec<_>>()
            };
@@ -1607,7 +1657,10 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .endpoints
                .get(endpoint_id)
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            endpoint.stop(&args.mode, args.destroy)?;
+            match endpoint.stop(args.mode, args.destroy).await?.lsn {
+                Some(lsn) => println!("{lsn}"),
+                None => println!("null"),
+            }
        }
        EndpointCmd::GenerateJwt(args) => {
            let endpoint = {
@@ -2039,11 +2092,16 @@ async fn handle_stop_all(args: &StopCmdArgs, env: &local_env::LocalEnv) -> Resul
 }

 async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
+    let mode = if immediate {
+        EndpointTerminateMode::Immediate
+    } else {
+        EndpointTerminateMode::Fast
+    };
    // Stop all endpoints
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
            for (_k, node) in cplane.endpoints {
-                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
+                if let Err(e) = node.stop(mode, false).await {
                    eprintln!("postgres stop failed: {e:#}");
                }
            }
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -37,6 +37,7 @@
 //! ```
 //!
 use std::collections::BTreeMap;
+use std::fmt::Display;
 use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
 use std::path::PathBuf;
 use std::process::Command;
@@ -45,11 +46,14 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};

 use anyhow::{Context, Result, anyhow, bail};
+use base64::Engine;
+use base64::prelude::BASE64_URL_SAFE_NO_PAD;
 use compute_api::requests::{
    COMPUTE_AUDIENCE, ComputeClaims, ComputeClaimsScope, ConfigurationRequest,
 };
 use compute_api::responses::{
-    ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TlsConfig,
+    ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TerminateResponse,
+    TlsConfig,
 };
 use compute_api::spec::{
    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
@@ -74,7 +78,6 @@ use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;
-use crate::storage_controller::StorageController;

 // contents of a endpoint.json file
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
@@ -87,6 +90,7 @@ pub struct EndpointConf {
    external_http_port: u16,
    internal_http_port: u16,
    pg_version: u32,
+    grpc: bool,
    skip_pg_catalog_updates: bool,
    reconfigure_concurrency: usize,
    drop_subscriptions_before_start: bool,
@@ -164,7 +168,7 @@ impl ComputeControlPlane {
                    public_key_use: Some(PublicKeyUse::Signature),
                    key_operations: Some(vec![KeyOperations::Verify]),
                    key_algorithm: Some(KeyAlgorithm::EdDSA),
-                    key_id: Some(base64::encode_config(key_hash, base64::URL_SAFE_NO_PAD)),
+                    key_id: Some(BASE64_URL_SAFE_NO_PAD.encode(key_hash)),
                    x509_url: None::<String>,
                    x509_chain: None::<Vec<String>>,
                    x509_sha1_fingerprint: None::<String>,
@@ -173,7 +177,7 @@ impl ComputeControlPlane {
                algorithm: AlgorithmParameters::OctetKeyPair(OctetKeyPairParameters {
                    key_type: OctetKeyPairType::OctetKeyPair,
                    curve: EllipticCurve::Ed25519,
-                    x: base64::encode_config(public_key, base64::URL_SAFE_NO_PAD),
+                    x: BASE64_URL_SAFE_NO_PAD.encode(public_key),
                }),
            }],
        })
@@ -190,6 +194,7 @@ impl ComputeControlPlane {
        internal_http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
+        grpc: bool,
        skip_pg_catalog_updates: bool,
        drop_subscriptions_before_start: bool,
    ) -> Result<Arc<Endpoint>> {
@@ -224,6 +229,7 @@ impl ComputeControlPlane {
            // we also skip catalog updates in the cloud.
            skip_pg_catalog_updates,
            drop_subscriptions_before_start,
+            grpc,
            reconfigure_concurrency: 1,
            features: vec![],
            cluster: None,
@@ -242,6 +248,7 @@ impl ComputeControlPlane {
                internal_http_port,
                pg_port,
                pg_version,
+                grpc,
                skip_pg_catalog_updates,
                drop_subscriptions_before_start,
                reconfigure_concurrency: 1,
@@ -296,6 +303,8 @@ pub struct Endpoint {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mode: ComputeMode,
+    /// If true, the endpoint should use gRPC to communicate with Pageservers.
+    pub grpc: bool,

    // port and address of the Postgres server and `compute_ctl`'s HTTP APIs
    pub pg_address: SocketAddr,
@@ -331,15 +340,58 @@ pub enum EndpointStatus {
    RunningNoPidfile,
 }

-impl std::fmt::Display for EndpointStatus {
+impl Display for EndpointStatus {
    fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result {
-        let s = match self {
+        writer.write_str(match self {
            Self::Running => "running",
            Self::Stopped => "stopped",
            Self::Crashed => "crashed",
            Self::RunningNoPidfile => "running, no pidfile",
-        };
-        write!(writer, "{}", s)
+        })
+    }
+}
+
+#[derive(Default, Clone, Copy, clap::ValueEnum)]
+pub enum EndpointTerminateMode {
+    #[default]
+    /// Use pg_ctl stop -m fast
+    Fast,
+    /// Use pg_ctl stop -m immediate
+    Immediate,
+    /// Use /terminate?mode=immediate
+    ImmediateTerminate,
+}
+
+impl std::fmt::Display for EndpointTerminateMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(match &self {
+            EndpointTerminateMode::Fast => "fast",
+            EndpointTerminateMode::Immediate => "immediate",
+            EndpointTerminateMode::ImmediateTerminate => "immediate-terminate",
+        })
+    }
+}
+
+/// Protocol used to connect to a Pageserver.
+#[derive(Clone, Copy, Debug)]
+pub enum PageserverProtocol {
+    Libpq,
+    Grpc,
+}
+
+impl PageserverProtocol {
+    /// Returns the URL scheme for the protocol, used in connstrings.
+    pub fn scheme(&self) -> &'static str {
+        match self {
+            Self::Libpq => "postgresql",
+            Self::Grpc => "grpc",
+        }
+    }
+}
+
+impl Display for PageserverProtocol {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.scheme())
    }
 }

@@ -378,6 +430,7 @@ impl Endpoint {
            mode: conf.mode,
            tenant_id: conf.tenant_id,
            pg_version: conf.pg_version,
+            grpc: conf.grpc,
            skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
            reconfigure_concurrency: conf.reconfigure_concurrency,
            drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
@@ -606,10 +659,10 @@ impl Endpoint {
        }
    }

-    fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
+    fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
        pageservers
            .iter()
-            .map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
+            .map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
            .collect::<Vec<_>>()
            .join(",")
    }
@@ -654,11 +707,12 @@ impl Endpoint {
        endpoint_storage_addr: String,
        safekeepers_generation: Option<SafekeeperGeneration>,
        safekeepers: Vec<NodeId>,
-        pageservers: Vec<(Host, u16)>,
+        pageservers: Vec<(PageserverProtocol, Host, u16)>,
        remote_ext_base_url: Option<&String>,
        shard_stripe_size: usize,
        create_test_user: bool,
        start_timeout: Duration,
+        dev: bool,
    ) -> Result<()> {
        if self.status() == EndpointStatus::Running {
            anyhow::bail!("The endpoint is already running");
@@ -829,6 +883,10 @@ impl Endpoint {
            cmd.args(["--remote-ext-base-url", remote_ext_base_url]);
        }

+        if dev {
+            cmd.arg("--dev");
+        }
+
        let child = cmd.spawn()?;
        // set up a scopeguard to kill & wait for the child in case we panic or bail below
        let child = scopeguard::guard(child, |mut child| {
@@ -881,7 +939,7 @@ impl Endpoint {
                        ComputeStatus::Empty
                        | ComputeStatus::ConfigurationPending
                        | ComputeStatus::Configuration
-                        | ComputeStatus::TerminationPending
+                        | ComputeStatus::TerminationPending { .. }
                        | ComputeStatus::Terminated => {
                            bail!("unexpected compute status: {:?}", state.status)
                        }
@@ -939,10 +997,12 @@ impl Endpoint {

    pub async fn reconfigure(
        &self,
-        mut pageservers: Vec<(Host, u16)>,
+        pageservers: Vec<(PageserverProtocol, Host, u16)>,
        stripe_size: Option<ShardStripeSize>,
        safekeepers: Option<Vec<NodeId>>,
    ) -> Result<()> {
+        anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
+
        let (mut spec, compute_ctl_config) = {
            let config_path = self.endpoint_path().join("config.json");
            let file = std::fs::File::open(config_path)?;
@@ -954,25 +1014,7 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

-        // If we weren't given explicit pageservers, query the storage controller
-        if pageservers.is_empty() {
-            let storage_controller = StorageController::from_env(&self.env);
-            let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
-            pageservers = locate_result
-                .shards
-                .into_iter()
-                .map(|shard| {
-                    (
-                        Host::parse(&shard.listen_pg_addr)
-                            .expect("Storage controller reported bad hostname"),
-                        shard.listen_pg_port,
-                    )
-                })
-                .collect::<Vec<_>>();
-        }
-
        let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
-        assert!(!pageserver_connstr.is_empty());
        spec.pageserver_connstring = Some(pageserver_connstr);
        if stripe_size.is_some() {
            spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
@@ -1019,8 +1061,27 @@ impl Endpoint {
        }
    }

-    pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> {
-        self.pg_ctl(&["-m", mode, "stop"], &None)?;
+    pub async fn stop(
+        &self,
+        mode: EndpointTerminateMode,
+        destroy: bool,
+    ) -> Result<TerminateResponse> {
+        // pg_ctl stop is fast but doesn't allow us to collect LSN. /terminate is
+        // slow, and test runs time out. Solution: special mode "immediate-terminate"
+        // which uses /terminate
+        let response = if let EndpointTerminateMode::ImmediateTerminate = mode {
+            let ip = self.external_http_address.ip();
+            let port = self.external_http_address.port();
+            let url = format!("http://{ip}:{port}/terminate?mode=immediate");
+            let token = self.generate_jwt(Some(ComputeClaimsScope::Admin))?;
+            let request = reqwest::Client::new().post(url).bearer_auth(token);
+            let response = request.send().await.context("/terminate")?;
+            let text = response.text().await.context("/terminate result")?;
+            serde_json::from_str(&text).with_context(|| format!("deserializing {text}"))?
+        } else {
+            self.pg_ctl(&["-m", &mode.to_string(), "stop"], &None)?;
+            TerminateResponse { lsn: None }
+        };

        // Also wait for the compute_ctl process to die. It might have some
        // cleanup work to do after postgres stops, like syncing safekeepers,
@@ -1030,7 +1091,7 @@ impl Endpoint {
        // waiting. Sometimes we do *not* want this cleanup: tests intentionally
        // do stop when majority of safekeepers is down, so sync-safekeepers
        // would hang otherwise. This could be a separate flag though.
-        let send_sigterm = destroy || mode == "immediate";
+        let send_sigterm = destroy || !matches!(mode, EndpointTerminateMode::Fast);
        self.wait_for_compute_ctl_to_exit(send_sigterm)?;
        if destroy {
            println!(
@@ -1039,7 +1100,7 @@ impl Endpoint {
            );
            std::fs::remove_dir_all(self.endpoint_path())?;
        }
-        Ok(())
+        Ok(response)
    }

    pub fn connstr(&self, user: &str, db_name: &str) -> String {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -16,6 +16,7 @@ use std::time::Duration;

 use anyhow::{Context, bail};
 use camino::Utf8PathBuf;
+use pageserver_api::config::{DEFAULT_GRPC_LISTEN_PORT, DEFAULT_HTTP_LISTEN_PORT};
 use pageserver_api::models::{self, TenantInfo, TimelineInfo};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
@@ -252,9 +253,10 @@ impl PageServerNode {
        // the storage controller
        let metadata_path = datadir.join("metadata.json");

-        let (_http_host, http_port) =
+        let http_host = "localhost".to_string();
+        let (_, http_port) =
            parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
-        let http_port = http_port.unwrap_or(9898);
+        let http_port = http_port.unwrap_or(DEFAULT_HTTP_LISTEN_PORT);

        let https_port = match self.conf.listen_https_addr.as_ref() {
            Some(https_addr) => {
@@ -265,6 +267,13 @@ impl PageServerNode {
            None => None,
        };

+        let (mut grpc_host, mut grpc_port) = (None, None);
+        if let Some(grpc_addr) = &self.conf.listen_grpc_addr {
+            let (_, port) = parse_host_port(grpc_addr).expect("Unable to parse listen_grpc_addr");
+            grpc_host = Some("localhost".to_string());
+            grpc_port = Some(port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT));
+        }
+
        // Intentionally hand-craft JSON: this acts as an implicit format compat test
        // in case the pageserver-side structure is edited, and reflects the real life
        // situation: the metadata is written by some other script.
@@ -273,7 +282,9 @@ impl PageServerNode {
            serde_json::to_vec(&pageserver_api::config::NodeMetadata {
                postgres_host: "localhost".to_string(),
                postgres_port: self.pg_connection_config.port(),
-                http_host: "localhost".to_string(),
+                grpc_host,
+                grpc_port,
+                http_host,
                http_port,
                https_port,
                other: HashMap::from([(
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -36,6 +36,10 @@ enum Command {
        listen_pg_addr: String,
        #[arg(long)]
        listen_pg_port: u16,
+        #[arg(long)]
+        listen_grpc_addr: Option<String>,
+        #[arg(long)]
+        listen_grpc_port: Option<u16>,

        #[arg(long)]
        listen_http_addr: String,
@@ -418,6 +422,8 @@ async fn main() -> anyhow::Result<()> {
            node_id,
            listen_pg_addr,
            listen_pg_port,
+            listen_grpc_addr,
+            listen_grpc_port,
            listen_http_addr,
            listen_http_port,
            listen_https_port,
@@ -431,6 +437,8 @@ async fn main() -> anyhow::Result<()> {
                        node_id,
                        listen_pg_addr,
                        listen_pg_port,
+                        listen_grpc_addr,
+                        listen_grpc_port,
                        listen_http_addr,
                        listen_http_port,
                        listen_https_port,
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -95,3 +95,4 @@ echo "Start compute node"
     -b /usr/local/bin/postgres                              \
     --compute-id "compute-${RANDOM}"                          \
     --config "${CONFIG_FILE}"
+     --dev
--- a/endpoint_storage/src/main.rs
+++ b/endpoint_storage/src/main.rs
@@ -31,13 +31,12 @@ struct Args {
 }

 #[derive(serde::Deserialize)]
-#[serde(tag = "type")]
 struct Config {
    #[serde(default = "listen")]
    listen: std::net::SocketAddr,
    pemfile: camino::Utf8PathBuf,
    #[serde(flatten)]
-    storage_config: remote_storage::RemoteStorageConfig,
+    storage_kind: remote_storage::TypedRemoteStorageKind,
    #[serde(default = "max_upload_file_limit")]
    max_upload_file_limit: usize,
 }
@@ -70,7 +69,8 @@ async fn main() -> anyhow::Result<()> {
    let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
    info!("listening on {}", listener.local_addr().unwrap());

-    let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?;
+    let storage =
+        remote_storage::GenericRemoteStorage::from_storage_kind(config.storage_kind).await?;
    let cancel = tokio_util::sync::CancellationToken::new();
    if !args.no_s3_check_on_startup {
        app::check_storage_permissions(&storage, cancel.clone()).await?;
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -83,6 +83,16 @@ pub struct ComputeStatusResponse {
    pub error: Option<String>,
 }

+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq, Default)]
+#[serde(rename_all = "snake_case")]
+pub enum TerminateMode {
+    #[default]
+    /// wait 30s till returning from /terminate to allow control plane to get the error
+    Fast,
+    /// return from /terminate immediately as soon as all components are terminated
+    Immediate,
+}
+
 #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
@@ -103,11 +113,16 @@ pub enum ComputeStatus {
    // control-plane to terminate it.
    Failed,
    // Termination requested
-    TerminationPending,
+    TerminationPending { mode: TerminateMode },
    // Terminated Postgres
    Terminated,
 }

+#[derive(Deserialize, Serialize)]
+pub struct TerminateResponse {
+    pub lsn: Option<utils::lsn::Lsn>,
+}
+
 impl Display for ComputeStatus {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
@@ -117,7 +132,7 @@ impl Display for ComputeStatus {
            ComputeStatus::Running => f.write_str("running"),
            ComputeStatus::Configuration => f.write_str("configuration"),
            ComputeStatus::Failed => f.write_str("failed"),
-            ComputeStatus::TerminationPending => f.write_str("termination-pending"),
+            ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"),
            ComputeStatus::Terminated => f.write_str("terminated"),
        }
    }
--- a/libs/desim/src/executor.rs
+++ b/libs/desim/src/executor.rs
@@ -419,13 +419,13 @@ pub fn now() -> u64 {
    with_thread_context(|ctx| ctx.clock.get().unwrap().now())
 }

-pub fn exit(code: i32, msg: String) {
+pub fn exit(code: i32, msg: String) -> ! {
    with_thread_context(|ctx| {
        ctx.allow_panic.store(true, Ordering::SeqCst);
        let mut result = ctx.result.lock();
        *result = (code, msg);
        panic!("exit");
-    });
+    })
 }

 pub(crate) fn get_thread_ctx() -> Arc<ThreadContext> {
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -12,6 +12,7 @@ pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LI
 pub const DEFAULT_GRPC_LISTEN_PORT: u16 = 51051; // storage-broker already uses 50051

 use std::collections::HashMap;
+use std::fmt::Display;
 use std::num::{NonZeroU64, NonZeroUsize};
 use std::str::FromStr;
 use std::time::Duration;
@@ -24,16 +25,17 @@ use utils::logging::LogFormat;
 use crate::models::{ImageCompressionAlgorithm, LsnLease};

 // Certain metadata (e.g. externally-addressable name, AZ) is delivered
-// as a separate structure.  This information is not neeed by the pageserver
+// as a separate structure.  This information is not needed by the pageserver
 // itself, it is only used for registering the pageserver with the control
 // plane and/or storage controller.
-//
 #[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
 pub struct NodeMetadata {
    #[serde(rename = "host")]
    pub postgres_host: String,
    #[serde(rename = "port")]
    pub postgres_port: u16,
+    pub grpc_host: Option<String>,
+    pub grpc_port: Option<u16>,
    pub http_host: String,
    pub http_port: u16,
    pub https_port: Option<u16>,
@@ -44,6 +46,23 @@ pub struct NodeMetadata {
    pub other: HashMap<String, serde_json::Value>,
 }

+impl Display for NodeMetadata {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "postgresql://{}:{} ",
+            self.postgres_host, self.postgres_port
+        )?;
+        if let Some(grpc_host) = &self.grpc_host {
+            let grpc_port = self.grpc_port.unwrap_or_default();
+            write!(f, "grpc://{grpc_host}:{grpc_port} ")?;
+        }
+        write!(f, "http://{}:{} ", self.http_host, self.http_port)?;
+        write!(f, "other:{:?}", self.other)?;
+        Ok(())
+    }
+}
+
 /// PostHog integration config.
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct PostHogConfig {
@@ -337,16 +356,21 @@ pub struct TimelineImportConfig {
 pub struct BasebackupCacheConfig {
    #[serde(with = "humantime_serde")]
    pub cleanup_period: Duration,
-    // FIXME: Support max_size_bytes.
-    // pub max_size_bytes: usize,
-    pub max_size_entries: i64,
+    /// Maximum total size of basebackup cache entries on disk in bytes.
+    /// The cache may slightly exceed this limit because we do not know
+    /// the exact size of the cache entry untill it's written to disk.
+    pub max_total_size_bytes: u64,
+    // TODO(diko): support max_entry_size_bytes.
+    // pub max_entry_size_bytes: u64,
+    pub max_size_entries: usize,
 }

 impl Default for BasebackupCacheConfig {
    fn default() -> Self {
        Self {
            cleanup_period: Duration::from_secs(60),
-            // max_size_bytes: 1024 * 1024 * 1024, // 1 GiB
+            max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB
+            // max_entry_size_bytes: 16 * 1024 * 1024,   // 16 MiB
            max_size_entries: 1000,
        }
    }
--- a/libs/pageserver_api/src/config/tests.rs
+++ b/libs/pageserver_api/src/config/tests.rs
@@ -14,6 +14,8 @@ fn test_node_metadata_v1_backward_compatibilty() {
        NodeMetadata {
            postgres_host: "localhost".to_string(),
            postgres_port: 23,
+            grpc_host: None,
+            grpc_port: None,
            http_host: "localhost".to_string(),
            http_port: 42,
            https_port: None,
@@ -37,6 +39,35 @@ fn test_node_metadata_v2_backward_compatibilty() {
        NodeMetadata {
            postgres_host: "localhost".to_string(),
            postgres_port: 23,
+            grpc_host: None,
+            grpc_port: None,
+            http_host: "localhost".to_string(),
+            http_port: 42,
+            https_port: Some(123),
+            other: HashMap::new(),
+        }
+    )
+}
+
+#[test]
+fn test_node_metadata_v3_backward_compatibilty() {
+    let v3 = serde_json::to_vec(&serde_json::json!({
+        "host": "localhost",
+        "port": 23,
+        "grpc_host": "localhost",
+        "grpc_port": 51,
+        "http_host": "localhost",
+        "http_port": 42,
+        "https_port": 123,
+    }));
+
+    assert_eq!(
+        serde_json::from_slice::<NodeMetadata>(&v3.unwrap()).unwrap(),
+        NodeMetadata {
+            postgres_host: "localhost".to_string(),
+            postgres_port: 23,
+            grpc_host: Some("localhost".to_string()),
+            grpc_port: Some(51),
            http_host: "localhost".to_string(),
            http_port: 42,
            https_port: Some(123),
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -52,6 +52,8 @@ pub struct NodeRegisterRequest {

    pub listen_pg_addr: String,
    pub listen_pg_port: u16,
+    pub listen_grpc_addr: Option<String>,
+    pub listen_grpc_port: Option<u16>,

    pub listen_http_addr: String,
    pub listen_http_port: u16,
@@ -101,6 +103,8 @@ pub struct TenantLocateResponseShard {

    pub listen_pg_addr: String,
    pub listen_pg_port: u16,
+    pub listen_grpc_addr: Option<String>,
+    pub listen_grpc_port: Option<u16>,

    pub listen_http_addr: String,
    pub listen_http_port: u16,
@@ -152,6 +156,8 @@ pub struct NodeDescribeResponse {

    pub listen_pg_addr: String,
    pub listen_pg_port: u16,
+    pub listen_grpc_addr: Option<String>,
+    pub listen_grpc_port: Option<u16>,
 }

 #[derive(Serialize, Deserialize, Debug)]
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -5,6 +5,7 @@ pub mod controller_api;
 pub mod key;
 pub mod keyspace;
 pub mod models;
+pub mod pagestream_api;
 pub mod record;
 pub mod reltag;
 pub mod shard;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -5,16 +5,12 @@ pub mod utilization;
 use core::ops::Range;
 use std::collections::HashMap;
 use std::fmt::Display;
-use std::io::{BufRead, Read};
 use std::num::{NonZeroU32, NonZeroU64, NonZeroUsize};
 use std::str::FromStr;
 use std::time::{Duration, SystemTime};

-use byteorder::{BigEndian, ReadBytesExt};
-use bytes::{Buf, BufMut, Bytes, BytesMut};
 #[cfg(feature = "testing")]
 use camino::Utf8PathBuf;
-use postgres_ffi::BLCKSZ;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use serde_with::serde_as;
 pub use utilization::PageserverUtilization;
@@ -24,7 +20,6 @@ use utils::{completion, serde_system_time};

 use crate::config::Ratio;
 use crate::key::{CompactKey, Key};
-use crate::reltag::RelTag;
 use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};

 /// The state of a tenant in this pageserver.
@@ -1907,219 +1902,6 @@ pub struct ScanDisposableKeysResponse {
    pub not_disposable_count: usize,
 }

-// Wrapped in libpq CopyData
-#[derive(PartialEq, Eq, Debug)]
-pub enum PagestreamFeMessage {
-    Exists(PagestreamExistsRequest),
-    Nblocks(PagestreamNblocksRequest),
-    GetPage(PagestreamGetPageRequest),
-    DbSize(PagestreamDbSizeRequest),
-    GetSlruSegment(PagestreamGetSlruSegmentRequest),
-    #[cfg(feature = "testing")]
-    Test(PagestreamTestRequest),
-}
-
-// Wrapped in libpq CopyData
-#[derive(Debug, strum_macros::EnumProperty)]
-pub enum PagestreamBeMessage {
-    Exists(PagestreamExistsResponse),
-    Nblocks(PagestreamNblocksResponse),
-    GetPage(PagestreamGetPageResponse),
-    Error(PagestreamErrorResponse),
-    DbSize(PagestreamDbSizeResponse),
-    GetSlruSegment(PagestreamGetSlruSegmentResponse),
-    #[cfg(feature = "testing")]
-    Test(PagestreamTestResponse),
-}
-
-// Keep in sync with `pagestore_client.h`
-#[repr(u8)]
-enum PagestreamFeMessageTag {
-    Exists = 0,
-    Nblocks = 1,
-    GetPage = 2,
-    DbSize = 3,
-    GetSlruSegment = 4,
-    /* future tags above this line */
-    /// For testing purposes, not available in production.
-    #[cfg(feature = "testing")]
-    Test = 99,
-}
-
-// Keep in sync with `pagestore_client.h`
-#[repr(u8)]
-enum PagestreamBeMessageTag {
-    Exists = 100,
-    Nblocks = 101,
-    GetPage = 102,
-    Error = 103,
-    DbSize = 104,
-    GetSlruSegment = 105,
-    /* future tags above this line */
-    /// For testing purposes, not available in production.
-    #[cfg(feature = "testing")]
-    Test = 199,
-}
-
-impl TryFrom<u8> for PagestreamFeMessageTag {
-    type Error = u8;
-    fn try_from(value: u8) -> Result<Self, u8> {
-        match value {
-            0 => Ok(PagestreamFeMessageTag::Exists),
-            1 => Ok(PagestreamFeMessageTag::Nblocks),
-            2 => Ok(PagestreamFeMessageTag::GetPage),
-            3 => Ok(PagestreamFeMessageTag::DbSize),
-            4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
-            #[cfg(feature = "testing")]
-            99 => Ok(PagestreamFeMessageTag::Test),
-            _ => Err(value),
-        }
-    }
-}
-
-impl TryFrom<u8> for PagestreamBeMessageTag {
-    type Error = u8;
-    fn try_from(value: u8) -> Result<Self, u8> {
-        match value {
-            100 => Ok(PagestreamBeMessageTag::Exists),
-            101 => Ok(PagestreamBeMessageTag::Nblocks),
-            102 => Ok(PagestreamBeMessageTag::GetPage),
-            103 => Ok(PagestreamBeMessageTag::Error),
-            104 => Ok(PagestreamBeMessageTag::DbSize),
-            105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
-            #[cfg(feature = "testing")]
-            199 => Ok(PagestreamBeMessageTag::Test),
-            _ => Err(value),
-        }
-    }
-}
-
-// A GetPage request contains two LSN values:
-//
-// request_lsn: Get the page version at this point in time.  Lsn::Max is a special value that means
-// "get the latest version present". It's used by the primary server, which knows that no one else
-// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
-// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
-//
-// not_modified_since: Hint to the pageserver that the client knows that the page has not been
-// modified between 'not_modified_since' and the request LSN. It's always correct to set
-// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
-// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
-// request without waiting for 'request_lsn' to arrive.
-//
-// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
-// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
-// 'latest' was set to true. The V2 interface was added because there was no correct way for a
-// standby to request a page at a particular non-latest LSN, and also include the
-// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
-// request, if the standby knows that the page hasn't been modified since, and risk getting an error
-// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
-// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
-// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
-// difference in the responses between V1 and V2.
-//
-// V3 version of protocol adds request ID to all requests. This request ID is also included in response
-// as well as other fields from requests, which allows to verify that we receive response for our request.
-// We copy fields from request to response to make checking more reliable: request ID is formed from process ID
-// and local counter, so in principle there can be duplicated requests IDs if process PID is reused.
-//
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub enum PagestreamProtocolVersion {
-    V2,
-    V3,
-}
-
-pub type RequestId = u64;
-
-#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamRequest {
-    pub reqid: RequestId,
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
-}
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamExistsRequest {
-    pub hdr: PagestreamRequest,
-    pub rel: RelTag,
-}
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamNblocksRequest {
-    pub hdr: PagestreamRequest,
-    pub rel: RelTag,
-}
-
-#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamGetPageRequest {
-    pub hdr: PagestreamRequest,
-    pub rel: RelTag,
-    pub blkno: u32,
-}
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamDbSizeRequest {
-    pub hdr: PagestreamRequest,
-    pub dbnode: u32,
-}
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
-pub struct PagestreamGetSlruSegmentRequest {
-    pub hdr: PagestreamRequest,
-    pub kind: u8,
-    pub segno: u32,
-}
-
-#[derive(Debug)]
-pub struct PagestreamExistsResponse {
-    pub req: PagestreamExistsRequest,
-    pub exists: bool,
-}
-
-#[derive(Debug)]
-pub struct PagestreamNblocksResponse {
-    pub req: PagestreamNblocksRequest,
-    pub n_blocks: u32,
-}
-
-#[derive(Debug)]
-pub struct PagestreamGetPageResponse {
-    pub req: PagestreamGetPageRequest,
-    pub page: Bytes,
-}
-
-#[derive(Debug)]
-pub struct PagestreamGetSlruSegmentResponse {
-    pub req: PagestreamGetSlruSegmentRequest,
-    pub segment: Bytes,
-}
-
-#[derive(Debug)]
-pub struct PagestreamErrorResponse {
-    pub req: PagestreamRequest,
-    pub message: String,
-}
-
-#[derive(Debug)]
-pub struct PagestreamDbSizeResponse {
-    pub req: PagestreamDbSizeRequest,
-    pub db_size: i64,
-}
-
-#[cfg(feature = "testing")]
-#[derive(Debug, PartialEq, Eq, Clone)]
-pub struct PagestreamTestRequest {
-    pub hdr: PagestreamRequest,
-    pub batch_key: u64,
-    pub message: String,
-}
-
-#[cfg(feature = "testing")]
-#[derive(Debug)]
-pub struct PagestreamTestResponse {
-    pub req: PagestreamTestRequest,
-}
-
 // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
 // that require pageserver-internal types.  It is sufficient to get the total size.
 #[derive(Serialize, Deserialize, Debug)]
@@ -2131,506 +1913,6 @@ pub struct TenantHistorySize {
    pub size: Option<u64>,
 }

-impl PagestreamFeMessage {
-    /// Serialize a compute -> pageserver message. This is currently only used in testing
-    /// tools. Always uses protocol version 3.
-    pub fn serialize(&self) -> Bytes {
-        let mut bytes = BytesMut::new();
-
-        match self {
-            Self::Exists(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::Exists as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u32(req.rel.spcnode);
-                bytes.put_u32(req.rel.dbnode);
-                bytes.put_u32(req.rel.relnode);
-                bytes.put_u8(req.rel.forknum);
-            }
-
-            Self::Nblocks(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u32(req.rel.spcnode);
-                bytes.put_u32(req.rel.dbnode);
-                bytes.put_u32(req.rel.relnode);
-                bytes.put_u8(req.rel.forknum);
-            }
-
-            Self::GetPage(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::GetPage as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u32(req.rel.spcnode);
-                bytes.put_u32(req.rel.dbnode);
-                bytes.put_u32(req.rel.relnode);
-                bytes.put_u8(req.rel.forknum);
-                bytes.put_u32(req.blkno);
-            }
-
-            Self::DbSize(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::DbSize as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u32(req.dbnode);
-            }
-
-            Self::GetSlruSegment(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u8(req.kind);
-                bytes.put_u32(req.segno);
-            }
-            #[cfg(feature = "testing")]
-            Self::Test(req) => {
-                bytes.put_u8(PagestreamFeMessageTag::Test as u8);
-                bytes.put_u64(req.hdr.reqid);
-                bytes.put_u64(req.hdr.request_lsn.0);
-                bytes.put_u64(req.hdr.not_modified_since.0);
-                bytes.put_u64(req.batch_key);
-                let message = req.message.as_bytes();
-                bytes.put_u64(message.len() as u64);
-                bytes.put_slice(message);
-            }
-        }
-
-        bytes.into()
-    }
-
-    pub fn parse<R: std::io::Read>(
-        body: &mut R,
-        protocol_version: PagestreamProtocolVersion,
-    ) -> anyhow::Result<PagestreamFeMessage> {
-        // these correspond to the NeonMessageTag enum in pagestore_client.h
-        //
-        // TODO: consider using protobuf or serde bincode for less error prone
-        // serialization.
-        let msg_tag = body.read_u8()?;
-        let (reqid, request_lsn, not_modified_since) = match protocol_version {
-            PagestreamProtocolVersion::V2 => (
-                0,
-                Lsn::from(body.read_u64::<BigEndian>()?),
-                Lsn::from(body.read_u64::<BigEndian>()?),
-            ),
-            PagestreamProtocolVersion::V3 => (
-                body.read_u64::<BigEndian>()?,
-                Lsn::from(body.read_u64::<BigEndian>()?),
-                Lsn::from(body.read_u64::<BigEndian>()?),
-            ),
-        };
-
-        match PagestreamFeMessageTag::try_from(msg_tag)
-            .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?
-        {
-            PagestreamFeMessageTag::Exists => {
-                Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                    hdr: PagestreamRequest {
-                        reqid,
-                        request_lsn,
-                        not_modified_since,
-                    },
-                    rel: RelTag {
-                        spcnode: body.read_u32::<BigEndian>()?,
-                        dbnode: body.read_u32::<BigEndian>()?,
-                        relnode: body.read_u32::<BigEndian>()?,
-                        forknum: body.read_u8()?,
-                    },
-                }))
-            }
-            PagestreamFeMessageTag::Nblocks => {
-                Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                    hdr: PagestreamRequest {
-                        reqid,
-                        request_lsn,
-                        not_modified_since,
-                    },
-                    rel: RelTag {
-                        spcnode: body.read_u32::<BigEndian>()?,
-                        dbnode: body.read_u32::<BigEndian>()?,
-                        relnode: body.read_u32::<BigEndian>()?,
-                        forknum: body.read_u8()?,
-                    },
-                }))
-            }
-            PagestreamFeMessageTag::GetPage => {
-                Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                    hdr: PagestreamRequest {
-                        reqid,
-                        request_lsn,
-                        not_modified_since,
-                    },
-                    rel: RelTag {
-                        spcnode: body.read_u32::<BigEndian>()?,
-                        dbnode: body.read_u32::<BigEndian>()?,
-                        relnode: body.read_u32::<BigEndian>()?,
-                        forknum: body.read_u8()?,
-                    },
-                    blkno: body.read_u32::<BigEndian>()?,
-                }))
-            }
-            PagestreamFeMessageTag::DbSize => {
-                Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                    hdr: PagestreamRequest {
-                        reqid,
-                        request_lsn,
-                        not_modified_since,
-                    },
-                    dbnode: body.read_u32::<BigEndian>()?,
-                }))
-            }
-            PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment(
-                PagestreamGetSlruSegmentRequest {
-                    hdr: PagestreamRequest {
-                        reqid,
-                        request_lsn,
-                        not_modified_since,
-                    },
-                    kind: body.read_u8()?,
-                    segno: body.read_u32::<BigEndian>()?,
-                },
-            )),
-            #[cfg(feature = "testing")]
-            PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
-                hdr: PagestreamRequest {
-                    reqid,
-                    request_lsn,
-                    not_modified_since,
-                },
-                batch_key: body.read_u64::<BigEndian>()?,
-                message: {
-                    let len = body.read_u64::<BigEndian>()?;
-                    let mut buf = vec![0; len as usize];
-                    body.read_exact(&mut buf)?;
-                    String::from_utf8(buf)?
-                },
-            })),
-        }
-    }
-}
-
-impl PagestreamBeMessage {
-    pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes {
-        let mut bytes = BytesMut::new();
-
-        use PagestreamBeMessageTag as Tag;
-        match protocol_version {
-            PagestreamProtocolVersion::V2 => {
-                match self {
-                    Self::Exists(resp) => {
-                        bytes.put_u8(Tag::Exists as u8);
-                        bytes.put_u8(resp.exists as u8);
-                    }
-
-                    Self::Nblocks(resp) => {
-                        bytes.put_u8(Tag::Nblocks as u8);
-                        bytes.put_u32(resp.n_blocks);
-                    }
-
-                    Self::GetPage(resp) => {
-                        bytes.put_u8(Tag::GetPage as u8);
-                        bytes.put(&resp.page[..])
-                    }
-
-                    Self::Error(resp) => {
-                        bytes.put_u8(Tag::Error as u8);
-                        bytes.put(resp.message.as_bytes());
-                        bytes.put_u8(0); // null terminator
-                    }
-                    Self::DbSize(resp) => {
-                        bytes.put_u8(Tag::DbSize as u8);
-                        bytes.put_i64(resp.db_size);
-                    }
-
-                    Self::GetSlruSegment(resp) => {
-                        bytes.put_u8(Tag::GetSlruSegment as u8);
-                        bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
-                        bytes.put(&resp.segment[..]);
-                    }
-
-                    #[cfg(feature = "testing")]
-                    Self::Test(resp) => {
-                        bytes.put_u8(Tag::Test as u8);
-                        bytes.put_u64(resp.req.batch_key);
-                        let message = resp.req.message.as_bytes();
-                        bytes.put_u64(message.len() as u64);
-                        bytes.put_slice(message);
-                    }
-                }
-            }
-            PagestreamProtocolVersion::V3 => {
-                match self {
-                    Self::Exists(resp) => {
-                        bytes.put_u8(Tag::Exists as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.rel.spcnode);
-                        bytes.put_u32(resp.req.rel.dbnode);
-                        bytes.put_u32(resp.req.rel.relnode);
-                        bytes.put_u8(resp.req.rel.forknum);
-                        bytes.put_u8(resp.exists as u8);
-                    }
-
-                    Self::Nblocks(resp) => {
-                        bytes.put_u8(Tag::Nblocks as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.rel.spcnode);
-                        bytes.put_u32(resp.req.rel.dbnode);
-                        bytes.put_u32(resp.req.rel.relnode);
-                        bytes.put_u8(resp.req.rel.forknum);
-                        bytes.put_u32(resp.n_blocks);
-                    }
-
-                    Self::GetPage(resp) => {
-                        bytes.put_u8(Tag::GetPage as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.rel.spcnode);
-                        bytes.put_u32(resp.req.rel.dbnode);
-                        bytes.put_u32(resp.req.rel.relnode);
-                        bytes.put_u8(resp.req.rel.forknum);
-                        bytes.put_u32(resp.req.blkno);
-                        bytes.put(&resp.page[..])
-                    }
-
-                    Self::Error(resp) => {
-                        bytes.put_u8(Tag::Error as u8);
-                        bytes.put_u64(resp.req.reqid);
-                        bytes.put_u64(resp.req.request_lsn.0);
-                        bytes.put_u64(resp.req.not_modified_since.0);
-                        bytes.put(resp.message.as_bytes());
-                        bytes.put_u8(0); // null terminator
-                    }
-                    Self::DbSize(resp) => {
-                        bytes.put_u8(Tag::DbSize as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u32(resp.req.dbnode);
-                        bytes.put_i64(resp.db_size);
-                    }
-
-                    Self::GetSlruSegment(resp) => {
-                        bytes.put_u8(Tag::GetSlruSegment as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u8(resp.req.kind);
-                        bytes.put_u32(resp.req.segno);
-                        bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
-                        bytes.put(&resp.segment[..]);
-                    }
-
-                    #[cfg(feature = "testing")]
-                    Self::Test(resp) => {
-                        bytes.put_u8(Tag::Test as u8);
-                        bytes.put_u64(resp.req.hdr.reqid);
-                        bytes.put_u64(resp.req.hdr.request_lsn.0);
-                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
-                        bytes.put_u64(resp.req.batch_key);
-                        let message = resp.req.message.as_bytes();
-                        bytes.put_u64(message.len() as u64);
-                        bytes.put_slice(message);
-                    }
-                }
-            }
-        }
-        bytes.into()
-    }
-
-    pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
-        let mut buf = buf.reader();
-        let msg_tag = buf.read_u8()?;
-
-        use PagestreamBeMessageTag as Tag;
-        let ok =
-            match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
-                Tag::Exists => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let rel = RelTag {
-                        spcnode: buf.read_u32::<BigEndian>()?,
-                        dbnode: buf.read_u32::<BigEndian>()?,
-                        relnode: buf.read_u32::<BigEndian>()?,
-                        forknum: buf.read_u8()?,
-                    };
-                    let exists = buf.read_u8()? != 0;
-                    Self::Exists(PagestreamExistsResponse {
-                        req: PagestreamExistsRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            rel,
-                        },
-                        exists,
-                    })
-                }
-                Tag::Nblocks => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let rel = RelTag {
-                        spcnode: buf.read_u32::<BigEndian>()?,
-                        dbnode: buf.read_u32::<BigEndian>()?,
-                        relnode: buf.read_u32::<BigEndian>()?,
-                        forknum: buf.read_u8()?,
-                    };
-                    let n_blocks = buf.read_u32::<BigEndian>()?;
-                    Self::Nblocks(PagestreamNblocksResponse {
-                        req: PagestreamNblocksRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            rel,
-                        },
-                        n_blocks,
-                    })
-                }
-                Tag::GetPage => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let rel = RelTag {
-                        spcnode: buf.read_u32::<BigEndian>()?,
-                        dbnode: buf.read_u32::<BigEndian>()?,
-                        relnode: buf.read_u32::<BigEndian>()?,
-                        forknum: buf.read_u8()?,
-                    };
-                    let blkno = buf.read_u32::<BigEndian>()?;
-                    let mut page = vec![0; 8192]; // TODO: use MaybeUninit
-                    buf.read_exact(&mut page)?;
-                    Self::GetPage(PagestreamGetPageResponse {
-                        req: PagestreamGetPageRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            rel,
-                            blkno,
-                        },
-                        page: page.into(),
-                    })
-                }
-                Tag::Error => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let mut msg = Vec::new();
-                    buf.read_until(0, &mut msg)?;
-                    let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
-                    let rust_str = cstring.to_str()?;
-                    Self::Error(PagestreamErrorResponse {
-                        req: PagestreamRequest {
-                            reqid,
-                            request_lsn,
-                            not_modified_since,
-                        },
-                        message: rust_str.to_owned(),
-                    })
-                }
-                Tag::DbSize => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let dbnode = buf.read_u32::<BigEndian>()?;
-                    let db_size = buf.read_i64::<BigEndian>()?;
-                    Self::DbSize(PagestreamDbSizeResponse {
-                        req: PagestreamDbSizeRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            dbnode,
-                        },
-                        db_size,
-                    })
-                }
-                Tag::GetSlruSegment => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let kind = buf.read_u8()?;
-                    let segno = buf.read_u32::<BigEndian>()?;
-                    let n_blocks = buf.read_u32::<BigEndian>()?;
-                    let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
-                    buf.read_exact(&mut segment)?;
-                    Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
-                        req: PagestreamGetSlruSegmentRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            kind,
-                            segno,
-                        },
-                        segment: segment.into(),
-                    })
-                }
-                #[cfg(feature = "testing")]
-                Tag::Test => {
-                    let reqid = buf.read_u64::<BigEndian>()?;
-                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
-                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
-                    let batch_key = buf.read_u64::<BigEndian>()?;
-                    let len = buf.read_u64::<BigEndian>()?;
-                    let mut msg = vec![0; len as usize];
-                    buf.read_exact(&mut msg)?;
-                    let message = String::from_utf8(msg)?;
-                    Self::Test(PagestreamTestResponse {
-                        req: PagestreamTestRequest {
-                            hdr: PagestreamRequest {
-                                reqid,
-                                request_lsn,
-                                not_modified_since,
-                            },
-                            batch_key,
-                            message,
-                        },
-                    })
-                }
-            };
-        let remaining = buf.into_inner();
-        if !remaining.is_empty() {
-            anyhow::bail!(
-                "remaining bytes in msg with tag={msg_tag}: {}",
-                remaining.len()
-            );
-        }
-        Ok(ok)
-    }
-
-    pub fn kind(&self) -> &'static str {
-        match self {
-            Self::Exists(_) => "Exists",
-            Self::Nblocks(_) => "Nblocks",
-            Self::GetPage(_) => "GetPage",
-            Self::Error(_) => "Error",
-            Self::DbSize(_) => "DbSize",
-            Self::GetSlruSegment(_) => "GetSlruSegment",
-            #[cfg(feature = "testing")]
-            Self::Test(_) => "Test",
-        }
-    }
-}
-
 #[derive(Debug, Serialize, Deserialize)]
 pub struct PageTraceEvent {
    pub key: CompactKey,
@@ -2656,68 +1938,6 @@ mod tests {

    use super::*;

-    #[test]
-    fn test_pagestream() {
-        // Test serialization/deserialization of PagestreamFeMessage
-        let messages = vec![
-            PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(3),
-                },
-                rel: RelTag {
-                    forknum: 1,
-                    spcnode: 2,
-                    dbnode: 3,
-                    relnode: 4,
-                },
-            }),
-            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(4),
-                },
-                rel: RelTag {
-                    forknum: 1,
-                    spcnode: 2,
-                    dbnode: 3,
-                    relnode: 4,
-                },
-            }),
-            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(3),
-                },
-                rel: RelTag {
-                    forknum: 1,
-                    spcnode: 2,
-                    dbnode: 3,
-                    relnode: 4,
-                },
-                blkno: 7,
-            }),
-            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(4),
-                    not_modified_since: Lsn(3),
-                },
-                dbnode: 7,
-            }),
-        ];
-        for msg in messages {
-            let bytes = msg.serialize();
-            let reconstructed =
-                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3)
-                    .unwrap();
-            assert!(msg == reconstructed);
-        }
-    }
-
    #[test]
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
--- a/libs/pageserver_api/src/pagestream_api.rs
+++ b/libs/pageserver_api/src/pagestream_api.rs
@@ -0,0 +1,792 @@
+//! Rust definitions of the libpq-based pagestream API
+//!
+//! See also the C implementation of the same API in pgxn/neon/pagestore_client.h
+
+use std::io::{BufRead, Read};
+
+use crate::reltag::RelTag;
+
+use byteorder::{BigEndian, ReadBytesExt};
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use postgres_ffi::BLCKSZ;
+use utils::lsn::Lsn;
+
+// Wrapped in libpq CopyData
+#[derive(PartialEq, Eq, Debug)]
+pub enum PagestreamFeMessage {
+    Exists(PagestreamExistsRequest),
+    Nblocks(PagestreamNblocksRequest),
+    GetPage(PagestreamGetPageRequest),
+    DbSize(PagestreamDbSizeRequest),
+    GetSlruSegment(PagestreamGetSlruSegmentRequest),
+    #[cfg(feature = "testing")]
+    Test(PagestreamTestRequest),
+}
+
+// Wrapped in libpq CopyData
+#[derive(Debug, strum_macros::EnumProperty)]
+pub enum PagestreamBeMessage {
+    Exists(PagestreamExistsResponse),
+    Nblocks(PagestreamNblocksResponse),
+    GetPage(PagestreamGetPageResponse),
+    Error(PagestreamErrorResponse),
+    DbSize(PagestreamDbSizeResponse),
+    GetSlruSegment(PagestreamGetSlruSegmentResponse),
+    #[cfg(feature = "testing")]
+    Test(PagestreamTestResponse),
+}
+
+// Keep in sync with `pagestore_client.h`
+#[repr(u8)]
+enum PagestreamFeMessageTag {
+    Exists = 0,
+    Nblocks = 1,
+    GetPage = 2,
+    DbSize = 3,
+    GetSlruSegment = 4,
+    /* future tags above this line */
+    /// For testing purposes, not available in production.
+    #[cfg(feature = "testing")]
+    Test = 99,
+}
+
+// Keep in sync with `pagestore_client.h`
+#[repr(u8)]
+enum PagestreamBeMessageTag {
+    Exists = 100,
+    Nblocks = 101,
+    GetPage = 102,
+    Error = 103,
+    DbSize = 104,
+    GetSlruSegment = 105,
+    /* future tags above this line */
+    /// For testing purposes, not available in production.
+    #[cfg(feature = "testing")]
+    Test = 199,
+}
+
+impl TryFrom<u8> for PagestreamFeMessageTag {
+    type Error = u8;
+    fn try_from(value: u8) -> Result<Self, u8> {
+        match value {
+            0 => Ok(PagestreamFeMessageTag::Exists),
+            1 => Ok(PagestreamFeMessageTag::Nblocks),
+            2 => Ok(PagestreamFeMessageTag::GetPage),
+            3 => Ok(PagestreamFeMessageTag::DbSize),
+            4 => Ok(PagestreamFeMessageTag::GetSlruSegment),
+            #[cfg(feature = "testing")]
+            99 => Ok(PagestreamFeMessageTag::Test),
+            _ => Err(value),
+        }
+    }
+}
+
+impl TryFrom<u8> for PagestreamBeMessageTag {
+    type Error = u8;
+    fn try_from(value: u8) -> Result<Self, u8> {
+        match value {
+            100 => Ok(PagestreamBeMessageTag::Exists),
+            101 => Ok(PagestreamBeMessageTag::Nblocks),
+            102 => Ok(PagestreamBeMessageTag::GetPage),
+            103 => Ok(PagestreamBeMessageTag::Error),
+            104 => Ok(PagestreamBeMessageTag::DbSize),
+            105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
+            #[cfg(feature = "testing")]
+            199 => Ok(PagestreamBeMessageTag::Test),
+            _ => Err(value),
+        }
+    }
+}
+
+// A GetPage request contains two LSN values:
+//
+// request_lsn: Get the page version at this point in time.  Lsn::Max is a special value that means
+// "get the latest version present". It's used by the primary server, which knows that no one else
+// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
+// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
+//
+// not_modified_since: Hint to the pageserver that the client knows that the page has not been
+// modified between 'not_modified_since' and the request LSN. It's always correct to set
+// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
+// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
+// request without waiting for 'request_lsn' to arrive.
+//
+// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
+// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
+// 'latest' was set to true. The V2 interface was added because there was no correct way for a
+// standby to request a page at a particular non-latest LSN, and also include the
+// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
+// request, if the standby knows that the page hasn't been modified since, and risk getting an error
+// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
+// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
+// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
+// difference in the responses between V1 and V2.
+//
+// V3 version of protocol adds request ID to all requests. This request ID is also included in response
+// as well as other fields from requests, which allows to verify that we receive response for our request.
+// We copy fields from request to response to make checking more reliable: request ID is formed from process ID
+// and local counter, so in principle there can be duplicated requests IDs if process PID is reused.
+//
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub enum PagestreamProtocolVersion {
+    V2,
+    V3,
+}
+
+pub type RequestId = u64;
+
+#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamRequest {
+    pub reqid: RequestId,
+    pub request_lsn: Lsn,
+    pub not_modified_since: Lsn,
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamExistsRequest {
+    pub hdr: PagestreamRequest,
+    pub rel: RelTag,
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamNblocksRequest {
+    pub hdr: PagestreamRequest,
+    pub rel: RelTag,
+}
+
+#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamGetPageRequest {
+    pub hdr: PagestreamRequest,
+    pub rel: RelTag,
+    pub blkno: u32,
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamDbSizeRequest {
+    pub hdr: PagestreamRequest,
+    pub dbnode: u32,
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct PagestreamGetSlruSegmentRequest {
+    pub hdr: PagestreamRequest,
+    pub kind: u8,
+    pub segno: u32,
+}
+
+#[derive(Debug)]
+pub struct PagestreamExistsResponse {
+    pub req: PagestreamExistsRequest,
+    pub exists: bool,
+}
+
+#[derive(Debug)]
+pub struct PagestreamNblocksResponse {
+    pub req: PagestreamNblocksRequest,
+    pub n_blocks: u32,
+}
+
+#[derive(Debug)]
+pub struct PagestreamGetPageResponse {
+    pub req: PagestreamGetPageRequest,
+    pub page: Bytes,
+}
+
+#[derive(Debug)]
+pub struct PagestreamGetSlruSegmentResponse {
+    pub req: PagestreamGetSlruSegmentRequest,
+    pub segment: Bytes,
+}
+
+#[derive(Debug)]
+pub struct PagestreamErrorResponse {
+    pub req: PagestreamRequest,
+    pub message: String,
+}
+
+#[derive(Debug)]
+pub struct PagestreamDbSizeResponse {
+    pub req: PagestreamDbSizeRequest,
+    pub db_size: i64,
+}
+
+#[cfg(feature = "testing")]
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct PagestreamTestRequest {
+    pub hdr: PagestreamRequest,
+    pub batch_key: u64,
+    pub message: String,
+}
+
+#[cfg(feature = "testing")]
+#[derive(Debug)]
+pub struct PagestreamTestResponse {
+    pub req: PagestreamTestRequest,
+}
+
+impl PagestreamFeMessage {
+    /// Serialize a compute -> pageserver message. This is currently only used in testing
+    /// tools. Always uses protocol version 3.
+    pub fn serialize(&self) -> Bytes {
+        let mut bytes = BytesMut::new();
+
+        match self {
+            Self::Exists(req) => {
+                bytes.put_u8(PagestreamFeMessageTag::Exists as u8);
+                bytes.put_u64(req.hdr.reqid);
+                bytes.put_u64(req.hdr.request_lsn.0);
+                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u32(req.rel.spcnode);
+                bytes.put_u32(req.rel.dbnode);
+                bytes.put_u32(req.rel.relnode);
+                bytes.put_u8(req.rel.forknum);
+            }
+
+            Self::Nblocks(req) => {
+                bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8);
+                bytes.put_u64(req.hdr.reqid);
+                bytes.put_u64(req.hdr.request_lsn.0);
+                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u32(req.rel.spcnode);
+                bytes.put_u32(req.rel.dbnode);
+                bytes.put_u32(req.rel.relnode);
+                bytes.put_u8(req.rel.forknum);
+            }
+
+            Self::GetPage(req) => {
+                bytes.put_u8(PagestreamFeMessageTag::GetPage as u8);
+                bytes.put_u64(req.hdr.reqid);
+                bytes.put_u64(req.hdr.request_lsn.0);
+                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u32(req.rel.spcnode);
+                bytes.put_u32(req.rel.dbnode);
+                bytes.put_u32(req.rel.relnode);
+                bytes.put_u8(req.rel.forknum);
+                bytes.put_u32(req.blkno);
+            }
+
+            Self::DbSize(req) => {
+                bytes.put_u8(PagestreamFeMessageTag::DbSize as u8);
+                bytes.put_u64(req.hdr.reqid);
+                bytes.put_u64(req.hdr.request_lsn.0);
+                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u32(req.dbnode);
+            }
+
+            Self::GetSlruSegment(req) => {
+                bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8);
+                bytes.put_u64(req.hdr.reqid);
+                bytes.put_u64(req.hdr.request_lsn.0);
+                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u8(req.kind);
+                bytes.put_u32(req.segno);
+            }
+            #[cfg(feature = "testing")]
+            Self::Test(req) => {
+                bytes.put_u8(PagestreamFeMessageTag::Test as u8);
+                bytes.put_u64(req.hdr.reqid);
+                bytes.put_u64(req.hdr.request_lsn.0);
+                bytes.put_u64(req.hdr.not_modified_since.0);
+                bytes.put_u64(req.batch_key);
+                let message = req.message.as_bytes();
+                bytes.put_u64(message.len() as u64);
+                bytes.put_slice(message);
+            }
+        }
+
+        bytes.into()
+    }
+
+    pub fn parse<R: std::io::Read>(
+        body: &mut R,
+        protocol_version: PagestreamProtocolVersion,
+    ) -> anyhow::Result<PagestreamFeMessage> {
+        // these correspond to the NeonMessageTag enum in pagestore_client.h
+        //
+        // TODO: consider using protobuf or serde bincode for less error prone
+        // serialization.
+        let msg_tag = body.read_u8()?;
+        let (reqid, request_lsn, not_modified_since) = match protocol_version {
+            PagestreamProtocolVersion::V2 => (
+                0,
+                Lsn::from(body.read_u64::<BigEndian>()?),
+                Lsn::from(body.read_u64::<BigEndian>()?),
+            ),
+            PagestreamProtocolVersion::V3 => (
+                body.read_u64::<BigEndian>()?,
+                Lsn::from(body.read_u64::<BigEndian>()?),
+                Lsn::from(body.read_u64::<BigEndian>()?),
+            ),
+        };
+
+        match PagestreamFeMessageTag::try_from(msg_tag)
+            .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))?
+        {
+            PagestreamFeMessageTag::Exists => {
+                Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
+                    hdr: PagestreamRequest {
+                        reqid,
+                        request_lsn,
+                        not_modified_since,
+                    },
+                    rel: RelTag {
+                        spcnode: body.read_u32::<BigEndian>()?,
+                        dbnode: body.read_u32::<BigEndian>()?,
+                        relnode: body.read_u32::<BigEndian>()?,
+                        forknum: body.read_u8()?,
+                    },
+                }))
+            }
+            PagestreamFeMessageTag::Nblocks => {
+                Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
+                    hdr: PagestreamRequest {
+                        reqid,
+                        request_lsn,
+                        not_modified_since,
+                    },
+                    rel: RelTag {
+                        spcnode: body.read_u32::<BigEndian>()?,
+                        dbnode: body.read_u32::<BigEndian>()?,
+                        relnode: body.read_u32::<BigEndian>()?,
+                        forknum: body.read_u8()?,
+                    },
+                }))
+            }
+            PagestreamFeMessageTag::GetPage => {
+                Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
+                    hdr: PagestreamRequest {
+                        reqid,
+                        request_lsn,
+                        not_modified_since,
+                    },
+                    rel: RelTag {
+                        spcnode: body.read_u32::<BigEndian>()?,
+                        dbnode: body.read_u32::<BigEndian>()?,
+                        relnode: body.read_u32::<BigEndian>()?,
+                        forknum: body.read_u8()?,
+                    },
+                    blkno: body.read_u32::<BigEndian>()?,
+                }))
+            }
+            PagestreamFeMessageTag::DbSize => {
+                Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
+                    hdr: PagestreamRequest {
+                        reqid,
+                        request_lsn,
+                        not_modified_since,
+                    },
+                    dbnode: body.read_u32::<BigEndian>()?,
+                }))
+            }
+            PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment(
+                PagestreamGetSlruSegmentRequest {
+                    hdr: PagestreamRequest {
+                        reqid,
+                        request_lsn,
+                        not_modified_since,
+                    },
+                    kind: body.read_u8()?,
+                    segno: body.read_u32::<BigEndian>()?,
+                },
+            )),
+            #[cfg(feature = "testing")]
+            PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest {
+                hdr: PagestreamRequest {
+                    reqid,
+                    request_lsn,
+                    not_modified_since,
+                },
+                batch_key: body.read_u64::<BigEndian>()?,
+                message: {
+                    let len = body.read_u64::<BigEndian>()?;
+                    let mut buf = vec![0; len as usize];
+                    body.read_exact(&mut buf)?;
+                    String::from_utf8(buf)?
+                },
+            })),
+        }
+    }
+}
+
+impl PagestreamBeMessage {
+    pub fn serialize(&self, protocol_version: PagestreamProtocolVersion) -> Bytes {
+        let mut bytes = BytesMut::new();
+
+        use PagestreamBeMessageTag as Tag;
+        match protocol_version {
+            PagestreamProtocolVersion::V2 => {
+                match self {
+                    Self::Exists(resp) => {
+                        bytes.put_u8(Tag::Exists as u8);
+                        bytes.put_u8(resp.exists as u8);
+                    }
+
+                    Self::Nblocks(resp) => {
+                        bytes.put_u8(Tag::Nblocks as u8);
+                        bytes.put_u32(resp.n_blocks);
+                    }
+
+                    Self::GetPage(resp) => {
+                        bytes.put_u8(Tag::GetPage as u8);
+                        bytes.put(&resp.page[..])
+                    }
+
+                    Self::Error(resp) => {
+                        bytes.put_u8(Tag::Error as u8);
+                        bytes.put(resp.message.as_bytes());
+                        bytes.put_u8(0); // null terminator
+                    }
+                    Self::DbSize(resp) => {
+                        bytes.put_u8(Tag::DbSize as u8);
+                        bytes.put_i64(resp.db_size);
+                    }
+
+                    Self::GetSlruSegment(resp) => {
+                        bytes.put_u8(Tag::GetSlruSegment as u8);
+                        bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
+                        bytes.put(&resp.segment[..]);
+                    }
+
+                    #[cfg(feature = "testing")]
+                    Self::Test(resp) => {
+                        bytes.put_u8(Tag::Test as u8);
+                        bytes.put_u64(resp.req.batch_key);
+                        let message = resp.req.message.as_bytes();
+                        bytes.put_u64(message.len() as u64);
+                        bytes.put_slice(message);
+                    }
+                }
+            }
+            PagestreamProtocolVersion::V3 => {
+                match self {
+                    Self::Exists(resp) => {
+                        bytes.put_u8(Tag::Exists as u8);
+                        bytes.put_u64(resp.req.hdr.reqid);
+                        bytes.put_u64(resp.req.hdr.request_lsn.0);
+                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
+                        bytes.put_u32(resp.req.rel.spcnode);
+                        bytes.put_u32(resp.req.rel.dbnode);
+                        bytes.put_u32(resp.req.rel.relnode);
+                        bytes.put_u8(resp.req.rel.forknum);
+                        bytes.put_u8(resp.exists as u8);
+                    }
+
+                    Self::Nblocks(resp) => {
+                        bytes.put_u8(Tag::Nblocks as u8);
+                        bytes.put_u64(resp.req.hdr.reqid);
+                        bytes.put_u64(resp.req.hdr.request_lsn.0);
+                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
+                        bytes.put_u32(resp.req.rel.spcnode);
+                        bytes.put_u32(resp.req.rel.dbnode);
+                        bytes.put_u32(resp.req.rel.relnode);
+                        bytes.put_u8(resp.req.rel.forknum);
+                        bytes.put_u32(resp.n_blocks);
+                    }
+
+                    Self::GetPage(resp) => {
+                        bytes.put_u8(Tag::GetPage as u8);
+                        bytes.put_u64(resp.req.hdr.reqid);
+                        bytes.put_u64(resp.req.hdr.request_lsn.0);
+                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
+                        bytes.put_u32(resp.req.rel.spcnode);
+                        bytes.put_u32(resp.req.rel.dbnode);
+                        bytes.put_u32(resp.req.rel.relnode);
+                        bytes.put_u8(resp.req.rel.forknum);
+                        bytes.put_u32(resp.req.blkno);
+                        bytes.put(&resp.page[..])
+                    }
+
+                    Self::Error(resp) => {
+                        bytes.put_u8(Tag::Error as u8);
+                        bytes.put_u64(resp.req.reqid);
+                        bytes.put_u64(resp.req.request_lsn.0);
+                        bytes.put_u64(resp.req.not_modified_since.0);
+                        bytes.put(resp.message.as_bytes());
+                        bytes.put_u8(0); // null terminator
+                    }
+                    Self::DbSize(resp) => {
+                        bytes.put_u8(Tag::DbSize as u8);
+                        bytes.put_u64(resp.req.hdr.reqid);
+                        bytes.put_u64(resp.req.hdr.request_lsn.0);
+                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
+                        bytes.put_u32(resp.req.dbnode);
+                        bytes.put_i64(resp.db_size);
+                    }
+
+                    Self::GetSlruSegment(resp) => {
+                        bytes.put_u8(Tag::GetSlruSegment as u8);
+                        bytes.put_u64(resp.req.hdr.reqid);
+                        bytes.put_u64(resp.req.hdr.request_lsn.0);
+                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
+                        bytes.put_u8(resp.req.kind);
+                        bytes.put_u32(resp.req.segno);
+                        bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
+                        bytes.put(&resp.segment[..]);
+                    }
+
+                    #[cfg(feature = "testing")]
+                    Self::Test(resp) => {
+                        bytes.put_u8(Tag::Test as u8);
+                        bytes.put_u64(resp.req.hdr.reqid);
+                        bytes.put_u64(resp.req.hdr.request_lsn.0);
+                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
+                        bytes.put_u64(resp.req.batch_key);
+                        let message = resp.req.message.as_bytes();
+                        bytes.put_u64(message.len() as u64);
+                        bytes.put_slice(message);
+                    }
+                }
+            }
+        }
+        bytes.into()
+    }
+
+    pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
+        let mut buf = buf.reader();
+        let msg_tag = buf.read_u8()?;
+
+        use PagestreamBeMessageTag as Tag;
+        let ok =
+            match Tag::try_from(msg_tag).map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? {
+                Tag::Exists => {
+                    let reqid = buf.read_u64::<BigEndian>()?;
+                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
+                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
+                    let rel = RelTag {
+                        spcnode: buf.read_u32::<BigEndian>()?,
+                        dbnode: buf.read_u32::<BigEndian>()?,
+                        relnode: buf.read_u32::<BigEndian>()?,
+                        forknum: buf.read_u8()?,
+                    };
+                    let exists = buf.read_u8()? != 0;
+                    Self::Exists(PagestreamExistsResponse {
+                        req: PagestreamExistsRequest {
+                            hdr: PagestreamRequest {
+                                reqid,
+                                request_lsn,
+                                not_modified_since,
+                            },
+                            rel,
+                        },
+                        exists,
+                    })
+                }
+                Tag::Nblocks => {
+                    let reqid = buf.read_u64::<BigEndian>()?;
+                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
+                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
+                    let rel = RelTag {
+                        spcnode: buf.read_u32::<BigEndian>()?,
+                        dbnode: buf.read_u32::<BigEndian>()?,
+                        relnode: buf.read_u32::<BigEndian>()?,
+                        forknum: buf.read_u8()?,
+                    };
+                    let n_blocks = buf.read_u32::<BigEndian>()?;
+                    Self::Nblocks(PagestreamNblocksResponse {
+                        req: PagestreamNblocksRequest {
+                            hdr: PagestreamRequest {
+                                reqid,
+                                request_lsn,
+                                not_modified_since,
+                            },
+                            rel,
+                        },
+                        n_blocks,
+                    })
+                }
+                Tag::GetPage => {
+                    let reqid = buf.read_u64::<BigEndian>()?;
+                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
+                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
+                    let rel = RelTag {
+                        spcnode: buf.read_u32::<BigEndian>()?,
+                        dbnode: buf.read_u32::<BigEndian>()?,
+                        relnode: buf.read_u32::<BigEndian>()?,
+                        forknum: buf.read_u8()?,
+                    };
+                    let blkno = buf.read_u32::<BigEndian>()?;
+                    let mut page = vec![0; 8192]; // TODO: use MaybeUninit
+                    buf.read_exact(&mut page)?;
+                    Self::GetPage(PagestreamGetPageResponse {
+                        req: PagestreamGetPageRequest {
+                            hdr: PagestreamRequest {
+                                reqid,
+                                request_lsn,
+                                not_modified_since,
+                            },
+                            rel,
+                            blkno,
+                        },
+                        page: page.into(),
+                    })
+                }
+                Tag::Error => {
+                    let reqid = buf.read_u64::<BigEndian>()?;
+                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
+                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
+                    let mut msg = Vec::new();
+                    buf.read_until(0, &mut msg)?;
+                    let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
+                    let rust_str = cstring.to_str()?;
+                    Self::Error(PagestreamErrorResponse {
+                        req: PagestreamRequest {
+                            reqid,
+                            request_lsn,
+                            not_modified_since,
+                        },
+                        message: rust_str.to_owned(),
+                    })
+                }
+                Tag::DbSize => {
+                    let reqid = buf.read_u64::<BigEndian>()?;
+                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
+                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
+                    let dbnode = buf.read_u32::<BigEndian>()?;
+                    let db_size = buf.read_i64::<BigEndian>()?;
+                    Self::DbSize(PagestreamDbSizeResponse {
+                        req: PagestreamDbSizeRequest {
+                            hdr: PagestreamRequest {
+                                reqid,
+                                request_lsn,
+                                not_modified_since,
+                            },
+                            dbnode,
+                        },
+                        db_size,
+                    })
+                }
+                Tag::GetSlruSegment => {
+                    let reqid = buf.read_u64::<BigEndian>()?;
+                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
+                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
+                    let kind = buf.read_u8()?;
+                    let segno = buf.read_u32::<BigEndian>()?;
+                    let n_blocks = buf.read_u32::<BigEndian>()?;
+                    let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
+                    buf.read_exact(&mut segment)?;
+                    Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
+                        req: PagestreamGetSlruSegmentRequest {
+                            hdr: PagestreamRequest {
+                                reqid,
+                                request_lsn,
+                                not_modified_since,
+                            },
+                            kind,
+                            segno,
+                        },
+                        segment: segment.into(),
+                    })
+                }
+                #[cfg(feature = "testing")]
+                Tag::Test => {
+                    let reqid = buf.read_u64::<BigEndian>()?;
+                    let request_lsn = Lsn(buf.read_u64::<BigEndian>()?);
+                    let not_modified_since = Lsn(buf.read_u64::<BigEndian>()?);
+                    let batch_key = buf.read_u64::<BigEndian>()?;
+                    let len = buf.read_u64::<BigEndian>()?;
+                    let mut msg = vec![0; len as usize];
+                    buf.read_exact(&mut msg)?;
+                    let message = String::from_utf8(msg)?;
+                    Self::Test(PagestreamTestResponse {
+                        req: PagestreamTestRequest {
+                            hdr: PagestreamRequest {
+                                reqid,
+                                request_lsn,
+                                not_modified_since,
+                            },
+                            batch_key,
+                            message,
+                        },
+                    })
+                }
+            };
+        let remaining = buf.into_inner();
+        if !remaining.is_empty() {
+            anyhow::bail!(
+                "remaining bytes in msg with tag={msg_tag}: {}",
+                remaining.len()
+            );
+        }
+        Ok(ok)
+    }
+
+    pub fn kind(&self) -> &'static str {
+        match self {
+            Self::Exists(_) => "Exists",
+            Self::Nblocks(_) => "Nblocks",
+            Self::GetPage(_) => "GetPage",
+            Self::Error(_) => "Error",
+            Self::DbSize(_) => "DbSize",
+            Self::GetSlruSegment(_) => "GetSlruSegment",
+            #[cfg(feature = "testing")]
+            Self::Test(_) => "Test",
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_pagestream() {
+        // Test serialization/deserialization of PagestreamFeMessage
+        let messages = vec![
+            PagestreamFeMessage::Exists(PagestreamExistsRequest {
+                hdr: PagestreamRequest {
+                    reqid: 0,
+                    request_lsn: Lsn(4),
+                    not_modified_since: Lsn(3),
+                },
+                rel: RelTag {
+                    forknum: 1,
+                    spcnode: 2,
+                    dbnode: 3,
+                    relnode: 4,
+                },
+            }),
+            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
+                hdr: PagestreamRequest {
+                    reqid: 0,
+                    request_lsn: Lsn(4),
+                    not_modified_since: Lsn(4),
+                },
+                rel: RelTag {
+                    forknum: 1,
+                    spcnode: 2,
+                    dbnode: 3,
+                    relnode: 4,
+                },
+            }),
+            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
+                hdr: PagestreamRequest {
+                    reqid: 0,
+                    request_lsn: Lsn(4),
+                    not_modified_since: Lsn(3),
+                },
+                rel: RelTag {
+                    forknum: 1,
+                    spcnode: 2,
+                    dbnode: 3,
+                    relnode: 4,
+                },
+                blkno: 7,
+            }),
+            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
+                hdr: PagestreamRequest {
+                    reqid: 0,
+                    request_lsn: Lsn(4),
+                    not_modified_since: Lsn(3),
+                },
+                dbnode: 7,
+            }),
+        ];
+        for msg in messages {
+            let bytes = msg.serialize();
+            let reconstructed =
+                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V3)
+                    .unwrap();
+            assert!(msg == reconstructed);
+        }
+    }
+}
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -9,7 +9,7 @@ use utils::id::{NodeId, TimelineId};

 use crate::controller_api::NodeRegisterRequest;
 use crate::models::{LocationConfigMode, ShardImportStatus};
-use crate::shard::TenantShardId;
+use crate::shard::{ShardStripeSize, TenantShardId};

 /// Upcall message sent by the pageserver to the configured `control_plane_api` on
 /// startup.
@@ -36,6 +36,10 @@ pub struct ReAttachResponseTenant {
    /// Default value only for backward compat: this field should be set
    #[serde(default = "default_mode")]
    pub mode: LocationConfigMode,
+
+    // Default value only for backward compat: this field should be set
+    #[serde(default = "ShardStripeSize::default")]
+    pub stripe_size: ShardStripeSize,
 }
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponse {
--- a/libs/posthog_client_lite/src/background_loop.rs
+++ b/libs/posthog_client_lite/src/background_loop.rs
@@ -55,9 +55,16 @@ impl FeatureResolverBackgroundLoop {
                            continue;
                        }
                    };
-                    let feature_store = FeatureStore::new_with_flags(resp.flags);
-                    this.feature_store.store(Arc::new(feature_store));
-                    tracing::info!("Feature flag updated");
+                    let project_id = this.posthog_client.config.project_id.parse::<u64>().ok();
+                    match FeatureStore::new_with_flags(resp.flags, project_id) {
+                        Ok(feature_store) => {
+                            this.feature_store.store(Arc::new(feature_store));
+                            tracing::info!("Feature flag updated");
+                        }
+                        Err(e) => {
+                            tracing::warn!("Cannot process feature flag spec: {}", e);
+                        }
+                    }
                }
                tracing::info!("PostHog feature resolver stopped");
            }
--- a/libs/posthog_client_lite/src/lib.rs
+++ b/libs/posthog_client_lite/src/lib.rs
@@ -39,6 +39,9 @@ pub struct LocalEvaluationResponse {

 #[derive(Deserialize)]
 pub struct LocalEvaluationFlag {
+    #[allow(dead_code)]
+    id: u64,
+    team_id: u64,
    key: String,
    filters: LocalEvaluationFlagFilters,
    active: bool,
@@ -107,17 +110,32 @@ impl FeatureStore {
        }
    }

-    pub fn new_with_flags(flags: Vec<LocalEvaluationFlag>) -> Self {
+    pub fn new_with_flags(
+        flags: Vec<LocalEvaluationFlag>,
+        project_id: Option<u64>,
+    ) -> Result<Self, &'static str> {
        let mut store = Self::new();
-        store.set_flags(flags);
-        store
+        store.set_flags(flags, project_id)?;
+        Ok(store)
    }

-    pub fn set_flags(&mut self, flags: Vec<LocalEvaluationFlag>) {
+    pub fn set_flags(
+        &mut self,
+        flags: Vec<LocalEvaluationFlag>,
+        project_id: Option<u64>,
+    ) -> Result<(), &'static str> {
        self.flags.clear();
        for flag in flags {
+            if let Some(project_id) = project_id {
+                if flag.team_id != project_id {
+                    return Err(
+                        "Retrieved a spec with different project id, wrong config? Discarding the feature flags.",
+                    );
+                }
+            }
            self.flags.insert(flag.key.clone(), flag);
        }
+        Ok(())
    }

    /// Generate a consistent hash for a user ID (e.g., tenant ID).
@@ -534,6 +552,13 @@ impl PostHogClient {
        })
    }

+    /// Check if the server API key is a feature flag secure API key. This key can only be
+    /// used to fetch the feature flag specs and can only be used on a undocumented API
+    /// endpoint.
+    fn is_feature_flag_secure_api_key(&self) -> bool {
+        self.config.server_api_key.starts_with("phs_")
+    }
+
    /// Fetch the feature flag specs from the server.
    ///
    /// This is unfortunately an undocumented API at:
@@ -547,10 +572,22 @@ impl PostHogClient {
    ) -> anyhow::Result<LocalEvaluationResponse> {
        // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation
        // with bearer token of self.server_api_key
-        let url = format!(
-            "{}/api/projects/{}/feature_flags/local_evaluation",
-            self.config.private_api_url, self.config.project_id
-        );
+        // OR
+        // BASE_URL/api/feature_flag/local_evaluation/
+        // with bearer token of feature flag specific self.server_api_key
+        let url = if self.is_feature_flag_secure_api_key() {
+            // The new feature local evaluation secure API token
+            format!(
+                "{}/api/feature_flag/local_evaluation",
+                self.config.private_api_url
+            )
+        } else {
+            // The old personal API token
+            format!(
+                "{}/api/projects/{}/feature_flags/local_evaluation",
+                self.config.private_api_url, self.config.project_id
+            )
+        };
        let response = self
            .client
            .get(url)
@@ -803,7 +840,7 @@ mod tests {
    fn evaluate_multivariate() {
        let mut store = FeatureStore::new();
        let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
-        store.set_flags(response.flags);
+        store.set_flags(response.flags, None).unwrap();

        // This lacks the required properties and cannot be evaluated.
        let variant =
@@ -873,7 +910,7 @@ mod tests {

        let mut store = FeatureStore::new();
        let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
-        store.set_flags(response.flags);
+        store.set_flags(response.flags, None).unwrap();

        // This lacks the required properties and cannot be evaluated.
        let variant = store.evaluate_boolean_inner("boolean-flag", 1.00, &HashMap::new());
@@ -929,7 +966,7 @@ mod tests {

        let mut store = FeatureStore::new();
        let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
-        store.set_flags(response.flags);
+        store.set_flags(response.flags, None).unwrap();

        // This lacks the required properties and cannot be evaluated.
        let variant =
--- a/libs/proxy/postgres-protocol2/Cargo.toml
+++ b/libs/proxy/postgres-protocol2/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2024"
 license = "MIT/Apache-2.0"

 [dependencies]
-base64 = "0.20"
+base64.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
 fallible-iterator.workspace = true
--- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
+++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
@@ -3,6 +3,8 @@
 use std::fmt::Write;
 use std::{io, iter, mem, str};

+use base64::Engine as _;
+use base64::prelude::BASE64_STANDARD;
 use hmac::{Hmac, Mac};
 use rand::{self, Rng};
 use sha2::digest::FixedOutput;
@@ -226,7 +228,7 @@ impl ScramSha256 {

        let (client_key, server_key) = match password {
            Credentials::Password(password) => {
-                let salt = match base64::decode(parsed.salt) {
+                let salt = match BASE64_STANDARD.decode(parsed.salt) {
                    Ok(salt) => salt,
                    Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)),
                };
@@ -255,7 +257,7 @@ impl ScramSha256 {
        let mut cbind_input = vec![];
        cbind_input.extend(channel_binding.gs2_header().as_bytes());
        cbind_input.extend(channel_binding.cbind_data());
-        let cbind_input = base64::encode(&cbind_input);
+        let cbind_input = BASE64_STANDARD.encode(&cbind_input);

        self.message.clear();
        write!(&mut self.message, "c={},r={}", cbind_input, parsed.nonce).unwrap();
@@ -272,7 +274,12 @@ impl ScramSha256 {
            *proof ^= signature;
        }

-        write!(&mut self.message, ",p={}", base64::encode(client_proof)).unwrap();
+        write!(
+            &mut self.message,
+            ",p={}",
+            BASE64_STANDARD.encode(client_proof)
+        )
+        .unwrap();

        self.state = State::Finish {
            server_key,
@@ -306,7 +313,7 @@ impl ScramSha256 {
            ServerFinalMessage::Verifier(verifier) => verifier,
        };

-        let verifier = match base64::decode(verifier) {
+        let verifier = match BASE64_STANDARD.decode(verifier) {
            Ok(verifier) => verifier,
            Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)),
        };
--- a/libs/proxy/postgres-protocol2/src/password/mod.rs
+++ b/libs/proxy/postgres-protocol2/src/password/mod.rs
@@ -6,6 +6,8 @@
 //! side. This is good because it ensures the cleartext password won't
 //! end up in logs pg_stat displays, etc.

+use base64::Engine as _;
+use base64::prelude::BASE64_STANDARD;
 use hmac::{Hmac, Mac};
 use rand::RngCore;
 use sha2::digest::FixedOutput;
@@ -83,8 +85,8 @@ pub(crate) async fn scram_sha_256_salt(
    format!(
        "SCRAM-SHA-256${}:{}${}:{}",
        SCRAM_DEFAULT_ITERATIONS,
-        base64::encode(salt),
-        base64::encode(stored_key),
-        base64::encode(server_key)
+        BASE64_STANDARD.encode(salt),
+        BASE64_STANDARD.encode(stored_key),
+        BASE64_STANDARD.encode(server_key)
    )
 }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -824,6 +824,7 @@ impl RemoteStorage for AzureBlobStorage {
        timestamp: SystemTime,
        done_if_after: SystemTime,
        cancel: &CancellationToken,
+        _complexity_limit: Option<NonZeroU32>,
    ) -> Result<(), TimeTravelError> {
        let msg = "PLEASE NOTE: Azure Blob storage time-travel recovery may not work as expected "
            .to_string()
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -87,6 +87,28 @@ pub enum RemoteStorageKind {
    AzureContainer(AzureConfig),
 }

+#[derive(Deserialize)]
+#[serde(tag = "type")]
+/// Version of RemoteStorageKind which deserializes with type: LocalFs | AwsS3 | AzureContainer
+/// Needed for endpoint storage service
+pub enum TypedRemoteStorageKind {
+    LocalFs { local_path: Utf8PathBuf },
+    AwsS3(S3Config),
+    AzureContainer(AzureConfig),
+}
+
+impl From<TypedRemoteStorageKind> for RemoteStorageKind {
+    fn from(value: TypedRemoteStorageKind) -> Self {
+        match value {
+            TypedRemoteStorageKind::LocalFs { local_path } => {
+                RemoteStorageKind::LocalFs { local_path }
+            }
+            TypedRemoteStorageKind::AwsS3(v) => RemoteStorageKind::AwsS3(v),
+            TypedRemoteStorageKind::AzureContainer(v) => RemoteStorageKind::AzureContainer(v),
+        }
+    }
+}
+
 /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
 #[derive(Clone, PartialEq, Eq, Deserialize, Serialize)]
 pub struct S3Config {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -31,6 +31,7 @@ use anyhow::Context;
 pub use azure_core::Etag;
 use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
+pub use config::TypedRemoteStorageKind;
 pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
 use futures::StreamExt;
 use futures::stream::Stream;
@@ -440,6 +441,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
        timestamp: SystemTime,
        done_if_after: SystemTime,
        cancel: &CancellationToken,
+        complexity_limit: Option<NonZeroU32>,
    ) -> Result<(), TimeTravelError>;
 }

@@ -651,22 +653,23 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        timestamp: SystemTime,
        done_if_after: SystemTime,
        cancel: &CancellationToken,
+        complexity_limit: Option<NonZeroU32>,
    ) -> Result<(), TimeTravelError> {
        match self {
            Self::LocalFs(s) => {
-                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
                    .await
            }
            Self::AwsS3(s) => {
-                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
                    .await
            }
            Self::AzureBlob(s) => {
-                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
                    .await
            }
            Self::Unreliable(s) => {
-                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
                    .await
            }
        }
@@ -674,6 +677,15 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
 }

 impl GenericRemoteStorage {
+    pub async fn from_storage_kind(kind: TypedRemoteStorageKind) -> anyhow::Result<Self> {
+        Self::from_config(&RemoteStorageConfig {
+            storage: kind.into(),
+            timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+            small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
+        })
+        .await
+    }
+
    pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
        let timeout = storage_config.timeout;

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -610,6 +610,7 @@ impl RemoteStorage for LocalFs {
        _timestamp: SystemTime,
        _done_if_after: SystemTime,
        _cancel: &CancellationToken,
+        _complexity_limit: Option<NonZeroU32>,
    ) -> Result<(), TimeTravelError> {
        Err(TimeTravelError::Unimplemented)
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -981,22 +981,16 @@ impl RemoteStorage for S3Bucket {
        timestamp: SystemTime,
        done_if_after: SystemTime,
        cancel: &CancellationToken,
+        complexity_limit: Option<NonZeroU32>,
    ) -> Result<(), TimeTravelError> {
        let kind = RequestKind::TimeTravel;
        let permit = self.permit(kind, cancel).await?;

        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");

-        // Limit the number of versions deletions, mostly so that we don't
-        // keep requesting forever if the list is too long, as we'd put the
-        // list in RAM.
-        // Building a list of 100k entries that reaches the limit roughly takes
-        // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
-        const COMPLEXITY_LIMIT: Option<NonZeroU32> = NonZeroU32::new(100_000);
-
        let mode = ListingMode::NoDelimiter;
        let version_listing = self
-            .list_versions_with_permit(&permit, prefix, mode, COMPLEXITY_LIMIT, cancel)
+            .list_versions_with_permit(&permit, prefix, mode, complexity_limit, cancel)
            .await
            .map_err(|err| match err {
                DownloadError::Other(e) => TimeTravelError::Other(e),
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -240,11 +240,12 @@ impl RemoteStorage for UnreliableWrapper {
        timestamp: SystemTime,
        done_if_after: SystemTime,
        cancel: &CancellationToken,
+        complexity_limit: Option<NonZeroU32>,
    ) -> Result<(), TimeTravelError> {
        self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
            .map_err(TimeTravelError::Other)?;
        self.inner
-            .time_travel_recover(prefix, timestamp, done_if_after, cancel)
+            .time_travel_recover(prefix, timestamp, done_if_after, cancel, complexity_limit)
            .await
    }
 }
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -157,7 +157,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // No changes after recovery to t2 (no-op)
    let t_final = time_point().await;
    ctx.client
-        .time_travel_recover(None, t2, t_final, &cancel)
+        .time_travel_recover(None, t2, t_final, &cancel, None)
        .await?;
    let t2_files_recovered = list_files(&ctx.client, &cancel).await?;
    println!("after recovery to t2: {t2_files_recovered:?}");
@@ -173,7 +173,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // after recovery to t1: path1 is back, path2 has the old content
    let t_final = time_point().await;
    ctx.client
-        .time_travel_recover(None, t1, t_final, &cancel)
+        .time_travel_recover(None, t1, t_final, &cancel, None)
        .await?;
    let t1_files_recovered = list_files(&ctx.client, &cancel).await?;
    println!("after recovery to t1: {t1_files_recovered:?}");
@@ -189,7 +189,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // after recovery to t0: everything is gone except for path1
    let t_final = time_point().await;
    ctx.client
-        .time_travel_recover(None, t0, t_final, &cancel)
+        .time_travel_recover(None, t0, t_final, &cancel, None)
        .await?;
    let t0_files_recovered = list_files(&ctx.client, &cancel).await?;
    println!("after recovery to t0: {t0_files_recovered:?}");
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -311,7 +311,7 @@ extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
    }
 }

-extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
+unsafe extern "C-unwind" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) -> ! {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
        let api = callback_data as *mut Box<dyn ApiImpl>;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -144,7 +144,7 @@ pub trait ApiImpl {
        todo!()
    }

-    fn finish_sync_safekeepers(&self, _lsn: u64) {
+    fn finish_sync_safekeepers(&self, _lsn: u64) -> ! {
        todo!()
    }

@@ -469,7 +469,7 @@ mod tests {
            true
        }

-        fn finish_sync_safekeepers(&self, lsn: u64) {
+        fn finish_sync_safekeepers(&self, lsn: u64) -> ! {
            self.sync_channel.send(lsn).unwrap();
            panic!("sync safekeepers finished at lsn={}", lsn);
        }
--- a/local_proxy.json
+++ b/local_proxy.json
@@ -0,0 +1,11 @@
+{
+    "jwks": [
+        {
+            "id": "1",
+            "role_names": ["authenticated"],
+            "jwks_url": "https://adapted-gorilla-88.clerk.accounts.dev/.well-known/jwks.json",
+            "provider_name": "foo",
+            "jwt_audience": null
+        }
+    ]
+}
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -2,7 +2,7 @@ use std::sync::{Arc, Mutex};

 use futures::stream::{SplitSink, SplitStream};
 use futures::{SinkExt, StreamExt};
-use pageserver_api::models::{
+use pageserver_api::pagestream_api::{
    PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
 };
 use pageserver_api::reltag::RelTag;
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -20,7 +20,7 @@
 //!
 //! # local timeline dir
 //! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
-//!     grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
+//!     grep "__" | cargo run --release --bin pagectl draw-timeline > out.svg
 //!
 //! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
 //! (jq -r '.historic_layers[] | .layer_file_name' | cargo  run -p pagectl draw-timeline) < layer-map.json > out.svg
@@ -81,7 +81,11 @@ fn build_coordinate_compression_map<T: Ord + Copy>(coords: Vec<T>) -> BTreeMap<T
 fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
    let split: Vec<&str> = name.split("__").collect();
    let keys: Vec<&str> = split[0].split('-').collect();
-    let mut lsns: Vec<&str> = split[1].split('-').collect();
+
+    // Remove the temporary file extension, e.g., remove the `.d20a.___temp` part from the following filename:
+    // 000000067F000040490000404A00441B0000-000000067F000040490000404A00441B4000__000043483A34CE00.d20a.___temp
+    let lsns = split[1].split('.').collect::<Vec<&str>>()[0];
+    let mut lsns: Vec<&str> = lsns.split('-').collect();

    // The current format of the layer file name: 000000067F0000000400000B150100000000-000000067F0000000400000D350100000000__00000000014B7AC8-v1-00000001

--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -13,7 +13,7 @@ use pageserver::{page_cache, virtual_file};
 use pageserver_api::key::Key;
 use utils::id::{TenantId, TimelineId};

-use crate::layer_map_analyzer::parse_filename;
+use crate::layer_map_analyzer::{LayerFile, parse_filename};

 #[derive(Subcommand)]
 pub(crate) enum LayerCmd {
@@ -38,6 +38,8 @@ pub(crate) enum LayerCmd {
        /// The id from list-layer command
        id: usize,
    },
+    /// Dump all information of a layer file locally
+    DumpLayerLocal { path: PathBuf },
    RewriteSummary {
        layer_file_path: Utf8PathBuf,
        #[clap(long)]
@@ -131,15 +133,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            }

            for (idx, layer_file) in to_print {
-                println!(
-                    "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
-                    idx,
-                    layer_file.key_range.start,
-                    layer_file.key_range.end,
-                    layer_file.lsn_range.start,
-                    layer_file.lsn_range.end,
-                    layer_file.is_delta,
-                );
+                print_layer_file(idx, &layer_file);
            }
            Ok(())
        }
@@ -159,16 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                let layer = layer?;
                if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
                    if *id == idx {
-                        // TODO(chi): dedup code
-                        println!(
-                            "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
-                            idx,
-                            layer_file.key_range.start,
-                            layer_file.key_range.end,
-                            layer_file.lsn_range.start,
-                            layer_file.lsn_range.end,
-                            layer_file.is_delta,
-                        );
+                        print_layer_file(idx, &layer_file);

                        if layer_file.is_delta {
                            read_delta_file(layer.path(), &ctx).await?;
@@ -183,6 +168,18 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            }
            Ok(())
        }
+        LayerCmd::DumpLayerLocal { path } => {
+            if let Ok(layer_file) = parse_filename(path.file_name().unwrap().to_str().unwrap()) {
+                print_layer_file(0, &layer_file);
+
+                if layer_file.is_delta {
+                    read_delta_file(path, &ctx).await?;
+                } else {
+                    read_image_file(path, &ctx).await?;
+                }
+            }
+            Ok(())
+        }
        LayerCmd::RewriteSummary {
            layer_file_path,
            new_tenant_id,
@@ -247,3 +244,15 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
        }
    }
 }
+
+fn print_layer_file(idx: usize, layer_file: &LayerFile) {
+    println!(
+        "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
+        idx,
+        layer_file.key_range.start,
+        layer_file.key_range.end,
+        layer_file.lsn_range.start,
+        layer_file.lsn_range.end,
+        layer_file.is_delta,
+    );
+}
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -176,9 +176,11 @@ async fn main() -> anyhow::Result<()> {
            let config = RemoteStorageConfig::from_toml_str(&cmd.config_toml_str)?;
            let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
            let cancel = CancellationToken::new();
+            // Complexity limit: as we are running this command locally, we should have a lot of memory available, and we do not
+            // need to limit the number of versions we are going to delete.
            storage
                .unwrap()
-                .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel)
+                .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel, None)
                .await?;
        }
        Commands::Key(dkc) => dkc.execute(),
--- a/pageserver/page_api/Cargo.toml
+++ b/pageserver/page_api/Cargo.toml
@@ -5,11 +5,14 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+anyhow.workspace = true
 bytes.workspace = true
+futures.workspace = true
 pageserver_api.workspace = true
 postgres_ffi.workspace = true
 prost.workspace = true
 thiserror.workspace = true
+tokio.workspace = true
 tonic.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -102,12 +102,14 @@ message CheckRelExistsResponse {
  bool exists = 1;
 }

-// Requests a base backup at a given LSN.
+// Requests a base backup.
 message GetBaseBackupRequest {
-  // The LSN to fetch a base backup at.
-  ReadLsn read_lsn = 1;
+  // The LSN to fetch the base backup at. 0 or absent means the latest LSN known to the Pageserver.
+  uint64 lsn = 1;
  // If true, logical replication slots will not be created.
  bool replica = 2;
+  // If true, include relation files in the base backup. Mainly for debugging and tests.
+  bool full = 3;
 }

 // Base backup response chunk, returned as an ordered stream.
--- a/pageserver/page_api/src/client.rs
+++ b/pageserver/page_api/src/client.rs
@@ -0,0 +1,191 @@
+use std::convert::TryInto;
+
+use bytes::Bytes;
+use futures::TryStreamExt;
+use futures::{Stream, StreamExt};
+use tonic::metadata::AsciiMetadataValue;
+use tonic::metadata::errors::InvalidMetadataValue;
+use tonic::transport::Channel;
+use tonic::{Request, Streaming};
+
+use utils::id::TenantId;
+use utils::id::TimelineId;
+use utils::shard::ShardIndex;
+
+use anyhow::Result;
+
+use crate::model;
+use crate::proto;
+
+///
+/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These
+/// headers are required at the pageserver.
+///
+#[derive(Clone)]
+struct AuthInterceptor {
+    tenant_id: AsciiMetadataValue,
+    timeline_id: AsciiMetadataValue,
+    shard_id: AsciiMetadataValue,
+    auth_header: Option<AsciiMetadataValue>, // including "Bearer " prefix
+}
+
+impl AuthInterceptor {
+    fn new(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        auth_token: Option<String>,
+        shard_id: ShardIndex,
+    ) -> Result<Self, InvalidMetadataValue> {
+        let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?;
+        let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?;
+        let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?;
+
+        let auth_header: Option<AsciiMetadataValue> = match auth_token {
+            Some(token) => Some(format!("Bearer {token}").try_into()?),
+            None => None,
+        };
+
+        Ok(Self {
+            tenant_id: tenant_ascii,
+            shard_id: shard_ascii,
+            timeline_id: timeline_ascii,
+            auth_header,
+        })
+    }
+}
+
+impl tonic::service::Interceptor for AuthInterceptor {
+    fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
+        req.metadata_mut()
+            .insert("neon-tenant-id", self.tenant_id.clone());
+        req.metadata_mut()
+            .insert("neon-shard-id", self.shard_id.clone());
+        req.metadata_mut()
+            .insert("neon-timeline-id", self.timeline_id.clone());
+        if let Some(auth_header) = &self.auth_header {
+            req.metadata_mut()
+                .insert("authorization", auth_header.clone());
+        }
+        Ok(req)
+    }
+}
+#[derive(Clone)]
+pub struct Client {
+    client: proto::PageServiceClient<
+        tonic::service::interceptor::InterceptedService<Channel, AuthInterceptor>,
+    >,
+}
+
+impl Client {
+    pub async fn new<T: TryInto<tonic::transport::Endpoint> + Send + Sync + 'static>(
+        into_endpoint: T,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_header: Option<String>,
+    ) -> anyhow::Result<Self> {
+        let endpoint: tonic::transport::Endpoint = into_endpoint
+            .try_into()
+            .map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?;
+        let channel = endpoint.connect().await?;
+        let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id)
+            .map_err(|e| anyhow::anyhow!(e.to_string()))?;
+        let client = proto::PageServiceClient::with_interceptor(channel, auth);
+
+        Ok(Self { client })
+    }
+
+    /// Returns whether a relation exists.
+    pub async fn check_rel_exists(
+        &mut self,
+        req: model::CheckRelExistsRequest,
+    ) -> Result<model::CheckRelExistsResponse, tonic::Status> {
+        let proto_req = proto::CheckRelExistsRequest::from(req);
+
+        let response = self.client.check_rel_exists(proto_req).await?;
+
+        let proto_resp = response.into_inner();
+        Ok(proto_resp.into())
+    }
+
+    /// Fetches a base backup.
+    pub async fn get_base_backup(
+        &mut self,
+        req: model::GetBaseBackupRequest,
+    ) -> Result<impl Stream<Item = Result<Bytes, tonic::Status>>, tonic::Status> {
+        let proto_req = proto::GetBaseBackupRequest::from(req);
+
+        let response_stream: Streaming<proto::GetBaseBackupResponseChunk> =
+            self.client.get_base_backup(proto_req).await?.into_inner();
+
+        // TODO: Consider dechunking internally
+        let domain_stream = response_stream.map(|chunk_res| {
+            chunk_res.and_then(|proto_chunk| {
+                proto_chunk.try_into().map_err(|e| {
+                    tonic::Status::internal(format!("Failed to convert response chunk: {}", e))
+                })
+            })
+        });
+
+        Ok(domain_stream)
+    }
+
+    /// Returns the total size of a database, as # of bytes.
+    pub async fn get_db_size(
+        &mut self,
+        req: model::GetDbSizeRequest,
+    ) -> Result<u64, tonic::Status> {
+        let proto_req = proto::GetDbSizeRequest::from(req);
+
+        let response = self.client.get_db_size(proto_req).await?;
+        Ok(response.into_inner().into())
+    }
+
+    /// Fetches pages.
+    ///
+    /// This is implemented as a bidirectional streaming RPC for performance.
+    /// Per-request errors are often returned as status_code instead of errors,
+    /// to avoid tearing down the entire stream via tonic::Status.
+    pub async fn get_pages<ReqSt>(
+        &mut self,
+        inbound: ReqSt,
+    ) -> Result<
+        impl Stream<Item = Result<model::GetPageResponse, tonic::Status>> + Send + 'static,
+        tonic::Status,
+    >
+    where
+        ReqSt: Stream<Item = model::GetPageRequest> + Send + 'static,
+    {
+        let outbound_proto = inbound.map(|domain_req| domain_req.into());
+
+        let req_new = Request::new(outbound_proto);
+
+        let response_stream: Streaming<proto::GetPageResponse> =
+            self.client.get_pages(req_new).await?.into_inner();
+
+        let domain_stream = response_stream.map_ok(model::GetPageResponse::from);
+
+        Ok(domain_stream)
+    }
+
+    /// Returns the size of a relation, as # of blocks.
+    pub async fn get_rel_size(
+        &mut self,
+        req: model::GetRelSizeRequest,
+    ) -> Result<model::GetRelSizeResponse, tonic::Status> {
+        let proto_req = proto::GetRelSizeRequest::from(req);
+        let response = self.client.get_rel_size(proto_req).await?;
+        let proto_resp = response.into_inner();
+        Ok(proto_resp.into())
+    }
+
+    /// Fetches an SLRU segment.
+    pub async fn get_slru_segment(
+        &mut self,
+        req: model::GetSlruSegmentRequest,
+    ) -> Result<model::GetSlruSegmentResponse, tonic::Status> {
+        let proto_req = proto::GetSlruSegmentRequest::from(req);
+        let response = self.client.get_slru_segment(proto_req).await?;
+        Ok(response.into_inner().try_into()?)
+    }
+}
--- a/pageserver/page_api/src/lib.rs
+++ b/pageserver/page_api/src/lib.rs
@@ -18,6 +18,8 @@ pub mod proto {
    pub use page_service_server::{PageService, PageServiceServer};
 }

+mod client;
+pub use client::Client;
 mod model;

 pub use model::*;
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -26,7 +26,7 @@ use utils::lsn::Lsn;
 use crate::proto;

 /// A protocol error. Typically returned via try_from() or try_into().
-#[derive(thiserror::Error, Debug)]
+#[derive(thiserror::Error, Clone, Debug)]
 pub enum ProtocolError {
    #[error("field '{0}' has invalid value '{1}'")]
    Invalid(&'static str, String),
@@ -182,34 +182,33 @@ impl From<CheckRelExistsResponse> for proto::CheckRelExistsResponse {
    }
 }

-/// Requests a base backup at a given LSN.
+/// Requests a base backup.
 #[derive(Clone, Copy, Debug)]
 pub struct GetBaseBackupRequest {
-    /// The LSN to fetch a base backup at.
-    pub read_lsn: ReadLsn,
+    /// The LSN to fetch a base backup at. If None, uses the latest LSN known to the Pageserver.
+    pub lsn: Option<Lsn>,
    /// If true, logical replication slots will not be created.
    pub replica: bool,
+    /// If true, include relation files in the base backup. Mainly for debugging and tests.
+    pub full: bool,
 }

-impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
-    type Error = ProtocolError;
-
-    fn try_from(pb: proto::GetBaseBackupRequest) -> Result<Self, Self::Error> {
-        Ok(Self {
-            read_lsn: pb
-                .read_lsn
-                .ok_or(ProtocolError::Missing("read_lsn"))?
-                .try_into()?,
+impl From<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
+    fn from(pb: proto::GetBaseBackupRequest) -> Self {
+        Self {
+            lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)),
            replica: pb.replica,
-        })
+            full: pb.full,
+        }
    }
 }

 impl From<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
    fn from(request: GetBaseBackupRequest) -> Self {
        Self {
-            read_lsn: Some(request.read_lsn.into()),
+            lsn: request.lsn.unwrap_or_default().0,
            replica: request.replica,
+            full: request.full,
        }
    }
 }
@@ -422,6 +421,39 @@ impl From<GetPageResponse> for proto::GetPageResponse {
    }
 }

+impl GetPageResponse {
+    /// Attempts to represent a tonic::Status as a GetPageResponse if appropriate. Returning a
+    /// tonic::Status will terminate the GetPage stream, so per-request errors are emitted as a
+    /// GetPageResponse with a non-OK status code instead.
+    #[allow(clippy::result_large_err)]
+    pub fn try_from_status(
+        status: tonic::Status,
+        request_id: RequestID,
+    ) -> Result<Self, tonic::Status> {
+        // We shouldn't see an OK status here, because we're emitting an error.
+        debug_assert_ne!(status.code(), tonic::Code::Ok);
+        if status.code() == tonic::Code::Ok {
+            return Err(tonic::Status::internal(format!(
+                "unexpected OK status: {status:?}",
+            )));
+        }
+
+        // If we can't convert the tonic::Code to a GetPageStatusCode, this is not a per-request
+        // error and we should return a tonic::Status to terminate the stream.
+        let Ok(status_code) = status.code().try_into() else {
+            return Err(status);
+        };
+
+        // Return a GetPageResponse for the status.
+        Ok(Self {
+            request_id,
+            status_code,
+            reason: Some(status.message().to_string()),
+            page_images: Vec::new(),
+        })
+    }
+}
+
 /// A GetPage response status code.
 ///
 /// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
@@ -485,8 +517,42 @@ impl From<GetPageStatusCode> for i32 {
    }
 }

+impl TryFrom<tonic::Code> for GetPageStatusCode {
+    type Error = tonic::Code;
+
+    fn try_from(code: tonic::Code) -> Result<Self, Self::Error> {
+        use tonic::Code;
+
+        let status_code = match code {
+            Code::Ok => Self::Ok,
+
+            // These are per-request errors, which should be returned as GetPageResponses.
+            Code::AlreadyExists => Self::InvalidRequest,
+            Code::DataLoss => Self::InternalError,
+            Code::FailedPrecondition => Self::InvalidRequest,
+            Code::InvalidArgument => Self::InvalidRequest,
+            Code::Internal => Self::InternalError,
+            Code::NotFound => Self::NotFound,
+            Code::OutOfRange => Self::InvalidRequest,
+            Code::ResourceExhausted => Self::SlowDown,
+
+            // These should terminate the stream by returning a tonic::Status.
+            Code::Aborted
+            | Code::Cancelled
+            | Code::DeadlineExceeded
+            | Code::PermissionDenied
+            | Code::Unauthenticated
+            | Code::Unavailable
+            | Code::Unimplemented
+            | Code::Unknown => return Err(code),
+        };
+        Ok(status_code)
+    }
+}
+
 // Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
 // shards will error.
+#[derive(Clone, Copy, Debug)]
 pub struct GetRelSizeRequest {
    pub read_lsn: ReadLsn,
    pub rel: RelTag,
@@ -530,6 +596,7 @@ impl From<GetRelSizeResponse> for proto::GetRelSizeResponse {
 }

 /// Requests an SLRU segment. Only valid on shard 0, other shards will error.
+#[derive(Clone, Copy, Debug)]
 pub struct GetSlruSegmentRequest {
    pub read_lsn: ReadLsn,
    pub kind: SlruKind,
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -12,7 +12,7 @@ use bytes::Bytes;
 use camino::Utf8PathBuf;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpaceAccum;
-use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
+use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::TenantShardId;
 use pageserver_page_api::proto;
--- a/pageserver/src/basebackup_cache.rs
+++ b/pageserver/src/basebackup_cache.rs
@@ -1,5 +1,6 @@
 use std::{collections::HashMap, sync::Arc};

+use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use camino::{Utf8Path, Utf8PathBuf};
 use metrics::core::{AtomicU64, GenericCounter};
@@ -18,7 +19,10 @@ use utils::{
 use crate::{
    basebackup::send_basebackup_tarball,
    context::{DownloadBehavior, RequestContext},
-    metrics::{BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ},
+    metrics::{
+        BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ,
+        BASEBACKUP_CACHE_SIZE,
+    },
    task_mgr::TaskKind,
    tenant::{
        Timeline,
@@ -35,8 +39,13 @@ pub struct BasebackupPrepareRequest {
 pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
 pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;

-type BasebackupRemoveEntrySender = UnboundedSender<Utf8PathBuf>;
-type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
+#[derive(Clone)]
+struct CacheEntry {
+    /// LSN at which the basebackup was taken.
+    lsn: Lsn,
+    /// Size of the basebackup archive in bytes.
+    size_bytes: u64,
+}

 /// BasebackupCache stores cached basebackup archives for timelines on local disk.
 ///
@@ -52,21 +61,12 @@ type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
 /// and ~1 RPS for get requests.
 pub struct BasebackupCache {
    data_dir: Utf8PathBuf,
-    config: BasebackupCacheConfig,
-    tenant_manager: Arc<TenantManager>,
-    remove_entry_sender: BasebackupRemoveEntrySender,

-    entries: std::sync::Mutex<HashMap<TenantTimelineId, Lsn>>,
-
-    cancel: CancellationToken,
+    entries: std::sync::Mutex<HashMap<TenantTimelineId, CacheEntry>>,

    read_hit_count: GenericCounter<AtomicU64>,
    read_miss_count: GenericCounter<AtomicU64>,
    read_err_count: GenericCounter<AtomicU64>,
-
-    prepare_ok_count: GenericCounter<AtomicU64>,
-    prepare_skip_count: GenericCounter<AtomicU64>,
-    prepare_err_count: GenericCounter<AtomicU64>,
 }

 impl BasebackupCache {
@@ -82,35 +82,32 @@ impl BasebackupCache {
        tenant_manager: Arc<TenantManager>,
        cancel: CancellationToken,
    ) -> Arc<Self> {
-        let (remove_entry_sender, remove_entry_receiver) = tokio::sync::mpsc::unbounded_channel();
-
-        let enabled = config.is_some();
-
        let cache = Arc::new(BasebackupCache {
            data_dir,
-            config: config.unwrap_or_default(),
-            tenant_manager,
-            remove_entry_sender,

            entries: std::sync::Mutex::new(HashMap::new()),

-            cancel,
-
            read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
            read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
            read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
-
-            prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
-            prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
-            prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
        });

-        if enabled {
-            runtime_handle.spawn(
-                cache
-                    .clone()
-                    .background(prepare_receiver, remove_entry_receiver),
-            );
+        if let Some(config) = config {
+            let background = BackgroundTask {
+                c: cache.clone(),
+
+                config,
+                tenant_manager,
+                cancel,
+
+                entry_count: 0,
+                total_size_bytes: 0,
+
+                prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
+                prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
+                prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
+            };
+            runtime_handle.spawn(background.run(prepare_receiver));
        }

        cache
@@ -128,7 +125,7 @@ impl BasebackupCache {
    ) -> Option<tokio::fs::File> {
        // Fast path. Check if the entry exists using the in-memory state.
        let tti = TenantTimelineId::new(tenant_id, timeline_id);
-        if self.entries.lock().unwrap().get(&tti) != Some(&lsn) {
+        if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) {
            self.read_miss_count.inc();
            return None;
        }
@@ -166,6 +163,42 @@ impl BasebackupCache {
        self.data_dir
            .join(Self::entry_filename(tenant_id, timeline_id, lsn))
    }
+}
+
+/// The background task that does the job to prepare basebackups
+/// and manage the cache entries on disk.
+/// It is a separate struct from BasebackupCache to allow holding
+/// a mutable reference to this state without a mutex lock,
+/// while BasebackupCache is referenced by the clients.
+struct BackgroundTask {
+    c: Arc<BasebackupCache>,
+
+    config: BasebackupCacheConfig,
+    tenant_manager: Arc<TenantManager>,
+    cancel: CancellationToken,
+
+    /// Number of the entries in the cache.
+    /// This counter is used for metrics and applying cache limits.
+    /// It generally should be equal to c.entries.len(), but it's calculated
+    /// pessimistically for abnormal situations: if we encountered some errors
+    /// during removing the entry from disk, we won't decrement this counter to
+    /// make sure that we don't exceed the limit with "trashed" files on the disk.
+    /// It will also count files in the data_dir that are not valid cache entries.
+    entry_count: usize,
+    /// Total size of all the entries on the disk.
+    /// This counter is used for metrics and applying cache limits.
+    /// Similar to entry_count, it is calculated pessimistically for abnormal situations.
+    total_size_bytes: u64,
+
+    prepare_ok_count: GenericCounter<AtomicU64>,
+    prepare_skip_count: GenericCounter<AtomicU64>,
+    prepare_err_count: GenericCounter<AtomicU64>,
+}
+
+impl BackgroundTask {
+    fn tmp_dir(&self) -> Utf8PathBuf {
+        self.c.data_dir.join("tmp")
+    }

    fn entry_tmp_path(
        &self,
@@ -173,9 +206,8 @@ impl BasebackupCache {
        timeline_id: TimelineId,
        lsn: Lsn,
    ) -> Utf8PathBuf {
-        self.data_dir
-            .join("tmp")
-            .join(Self::entry_filename(tenant_id, timeline_id, lsn))
+        self.tmp_dir()
+            .join(BasebackupCache::entry_filename(tenant_id, timeline_id, lsn))
    }

    fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> {
@@ -194,18 +226,21 @@ impl BasebackupCache {
        Some((tenant_id, timeline_id, lsn))
    }

-    async fn cleanup(&self) -> anyhow::Result<()> {
-        // Cleanup tmp directory.
-        let tmp_dir = self.data_dir.join("tmp");
-        let mut tmp_dir = tokio::fs::read_dir(&tmp_dir).await?;
-        while let Some(dir_entry) = tmp_dir.next_entry().await? {
-            if let Err(e) = tokio::fs::remove_file(dir_entry.path()).await {
-                tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e);
-            }
+    // Recreate the tmp directory to clear all files in it.
+    async fn clean_tmp_dir(&self) -> anyhow::Result<()> {
+        let tmp_dir = self.tmp_dir();
+        if tmp_dir.exists() {
+            tokio::fs::remove_dir_all(&tmp_dir).await?;
        }
+        tokio::fs::create_dir_all(&tmp_dir).await?;
+        Ok(())
+    }

-        // Remove outdated entries.
-        let entries_old = self.entries.lock().unwrap().clone();
+    async fn cleanup(&mut self) -> anyhow::Result<()> {
+        self.clean_tmp_dir().await?;
+
+        // Leave only up-to-date entries.
+        let entries_old = self.c.entries.lock().unwrap().clone();
        let mut entries_new = HashMap::new();
        for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() {
            if !tenant_shard_id.is_shard_zero() {
@@ -218,43 +253,42 @@ impl BasebackupCache {

            for timeline in tenant.list_timelines() {
                let tti = TenantTimelineId::new(tenant_id, timeline.timeline_id);
-                if let Some(&entry_lsn) = entries_old.get(&tti) {
-                    if timeline.get_last_record_lsn() <= entry_lsn {
-                        entries_new.insert(tti, entry_lsn);
+                if let Some(entry) = entries_old.get(&tti) {
+                    if timeline.get_last_record_lsn() <= entry.lsn {
+                        entries_new.insert(tti, entry.clone());
                    }
                }
            }
        }

-        for (&tti, &lsn) in entries_old.iter() {
+        // Try to remove all entries that are not up-to-date.
+        for (&tti, entry) in entries_old.iter() {
            if !entries_new.contains_key(&tti) {
-                self.remove_entry_sender
-                    .send(self.entry_path(tti.tenant_id, tti.timeline_id, lsn))
-                    .unwrap();
+                self.try_remove_entry(tti.tenant_id, tti.timeline_id, entry)
+                    .await;
            }
        }

-        BASEBACKUP_CACHE_ENTRIES.set(entries_new.len() as i64);
-        *self.entries.lock().unwrap() = entries_new;
+        // Note: BackgroundTask is the only writer for self.c.entries,
+        // so it couldn't have been modified concurrently.
+        *self.c.entries.lock().unwrap() = entries_new;

        Ok(())
    }

-    async fn on_startup(&self) -> anyhow::Result<()> {
-        // Create data_dir and tmp directory if they do not exist.
-        tokio::fs::create_dir_all(&self.data_dir.join("tmp"))
+    async fn on_startup(&mut self) -> anyhow::Result<()> {
+        // Create data_dir if it does not exist.
+        tokio::fs::create_dir_all(&self.c.data_dir)
            .await
-            .map_err(|e| {
-                anyhow::anyhow!(
-                    "Failed to create basebackup cache data_dir {:?}: {:?}",
-                    self.data_dir,
-                    e
-                )
-            })?;
+            .context("Failed to create basebackup cache data directory")?;
+
+        self.clean_tmp_dir()
+            .await
+            .context("Failed to clean tmp directory")?;

        // Read existing entries from the data_dir and add them to in-memory state.
-        let mut entries = HashMap::new();
-        let mut dir = tokio::fs::read_dir(&self.data_dir).await?;
+        let mut entries = HashMap::<TenantTimelineId, CacheEntry>::new();
+        let mut dir = tokio::fs::read_dir(&self.c.data_dir).await?;
        while let Some(dir_entry) = dir.next_entry().await? {
            let filename = dir_entry.file_name();

@@ -263,33 +297,43 @@ impl BasebackupCache {
                continue;
            }

+            let size_bytes = dir_entry
+                .metadata()
+                .await
+                .map_err(|e| {
+                    anyhow::anyhow!("Failed to read metadata for file {:?}: {:?}", filename, e)
+                })?
+                .len();
+
+            self.entry_count += 1;
+            BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
+
+            self.total_size_bytes += size_bytes;
+            BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
+
            let parsed = Self::parse_entry_filename(filename.to_string_lossy().as_ref());
            let Some((tenant_id, timeline_id, lsn)) = parsed else {
                tracing::warn!("Invalid basebackup cache file name: {:?}", filename);
                continue;
            };

+            let cur_entry = CacheEntry { lsn, size_bytes };
+
            let tti = TenantTimelineId::new(tenant_id, timeline_id);

            use std::collections::hash_map::Entry::*;

            match entries.entry(tti) {
                Occupied(mut entry) => {
-                    let entry_lsn = *entry.get();
+                    let found_entry = entry.get();
                    // Leave only the latest entry, remove the old one.
-                    if lsn < entry_lsn {
-                        self.remove_entry_sender.send(self.entry_path(
-                            tenant_id,
-                            timeline_id,
-                            lsn,
-                        ))?;
-                    } else if lsn > entry_lsn {
-                        self.remove_entry_sender.send(self.entry_path(
-                            tenant_id,
-                            timeline_id,
-                            entry_lsn,
-                        ))?;
-                        entry.insert(lsn);
+                    if cur_entry.lsn < found_entry.lsn {
+                        self.try_remove_entry(tenant_id, timeline_id, &cur_entry)
+                            .await;
+                    } else if cur_entry.lsn > found_entry.lsn {
+                        self.try_remove_entry(tenant_id, timeline_id, found_entry)
+                            .await;
+                        entry.insert(cur_entry);
                    } else {
                        // Two different filenames parsed to the same timline_id and LSN.
                        // Should never happen.
@@ -300,22 +344,17 @@ impl BasebackupCache {
                    }
                }
                Vacant(entry) => {
-                    entry.insert(lsn);
+                    entry.insert(cur_entry);
                }
            }
        }

-        BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
-        *self.entries.lock().unwrap() = entries;
+        *self.c.entries.lock().unwrap() = entries;

        Ok(())
    }

-    async fn background(
-        self: Arc<Self>,
-        mut prepare_receiver: BasebackupPrepareReceiver,
-        mut remove_entry_receiver: BasebackupRemoveEntryReceiver,
-    ) {
+    async fn run(mut self, mut prepare_receiver: BasebackupPrepareReceiver) {
        // Panic in the background is a safe fallback.
        // It will drop receivers and the cache will be effectively disabled.
        self.on_startup()
@@ -338,11 +377,6 @@ impl BasebackupCache {
                        continue;
                    }
                }
-                Some(req) = remove_entry_receiver.recv() => {
-                    if let Err(e) = tokio::fs::remove_file(req).await {
-                        tracing::warn!("Failed to remove basebackup cache file: {:#}", e);
-                    }
-                }
                _ = cleanup_ticker.tick() => {
                    self.cleanup().await.unwrap_or_else(|e| {
                        tracing::warn!("Failed to clean up basebackup cache: {:#}", e);
@@ -356,6 +390,67 @@ impl BasebackupCache {
        }
    }

+    /// Try to remove an entry from disk.
+    /// The caller is responsible for removing the entry from the in-memory state.
+    /// Updates size counters and corresponding metrics.
+    /// Ignores the filesystem errors as not-so-important, but the size counters
+    /// are not decremented in this case, so the file will continue to be counted
+    /// towards the size limits.
+    async fn try_remove_entry(
+        &mut self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        entry: &CacheEntry,
+    ) {
+        let entry_path = self.c.entry_path(tenant_id, timeline_id, entry.lsn);
+
+        match tokio::fs::remove_file(&entry_path).await {
+            Ok(_) => {}
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => {
+                tracing::warn!(
+                    "Failed to remove basebackup cache file for tenant {} timeline {} LSN {}: {:#}",
+                    tenant_id,
+                    timeline_id,
+                    entry.lsn,
+                    e
+                );
+                return;
+            }
+        }
+
+        self.entry_count -= 1;
+        BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
+
+        self.total_size_bytes -= entry.size_bytes;
+        BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
+    }
+
+    /// Insert the cache entry into in-memory state and update the size counters.
+    /// Assumes that the file for the entry already exists on disk.
+    /// If the entry already exists with previous LSN, it will be removed.
+    async fn upsert_entry(
+        &mut self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        entry: CacheEntry,
+    ) {
+        let tti = TenantTimelineId::new(tenant_id, timeline_id);
+
+        self.entry_count += 1;
+        BASEBACKUP_CACHE_ENTRIES.set(self.entry_count as u64);
+
+        self.total_size_bytes += entry.size_bytes;
+        BASEBACKUP_CACHE_SIZE.set(self.total_size_bytes);
+
+        let old_entry = self.c.entries.lock().unwrap().insert(tti, entry);
+
+        if let Some(old_entry) = old_entry {
+            self.try_remove_entry(tenant_id, timeline_id, &old_entry)
+                .await;
+        }
+    }
+
    /// Prepare a basebackup for the given timeline.
    ///
    /// If the basebackup already exists with a higher LSN or the timeline already
@@ -364,7 +459,7 @@ impl BasebackupCache {
    /// The basebackup is prepared in a temporary directory and then moved to the final
    /// location to make the operation atomic.
    async fn prepare_basebackup(
-        &self,
+        &mut self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        req_lsn: Lsn,
@@ -378,30 +473,44 @@ impl BasebackupCache {

        let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id);

+        // TODO(diko): I don't think we will hit the limit,
+        // but if we do, it makes sense to try to evict oldest entries. here
+        if self.entry_count >= self.config.max_size_entries {
+            tracing::info!(
+                %tenant_shard_id,
+                %timeline_id,
+                %req_lsn,
+                "Basebackup cache is full (max_size_entries), skipping basebackup",
+            );
+            self.prepare_skip_count.inc();
+            return Ok(());
+        }
+
+        if self.total_size_bytes >= self.config.max_total_size_bytes {
+            tracing::info!(
+                %tenant_shard_id,
+                %timeline_id,
+                %req_lsn,
+                "Basebackup cache is full (max_total_size_bytes), skipping basebackup",
+            );
+            self.prepare_skip_count.inc();
+            return Ok(());
+        }
+
        {
-            let entries = self.entries.lock().unwrap();
-            if let Some(&entry_lsn) = entries.get(&tti) {
-                if entry_lsn >= req_lsn {
+            let entries = self.c.entries.lock().unwrap();
+            if let Some(entry) = entries.get(&tti) {
+                if entry.lsn >= req_lsn {
                    tracing::info!(
                        %timeline_id,
                        %req_lsn,
-                        %entry_lsn,
+                        %entry.lsn,
                        "Basebackup entry already exists for timeline with higher LSN, skipping basebackup",
                    );
                    self.prepare_skip_count.inc();
                    return Ok(());
                }
            }
-
-            if entries.len() as i64 >= self.config.max_size_entries {
-                tracing::info!(
-                    %timeline_id,
-                    %req_lsn,
-                    "Basebackup cache is full, skipping basebackup",
-                );
-                self.prepare_skip_count.inc();
-                return Ok(());
-            }
        }

        let tenant = self
@@ -437,47 +546,52 @@ impl BasebackupCache {
            .prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn)
            .await;

-        if let Err(err) = res {
-            tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
-            // Try to clean up tmp file. If we fail, the background clean up task will take care of it.
-            match tokio::fs::remove_file(&entry_tmp_path).await {
-                Ok(_) => {}
-                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
-                Err(e) => {
-                    tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
+        let entry = match res {
+            Ok(entry) => entry,
+            Err(err) => {
+                tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
+                // Try to clean up tmp file. If we fail, the background clean up task will take care of it.
+                match tokio::fs::remove_file(&entry_tmp_path).await {
+                    Ok(_) => {}
+                    Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+                    Err(e) => {
+                        tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
+                    }
                }
+                return Err(err);
            }
-            return Err(err);
-        }
+        };

        // Move the tmp file to the final location atomically.
-        let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
+        // The tmp file is fsynced, so it's guaranteed that we will not have a partial file
+        // in the main directory.
+        // It's not necessary to fsync the inode after renaming, because the worst case is that
+        // the rename operation will be rolled back on the disk failure, the entry will disappear
+        // from the main directory, and the entry access will cause a cache miss.
+        let entry_path = self
+            .c
+            .entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
        tokio::fs::rename(&entry_tmp_path, &entry_path).await?;

-        let mut entries = self.entries.lock().unwrap();
-        if let Some(old_lsn) = entries.insert(tti, req_lsn) {
-            // Remove the old entry if it exists.
-            self.remove_entry_sender
-                .send(self.entry_path(tenant_shard_id.tenant_id, timeline_id, old_lsn))
-                .unwrap();
-        }
-        BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
+        self.upsert_entry(tenant_shard_id.tenant_id, timeline_id, entry)
+            .await;

        self.prepare_ok_count.inc();
        Ok(())
    }

    /// Prepares a basebackup in a temporary file.
+    /// Guarantees that the tmp file is fsynced before returning.
    async fn prepare_basebackup_tmp(
        &self,
-        emptry_tmp_path: &Utf8Path,
+        entry_tmp_path: &Utf8Path,
        timeline: &Arc<Timeline>,
        req_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<CacheEntry> {
        let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download);
        let ctx = ctx.with_scope_timeline(timeline);

-        let file = tokio::fs::File::create(emptry_tmp_path).await?;
+        let file = tokio::fs::File::create(entry_tmp_path).await?;
        let mut writer = BufWriter::new(file);

        let mut encoder = GzipEncoder::with_quality(
@@ -513,6 +627,12 @@ impl BasebackupCache {
        writer.flush().await?;
        writer.into_inner().sync_all().await?;

-        Ok(())
+        // TODO(diko): we can count it via Writer wrapper instead of a syscall.
+        let size_bytes = tokio::fs::metadata(entry_tmp_path).await?.len();
+
+        Ok(CacheEntry {
+            lsn: req_lsn,
+            size_bytes,
+        })
    }
 }
--- a/pageserver/src/bin/test_helper_slow_client_reads.rs
+++ b/pageserver/src/bin/test_helper_slow_client_reads.rs
@@ -2,7 +2,9 @@ use std::io::{Read, Write, stdin, stdout};
 use std::time::Duration;

 use clap::Parser;
-use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
+use pageserver_api::pagestream_api::{
+    PagestreamFeMessage, PagestreamRequest, PagestreamTestRequest,
+};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -28,17 +30,15 @@ async fn main() -> anyhow::Result<()> {
    let mut msg = 0;
    loop {
        msg += 1;
-        let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
-            PagestreamTestRequest {
-                hdr: PagestreamRequest {
-                    reqid: 0,
-                    request_lsn: Lsn(23),
-                    not_modified_since: Lsn(23),
-                },
-                batch_key: 42,
-                message: format!("message {}", msg),
+        let fut = sender.send(PagestreamFeMessage::Test(PagestreamTestRequest {
+            hdr: PagestreamRequest {
+                reqid: 0,
+                request_lsn: Lsn(23),
+                not_modified_since: Lsn(23),
            },
-        ));
+            batch_key: 42,
+            message: format!("message {}", msg),
+        }));
        let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
            eprintln!("pipe seems full");
            break;
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -159,14 +159,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
                Ok(m) => {
                    // Since we run one time at startup, be generous in our logging and
                    // dump all metadata.
-                    tracing::info!(
-                        "Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
-                        m.postgres_host,
-                        m.postgres_port,
-                        m.http_host,
-                        m.http_port,
-                        m.other
-                    );
+                    tracing::info!("Loaded node metadata: {m}");

                    let az_id = {
                        let az_id_from_metadata = m
@@ -195,6 +188,8 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
                        node_id: conf.id,
                        listen_pg_addr: m.postgres_host,
                        listen_pg_port: m.postgres_port,
+                        listen_grpc_addr: m.grpc_host,
+                        listen_grpc_port: m.grpc_port,
                        listen_http_addr: m.http_host,
                        listen_http_port: m.http_port,
                        listen_https_port: m.https_port,
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -73,6 +73,7 @@ use crate::tenant::remote_timeline_client::{
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
+use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
 use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
 use crate::tenant::timeline::{
    CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
@@ -1451,7 +1452,10 @@ async fn timeline_layer_scan_disposable_keys(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
        .with_scope_timeline(&timeline);

-    let guard = timeline.layers.read().await;
+    let guard = timeline
+        .layers
+        .read(LayerManagerLockHolder::GetLayerMapInfo)
+        .await;
    let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else {
        return Err(ApiError::NotFound(
            anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(),
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -4428,18 +4428,16 @@ pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!(
+pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
        "pageserver_basebackup_cache_entries_total",
        "Number of entries in the basebackup cache"
    )
    .expect("failed to define a metric")
 });

-// FIXME: Support basebackup cache size metrics.
-#[allow(dead_code)]
-pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!(
+pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
        "pageserver_basebackup_cache_size_bytes",
        "Total size of all basebackup cache entries on disk in bytes"
    )
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -14,7 +14,7 @@ use std::{io, str};

 use anyhow::{Context as _, anyhow, bail};
 use async_compression::tokio::write::GzipEncoder;
-use bytes::{Buf, BytesMut};
+use bytes::{Buf as _, BufMut as _, BytesMut};
 use futures::future::BoxFuture;
 use futures::{FutureExt, Stream};
 use itertools::Itertools;
@@ -25,12 +25,13 @@ use pageserver_api::config::{
    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
-use pageserver_api::models::{
-    self, PageTraceEvent, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
+use pageserver_api::models::{PageTraceEvent, TenantState};
+use pageserver_api::pagestream_api::{
+    self, PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetSlruSegmentRequest,
    PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, PagestreamNblocksResponse,
-    PagestreamProtocolVersion, PagestreamRequest, TenantState,
+    PagestreamProtocolVersion, PagestreamRequest,
 };
 use pageserver_api::reltag::SlruKind;
 use pageserver_api::shard::TenantShardId;
@@ -623,60 +624,6 @@ enum PageStreamError {
    BadRequest(Cow<'static, str>),
 }

-impl PageStreamError {
-    /// Converts a PageStreamError into a proto::GetPageResponse with the appropriate status
-    /// code, or a gRPC status if it should terminate the stream (e.g. shutdown). This is a
-    /// convenience method for use from a get_pages gRPC stream.
-    #[allow(clippy::result_large_err)]
-    fn into_get_page_response(
-        self,
-        request_id: page_api::RequestID,
-    ) -> Result<proto::GetPageResponse, tonic::Status> {
-        use page_api::GetPageStatusCode;
-        use tonic::Code;
-
-        // We dispatch to Into<tonic::Status> first, and then map it to a GetPageResponse.
-        let status: tonic::Status = self.into();
-        let status_code = match status.code() {
-            // We shouldn't see an OK status here, because we're emitting an error.
-            Code::Ok => {
-                debug_assert_ne!(status.code(), Code::Ok);
-                return Err(tonic::Status::internal(format!(
-                    "unexpected OK status: {status:?}",
-                )));
-            }
-
-            // These are per-request errors, returned as GetPageResponses.
-            Code::AlreadyExists => GetPageStatusCode::InvalidRequest,
-            Code::DataLoss => GetPageStatusCode::InternalError,
-            Code::FailedPrecondition => GetPageStatusCode::InvalidRequest,
-            Code::InvalidArgument => GetPageStatusCode::InvalidRequest,
-            Code::Internal => GetPageStatusCode::InternalError,
-            Code::NotFound => GetPageStatusCode::NotFound,
-            Code::OutOfRange => GetPageStatusCode::InvalidRequest,
-            Code::ResourceExhausted => GetPageStatusCode::SlowDown,
-
-            // These should terminate the stream.
-            Code::Aborted => return Err(status),
-            Code::Cancelled => return Err(status),
-            Code::DeadlineExceeded => return Err(status),
-            Code::PermissionDenied => return Err(status),
-            Code::Unauthenticated => return Err(status),
-            Code::Unavailable => return Err(status),
-            Code::Unimplemented => return Err(status),
-            Code::Unknown => return Err(status),
-        };
-
-        Ok(page_api::GetPageResponse {
-            request_id,
-            status_code,
-            reason: Some(status.message().to_string()),
-            page_images: Vec::new(),
-        }
-        .into())
-    }
-}
-
 impl From<PageStreamError> for tonic::Status {
    fn from(err: PageStreamError) -> Self {
        use tonic::Code;
@@ -766,7 +713,7 @@ struct BatchedGetPageRequest {

 #[cfg(feature = "testing")]
 struct BatchedTestRequest {
-    req: models::PagestreamTestRequest,
+    req: pagestream_api::PagestreamTestRequest,
    timer: SmgrOpTimer,
 }

@@ -780,13 +727,13 @@ enum BatchedFeMessage {
        span: Span,
        timer: SmgrOpTimer,
        shard: WeakHandle<TenantManagerTypes>,
-        req: models::PagestreamExistsRequest,
+        req: PagestreamExistsRequest,
    },
    Nblocks {
        span: Span,
        timer: SmgrOpTimer,
        shard: WeakHandle<TenantManagerTypes>,
-        req: models::PagestreamNblocksRequest,
+        req: PagestreamNblocksRequest,
    },
    GetPage {
        span: Span,
@@ -798,13 +745,13 @@ enum BatchedFeMessage {
        span: Span,
        timer: SmgrOpTimer,
        shard: WeakHandle<TenantManagerTypes>,
-        req: models::PagestreamDbSizeRequest,
+        req: PagestreamDbSizeRequest,
    },
    GetSlruSegment {
        span: Span,
        timer: SmgrOpTimer,
        shard: WeakHandle<TenantManagerTypes>,
-        req: models::PagestreamGetSlruSegmentRequest,
+        req: PagestreamGetSlruSegmentRequest,
    },
    #[cfg(feature = "testing")]
    Test {
@@ -2497,10 +2444,9 @@ impl PageServerHandler {
                .map(|(req, res)| {
                    res.map(|page| {
                        (
-                            PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse {
-                                req: req.req,
-                                page,
-                            }),
+                            PagestreamBeMessage::GetPage(
+                                pagestream_api::PagestreamGetPageResponse { req: req.req, page },
+                            ),
                            req.timer,
                            req.ctx,
                        )
@@ -2567,7 +2513,7 @@ impl PageServerHandler {
                .map(|(req, res)| {
                    res.map(|()| {
                        (
-                            PagestreamBeMessage::Test(models::PagestreamTestResponse {
+                            PagestreamBeMessage::Test(pagestream_api::PagestreamTestResponse {
                                req: req.req.clone(),
                            }),
                            req.timer,
@@ -3438,8 +3384,8 @@ impl GrpcPageServiceHandler {

    /// Processes a GetPage batch request, via the GetPages bidirectional streaming RPC.
    ///
-    /// NB: errors will terminate the stream. Per-request errors should return a GetPageResponse
-    /// with an appropriate status code instead.
+    /// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
+    /// GetPageResponse with an appropriate status code to avoid terminating the stream.
    ///
    /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
    /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
@@ -3456,7 +3402,7 @@ impl GrpcPageServiceHandler {
        let ctx = ctx.with_scope_page_service_pagestream(&timeline);

        // Validate the request, decorate the span, and convert it to a Pagestream request.
-        let req: page_api::GetPageRequest = req.try_into()?;
+        let req = page_api::GetPageRequest::try_from(req)?;

        span_record!(
            req_id = %req.request_id,
@@ -3467,7 +3413,7 @@ impl GrpcPageServiceHandler {
        );

        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); // hold guard
-        let effective_lsn = match PageServerHandler::effective_request_lsn(
+        let effective_lsn = PageServerHandler::effective_request_lsn(
            &timeline,
            timeline.get_last_record_lsn(),
            req.read_lsn.request_lsn,
@@ -3475,10 +3421,7 @@ impl GrpcPageServiceHandler {
                .not_modified_since_lsn
                .unwrap_or(req.read_lsn.request_lsn),
            &latest_gc_cutoff_lsn,
-        ) {
-            Ok(lsn) => lsn,
-            Err(err) => return err.into_get_page_response(req.request_id),
-        };
+        )?;

        let mut batch = SmallVec::with_capacity(req.block_numbers.len());
        for blkno in req.block_numbers {
@@ -3535,7 +3478,7 @@ impl GrpcPageServiceHandler {
                        "unexpected response: {resp:?}"
                    )));
                }
-                Err(err) => return err.err.into_get_page_response(req.request_id),
+                Err(err) => return Err(err.err.into()),
            };
        }

@@ -3601,44 +3544,43 @@ impl proto::PageService for GrpcPageServiceHandler {
        let timeline = self.get_request_timeline(&req).await?;
        let ctx = self.ctx.with_scope_timeline(&timeline);

-        // Validate the request, decorate the span, and wait for the LSN to arrive.
-        //
-        // TODO: this requires a read LSN, is that ok?
+        // Validate the request and decorate the span.
        Self::ensure_shard_zero(&timeline)?;
        if timeline.is_archived() == Some(true) {
            return Err(tonic::Status::failed_precondition("timeline is archived"));
        }
-        let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?;
+        let req: page_api::GetBaseBackupRequest = req.into_inner().into();

-        span_record!(lsn=%req.read_lsn);
+        span_record!(lsn=?req.lsn);

-        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
-        timeline
-            .wait_lsn(
-                req.read_lsn.request_lsn,
-                WaitLsnWaiter::PageService,
-                WaitLsnTimeout::Default,
-                &ctx,
-            )
-            .await?;
-        timeline
-            .check_lsn_is_in_scope(req.read_lsn.request_lsn, &latest_gc_cutoff_lsn)
-            .map_err(|err| {
-                tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}"))
-            })?;
+        // Wait for the LSN to arrive, if given.
+        if let Some(lsn) = req.lsn {
+            let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
+            timeline
+                .wait_lsn(
+                    lsn,
+                    WaitLsnWaiter::PageService,
+                    WaitLsnTimeout::Default,
+                    &ctx,
+                )
+                .await?;
+            timeline
+                .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
+                .map_err(|err| {
+                    tonic::Status::invalid_argument(format!("invalid basebackup LSN: {err}"))
+                })?;
+        }

        // Spawn a task to run the basebackup.
-        //
-        // TODO: do we need to support full base backups, for debugging?
        let span = Span::current();
        let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE);
        let jh = tokio::spawn(async move {
            let result = basebackup::send_basebackup_tarball(
                &mut simplex_write,
                &timeline,
-                Some(req.read_lsn.request_lsn),
+                req.lsn,
                None,
-                false,
+                req.full,
                req.replica,
                &ctx,
            )
@@ -3652,20 +3594,21 @@ impl proto::PageService for GrpcPageServiceHandler {

        // Emit chunks of size CHUNK_SIZE.
        let chunks = async_stream::try_stream! {
-            let mut chunk = BytesMut::with_capacity(CHUNK_SIZE);
            loop {
-                let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| {
-                    tonic::Status::internal(format!("failed to read basebackup chunk: {err}"))
-                })?;
-
-                // If we read 0 bytes, either the chunk is full or the stream is closed.
-                if n == 0 {
-                    if chunk.is_empty() {
-                        break;
+                let mut chunk = BytesMut::with_capacity(CHUNK_SIZE).limit(CHUNK_SIZE);
+                loop {
+                    let n = simplex_read.read_buf(&mut chunk).await.map_err(|err| {
+                        tonic::Status::internal(format!("failed to read basebackup chunk: {err}"))
+                    })?;
+                    if n == 0 {
+                        break; // full chunk or closed stream
                    }
-                    yield proto::GetBaseBackupResponseChunk::from(chunk.clone().freeze());
-                    chunk.clear();
                }
+                let chunk = chunk.into_inner().freeze();
+                if chunk.is_empty() {
+                    break;
+                }
+                yield proto::GetBaseBackupResponseChunk::from(chunk);
            }
            // Wait for the basebackup task to exit and check for errors.
            jh.await.map_err(|err| {
@@ -3742,9 +3685,16 @@ impl proto::PageService for GrpcPageServiceHandler {
                .await?
                .downgrade();
            while let Some(req) = reqs.message().await? {
-                yield Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
+                let req_id = req.request_id;
+                let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
                    .instrument(span.clone()) // propagate request span
-                    .await?
+                    .await;
+                yield match result {
+                    Ok(resp) => resp,
+                    // Convert per-request errors to GetPageResponses as appropriate, or terminate
+                    // the stream with a tonic::Status.
+                    Err(err) => page_api::GetPageResponse::try_from_status(err, req_id)?.into(),
+                }
            }
        };

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -51,6 +51,7 @@ use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
 use storage_broker::BrokerClientChannel;
 use timeline::compaction::{CompactionOutcome, GcCompactionQueue};
 use timeline::import_pgdata::ImportingTimeline;
+use timeline::layer_manager::LayerManagerLockHolder;
 use timeline::offload::{OffloadError, offload_timeline};
 use timeline::{
    CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata,
@@ -1315,7 +1316,7 @@ impl TenantShard {
                        ancestor.is_some()
                            || timeline
                                .layers
-                                .read()
+                                .read(LayerManagerLockHolder::LoadLayerMap)
                                .await
                                .layer_map()
                                .expect(
@@ -2643,7 +2644,7 @@ impl TenantShard {
        }
        let layer_names = tline
            .layers
-            .read()
+            .read(LayerManagerLockHolder::Testing)
            .await
            .layer_map()
            .unwrap()
@@ -3158,7 +3159,12 @@ impl TenantShard {

        for timeline in &compact {
            // Collect L0 counts. Can't await while holding lock above.
-            if let Ok(lm) = timeline.layers.read().await.layer_map() {
+            if let Ok(lm) = timeline
+                .layers
+                .read(LayerManagerLockHolder::Compaction)
+                .await
+                .layer_map()
+            {
                l0_counts.insert(timeline.timeline_id, lm.level0_deltas().len());
            }
        }
@@ -4900,7 +4906,7 @@ impl TenantShard {
        }
        let layer_names = tline
            .layers
-            .read()
+            .read(LayerManagerLockHolder::Testing)
            .await
            .layer_map()
            .unwrap()
@@ -6970,7 +6976,7 @@ mod tests {
            .await?;
        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

-        let layer_map = tline.layers.read().await;
+        let layer_map = tline.layers.read(LayerManagerLockHolder::Testing).await;
        let level0_deltas = layer_map
            .layer_map()?
            .level0_deltas()
@@ -7206,7 +7212,7 @@ mod tests {
        let lsn = Lsn(0x10);
        let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;

-        let guard = tline.layers.read().await;
+        let guard = tline.layers.read(LayerManagerLockHolder::Testing).await;
        let lm = guard.layer_map()?;

        lm.dump(true, &ctx).await?;
@@ -8234,12 +8240,23 @@ mod tests {
            tline.freeze_and_flush().await?; // force create a delta layer
        }

-        let before_num_l0_delta_files =
-            tline.layers.read().await.layer_map()?.level0_deltas().len();
+        let before_num_l0_delta_files = tline
+            .layers
+            .read(LayerManagerLockHolder::Testing)
+            .await
+            .layer_map()?
+            .level0_deltas()
+            .len();

        tline.compact(&cancel, EnumSet::default(), &ctx).await?;

-        let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
+        let after_num_l0_delta_files = tline
+            .layers
+            .read(LayerManagerLockHolder::Testing)
+            .await
+            .layer_map()?
+            .level0_deltas()
+            .len();

        assert!(
            after_num_l0_delta_files < before_num_l0_delta_files,
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -61,8 +61,8 @@ pub(crate) struct LocationConf {
    /// The detailed shard identity.  This structure is already scoped within
    /// a TenantShardId, but we need the full ShardIdentity to enable calculating
    /// key->shard mappings.
+    // TODO(vlad): Remove this default once all configs have a shard identity on disk.
    #[serde(default = "ShardIdentity::unsharded")]
-    #[serde(skip_serializing_if = "ShardIdentity::is_unsharded")]
    pub(crate) shard: ShardIdentity,

    /// The pan-cluster tenant configuration, the same on all locations
@@ -149,7 +149,12 @@ impl LocationConf {
    /// For use when attaching/re-attaching: update the generation stored in this
    /// structure.  If we were in a secondary state, promote to attached (posession
    /// of a fresh generation implies this).
-    pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) {
+    pub(crate) fn attach_in_generation(
+        &mut self,
+        mode: AttachmentMode,
+        generation: Generation,
+        stripe_size: ShardStripeSize,
+    ) {
        match &mut self.mode {
            LocationMode::Attached(attach_conf) => {
                attach_conf.generation = generation;
@@ -163,6 +168,8 @@ impl LocationConf {
                })
            }
        }
+
+        self.shard.stripe_size = stripe_size;
    }

    pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result<Self> {
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -51,6 +51,7 @@ use crate::tenant::config::{
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
+use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
 use crate::tenant::{
    AttachedTenantConf, GcError, LoadConfigError, SpawnMode, TenantShard, TenantState,
 };
@@ -128,7 +129,7 @@ pub(crate) enum ShardSelector {
 ///
 /// This represents the subset of a LocationConfig that we receive during re-attach.
 pub(crate) enum TenantStartupMode {
-    Attached((AttachmentMode, Generation)),
+    Attached((AttachmentMode, Generation, ShardStripeSize)),
    Secondary,
 }

@@ -142,15 +143,21 @@ impl TenantStartupMode {
        match (rart.mode, rart.r#gen) {
            (LocationConfigMode::Detached, _) => None,
            (LocationConfigMode::Secondary, _) => Some(Self::Secondary),
-            (LocationConfigMode::AttachedMulti, Some(g)) => {
-                Some(Self::Attached((AttachmentMode::Multi, Generation::new(g))))
-            }
-            (LocationConfigMode::AttachedSingle, Some(g)) => {
-                Some(Self::Attached((AttachmentMode::Single, Generation::new(g))))
-            }
-            (LocationConfigMode::AttachedStale, Some(g)) => {
-                Some(Self::Attached((AttachmentMode::Stale, Generation::new(g))))
-            }
+            (LocationConfigMode::AttachedMulti, Some(g)) => Some(Self::Attached((
+                AttachmentMode::Multi,
+                Generation::new(g),
+                rart.stripe_size,
+            ))),
+            (LocationConfigMode::AttachedSingle, Some(g)) => Some(Self::Attached((
+                AttachmentMode::Single,
+                Generation::new(g),
+                rart.stripe_size,
+            ))),
+            (LocationConfigMode::AttachedStale, Some(g)) => Some(Self::Attached((
+                AttachmentMode::Stale,
+                Generation::new(g),
+                rart.stripe_size,
+            ))),
            _ => {
                tracing::warn!(
                    "Received invalid re-attach state for tenant {}: {rart:?}",
@@ -318,9 +325,11 @@ fn emergency_generations(
            Some((
                *tid,
                match &lc.mode {
-                    LocationMode::Attached(alc) => {
-                        TenantStartupMode::Attached((alc.attach_mode, alc.generation))
-                    }
+                    LocationMode::Attached(alc) => TenantStartupMode::Attached((
+                        alc.attach_mode,
+                        alc.generation,
+                        ShardStripeSize::default(),
+                    )),
                    LocationMode::Secondary(_) => TenantStartupMode::Secondary,
                },
            ))
@@ -364,7 +373,7 @@ async fn init_load_generations(
        .iter()
        .flat_map(|(id, start_mode)| {
            match start_mode {
-                TenantStartupMode::Attached((_mode, generation)) => Some(generation),
+                TenantStartupMode::Attached((_mode, generation, _stripe_size)) => Some(generation),
                TenantStartupMode::Secondary => None,
            }
            .map(|gen_| (*id, *gen_))
@@ -584,7 +593,7 @@ pub async fn init_tenant_mgr(
                        location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
                    }
                }
-                Some(TenantStartupMode::Attached((attach_mode, generation))) => {
+                Some(TenantStartupMode::Attached((attach_mode, generation, stripe_size))) => {
                    let old_gen_higher = match &location_conf.mode {
                        LocationMode::Attached(AttachedLocationConfig {
                            generation: old_generation,
@@ -608,7 +617,7 @@ pub async fn init_tenant_mgr(
                        // local disk content: demote to secondary rather than detaching.
                        location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF);
                    } else {
-                        location_conf.attach_in_generation(*attach_mode, *generation);
+                        location_conf.attach_in_generation(*attach_mode, *generation, *stripe_size);
                    }
                }
            }
@@ -1658,7 +1667,10 @@ impl TenantManager {
            let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
            for timeline in timelines.values() {
                tracing::info!(timeline_id=%timeline.timeline_id, "Loading list of layers to hardlink");
-                let layers = timeline.layers.read().await;
+                let layers = timeline
+                    .layers
+                    .read(LayerManagerLockHolder::GetLayerMapInfo)
+                    .await;

                for layer in layers.likely_resident_layers() {
                    let relative_path = layer
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -1,6 +1,7 @@
 //! Helper functions to upload files to remote storage with a RemoteStorage

 use std::io::{ErrorKind, SeekFrom};
+use std::num::NonZeroU32;
 use std::time::SystemTime;

 use anyhow::{Context, bail};
@@ -228,11 +229,25 @@ pub(crate) async fn time_travel_recover_tenant(
        let timelines_path = super::remote_timelines_path(tenant_shard_id);
        prefixes.push(timelines_path);
    }
+
+    // Limit the number of versions deletions, mostly so that we don't
+    // keep requesting forever if the list is too long, as we'd put the
+    // list in RAM.
+    // Building a list of 100k entries that reaches the limit roughly takes
+    // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
+    const COMPLEXITY_LIMIT: Option<NonZeroU32> = NonZeroU32::new(100_000);
+
    for prefix in &prefixes {
        backoff::retry(
            || async {
                storage
-                    .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel)
+                    .time_travel_recover(
+                        Some(prefix),
+                        timestamp,
+                        done_if_after,
+                        cancel,
+                        COMPLEXITY_LIMIT,
+                    )
                    .await
            },
            |e| !matches!(e, TimeTravelError::Other(_)),
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1635,6 +1635,7 @@ pub(crate) mod test {
    use crate::tenant::disk_btree::tests::TestDisk;
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::{Layer, ResidentLayer};
+    use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
    use crate::tenant::{TenantShard, Timeline};

    /// Construct an index for a fictional delta layer and and then
@@ -2002,7 +2003,7 @@ pub(crate) mod test {

        let initdb_layer = timeline
            .layers
-            .read()
+            .read(crate::tenant::timeline::layer_manager::LayerManagerLockHolder::Testing)
            .await
            .likely_resident_layers()
            .next()
@@ -2078,7 +2079,7 @@ pub(crate) mod test {

        let new_layer = timeline
            .layers
-            .read()
+            .read(LayerManagerLockHolder::Testing)
            .await
            .likely_resident_layers()
            .find(|&x| x != &initdb_layer)
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -10,6 +10,7 @@ use super::*;
 use crate::context::DownloadBehavior;
 use crate::tenant::harness::{TenantHarness, test_img};
 use crate::tenant::storage_layer::{IoConcurrency, LayerVisibilityHint};
+use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;

 /// Used in tests to advance a future to wanted await point, and not futher.
 const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);
@@ -59,7 +60,7 @@ async fn smoke_test() {
    // there to avoid the timeline being illegally empty
    let (layer, dummy_layer) = {
        let mut layers = {
-            let layers = timeline.layers.read().await;
+            let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
        };

@@ -215,7 +216,7 @@ async fn smoke_test() {

    // Simulate GC removing our test layer.
    {
-        let mut g = timeline.layers.write().await;
+        let mut g = timeline.layers.write(LayerManagerLockHolder::Testing).await;

        let layers = &[layer];
        g.open_mut().unwrap().finish_gc_timeline(layers);
@@ -261,7 +262,7 @@ async fn evict_and_wait_on_wanted_deleted() {

    let layer = {
        let mut layers = {
-            let layers = timeline.layers.read().await;
+            let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
        };

@@ -305,7 +306,7 @@ async fn evict_and_wait_on_wanted_deleted() {
    // assert that once we remove the `layer` from the layer map and drop our reference,
    // the deletion of the layer in remote_storage happens.
    {
-        let mut layers = timeline.layers.write().await;
+        let mut layers = timeline.layers.write(LayerManagerLockHolder::Testing).await;
        layers.open_mut().unwrap().finish_gc_timeline(&[layer]);
    }

@@ -347,7 +348,7 @@ fn read_wins_pending_eviction() {

        let layer = {
            let mut layers = {
-                let layers = timeline.layers.read().await;
+                let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
                layers.likely_resident_layers().cloned().collect::<Vec<_>>()
            };

@@ -480,7 +481,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {

        let layer = {
            let mut layers = {
-                let layers = timeline.layers.read().await;
+                let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
                layers.likely_resident_layers().cloned().collect::<Vec<_>>()
            };

@@ -655,7 +656,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {

    let layer = {
        let mut layers = {
-            let layers = timeline.layers.read().await;
+            let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
        };

@@ -741,7 +742,7 @@ async fn evict_and_wait_does_not_wait_for_download() {

    let layer = {
        let mut layers = {
-            let layers = timeline.layers.read().await;
+            let layers = timeline.layers.read(LayerManagerLockHolder::Testing).await;
            layers.likely_resident_layers().cloned().collect::<Vec<_>>()
        };

@@ -862,7 +863,7 @@ async fn eviction_cancellation_on_drop() {

    let (evicted_layer, not_evicted) = {
        let mut layers = {
-            let mut guard = timeline.layers.write().await;
+            let mut guard = timeline.layers.write(LayerManagerLockHolder::Testing).await;
            let layers = guard.likely_resident_layers().cloned().collect::<Vec<_>>();
            // remove the layers from layermap
            guard.open_mut().unwrap().finish_gc_timeline(&layers);
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -35,7 +35,11 @@ use fail::fail_point;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt, StreamExt};
 use handle::ShardTimelineId;
-use layer_manager::Shutdown;
+use layer_manager::{
+    LayerManagerLockHolder, LayerManagerReadGuard, LayerManagerWriteGuard, LockedLayerManager,
+    Shutdown,
+};
+
 use offload::OffloadError;
 use once_cell::sync::Lazy;
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
@@ -82,7 +86,6 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
 use self::delete::DeleteTimelineFlow;
 pub(super) use self::eviction_task::EvictionTaskTenantState;
 use self::eviction_task::EvictionTaskTimelineState;
-use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
 use super::remote_timeline_client::RemoteTimelineClient;
@@ -181,13 +184,13 @@ impl std::fmt::Display for ImageLayerCreationMode {

 /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
 /// Can be removed after all refactors are done.
-fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
+fn drop_layer_manager_rlock(rlock: LayerManagerReadGuard<'_>) {
    drop(rlock)
 }

 /// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
 /// Can be removed after all refactors are done.
-fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
+fn drop_layer_manager_wlock(rlock: LayerManagerWriteGuard<'_>) {
    drop(rlock)
 }

@@ -241,7 +244,7 @@ pub struct Timeline {
    ///
    /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
    /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
-    pub(crate) layers: tokio::sync::RwLock<LayerManager>,
+    pub(crate) layers: LockedLayerManager,

    last_freeze_at: AtomicLsn,
    // Atomic would be more appropriate here.
@@ -1535,7 +1538,10 @@ impl Timeline {
    /// This method makes no distinction between local and remote layers.
    /// Hence, the result **does not represent local filesystem usage**.
    pub(crate) async fn layer_size_sum(&self) -> u64 {
-        let guard = self.layers.read().await;
+        let guard = self
+            .layers
+            .read(LayerManagerLockHolder::GetLayerMapInfo)
+            .await;
        guard.layer_size_sum()
    }

@@ -1845,7 +1851,7 @@ impl Timeline {
        // time, and this was missed.
        // if write_guard.is_none() { return; }

-        let Ok(layers_guard) = self.layers.try_read() else {
+        let Ok(layers_guard) = self.layers.try_read(LayerManagerLockHolder::TryFreezeLayer) else {
            // Don't block if the layer lock is busy
            return;
        };
@@ -2158,7 +2164,7 @@ impl Timeline {
        if let ShutdownMode::FreezeAndFlush = mode {
            let do_flush = if let Some((open, frozen)) = self
                .layers
-                .read()
+                .read(LayerManagerLockHolder::Shutdown)
                .await
                .layer_map()
                .map(|lm| (lm.open_layer.is_some(), lm.frozen_layers.len()))
@@ -2262,7 +2268,10 @@ impl Timeline {
            // Allow any remaining in-memory layers to do cleanup -- until that, they hold the gate
            // open.
            let mut write_guard = self.write_lock.lock().await;
-            self.layers.write().await.shutdown(&mut write_guard);
+            self.layers
+                .write(LayerManagerLockHolder::Shutdown)
+                .await
+                .shutdown(&mut write_guard);
        }

        // Finally wait until any gate-holders are complete.
@@ -2365,7 +2374,10 @@ impl Timeline {
        &self,
        reset: LayerAccessStatsReset,
    ) -> Result<LayerMapInfo, layer_manager::Shutdown> {
-        let guard = self.layers.read().await;
+        let guard = self
+            .layers
+            .read(LayerManagerLockHolder::GetLayerMapInfo)
+            .await;
        let layer_map = guard.layer_map()?;
        let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
        if let Some(open_layer) = &layer_map.open_layer {
@@ -3232,7 +3244,7 @@ impl Timeline {

    /// Initialize with an empty layer map. Used when creating a new timeline.
    pub(super) fn init_empty_layer_map(&self, start_lsn: Lsn) {
-        let mut layers = self.layers.try_write().expect(
+        let mut layers = self.layers.try_write(LayerManagerLockHolder::Init).expect(
            "in the context where we call this function, no other task has access to the object",
        );
        layers
@@ -3252,7 +3264,10 @@ impl Timeline {
        use init::Decision::*;
        use init::{Discovered, DismissedLayer};

-        let mut guard = self.layers.write().await;
+        let mut guard = self
+            .layers
+            .write(LayerManagerLockHolder::LoadLayerMap)
+            .await;

        let timer = self.metrics.load_layer_map_histo.start_timer();

@@ -3869,7 +3884,10 @@ impl Timeline {
        &self,
        layer_name: &LayerName,
    ) -> Result<Option<Layer>, layer_manager::Shutdown> {
-        let guard = self.layers.read().await;
+        let guard = self
+            .layers
+            .read(LayerManagerLockHolder::GetLayerMapInfo)
+            .await;
        let layer = guard
            .layer_map()?
            .iter_historic_layers()
@@ -3902,7 +3920,10 @@ impl Timeline {
            return None;
        }

-        let guard = self.layers.read().await;
+        let guard = self
+            .layers
+            .read(LayerManagerLockHolder::GenerateHeatmap)
+            .await;

        // Firstly, if there's any heatmap left over from when this location
        // was a secondary, take that into account. Keep layers that are:
@@ -4000,7 +4021,10 @@ impl Timeline {
    }

    pub(super) async fn generate_unarchival_heatmap(&self, end_lsn: Lsn) -> PreviousHeatmap {
-        let guard = self.layers.read().await;
+        let guard = self
+            .layers
+            .read(LayerManagerLockHolder::GenerateHeatmap)
+            .await;

        let now = SystemTime::now();
        let mut heatmap_layers = Vec::default();
@@ -4342,7 +4366,7 @@ impl Timeline {
        query: &VersionedKeySpaceQuery,
    ) -> Result<LayerFringe, GetVectoredError> {
        let mut fringe = LayerFringe::new();
-        let guard = self.layers.read().await;
+        let guard = self.layers.read(LayerManagerLockHolder::GetPage).await;

        match query {
            VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
@@ -4445,7 +4469,7 @@ impl Timeline {
            // required for correctness, but avoids visiting extra layers
            // which turns out to be a perf bottleneck in some cases.
            if !unmapped_keyspace.is_empty() {
-                let guard = timeline.layers.read().await;
+                let guard = timeline.layers.read(LayerManagerLockHolder::GetPage).await;
                guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?;

                // It's safe to drop the layer map lock after planning the next round of reads.
@@ -4555,7 +4579,10 @@ impl Timeline {
        _guard: &tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<InMemoryLayer>> {
-        let mut guard = self.layers.write().await;
+        let mut guard = self
+            .layers
+            .write(LayerManagerLockHolder::GetLayerForWrite)
+            .await;

        let last_record_lsn = self.get_last_record_lsn();
        ensure!(
@@ -4597,7 +4624,10 @@ impl Timeline {
        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
    ) -> Result<u64, FlushLayerError> {
        let frozen = {
-            let mut guard = self.layers.write().await;
+            let mut guard = self
+                .layers
+                .write(LayerManagerLockHolder::TryFreezeLayer)
+                .await;
            guard
                .open_mut()?
                .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock, &self.metrics)
@@ -4638,7 +4668,12 @@ impl Timeline {
        ctx: &RequestContext,
    ) {
        // Subscribe to L0 delta layer updates, for compaction backpressure.
-        let mut watch_l0 = match self.layers.read().await.layer_map() {
+        let mut watch_l0 = match self
+            .layers
+            .read(LayerManagerLockHolder::FlushLoop)
+            .await
+            .layer_map()
+        {
            Ok(lm) => lm.watch_level0_deltas(),
            Err(Shutdown) => return,
        };
@@ -4675,7 +4710,7 @@ impl Timeline {

                // Fetch the next layer to flush, if any.
                let (layer, l0_count, frozen_count, frozen_size) = {
-                    let layers = self.layers.read().await;
+                    let layers = self.layers.read(LayerManagerLockHolder::FlushLoop).await;
                    let Ok(lm) = layers.layer_map() else {
                        info!("dropping out of flush loop for timeline shutdown");
                        return;
@@ -4971,7 +5006,10 @@ impl Timeline {
        // in-memory layer from the map now. The flushed layer is stored in
        // the mapping in `create_delta_layer`.
        {
-            let mut guard = self.layers.write().await;
+            let mut guard = self
+                .layers
+                .write(LayerManagerLockHolder::FlushFrozenLayer)
+                .await;

            guard.open_mut()?.finish_flush_l0_layer(
                delta_layer_to_add.as_ref(),
@@ -5186,7 +5224,7 @@ impl Timeline {
    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
        let threshold = self.get_image_creation_threshold();

-        let guard = self.layers.read().await;
+        let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
        let Ok(layers) = guard.layer_map() else {
            return false;
        };
@@ -5604,7 +5642,7 @@ impl Timeline {
            if let ImageLayerCreationMode::Force = mode {
                // When forced to create image layers, we might try and create them where they already
                // exist.  This mode is only used in tests/debug.
-                let layers = self.layers.read().await;
+                let layers = self.layers.read(LayerManagerLockHolder::Compaction).await;
                if layers.contains_key(&PersistentLayerKey {
                    key_range: img_range.clone(),
                    lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn),
@@ -5729,7 +5767,7 @@ impl Timeline {

        let image_layers = batch_image_writer.finish(self, ctx).await?;

-        let mut guard = self.layers.write().await;
+        let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;

        // FIXME: we could add the images to be uploaded *before* returning from here, but right
        // now they are being scheduled outside of write lock; current way is inconsistent with
@@ -5737,7 +5775,7 @@ impl Timeline {
        guard
            .open_mut()?
            .track_new_image_layers(&image_layers, &self.metrics);
-        drop_wlock(guard);
+        drop_layer_manager_wlock(guard);
        let duration = timer.stop_and_record();

        // Creating image layers may have caused some previously visible layers to be covered
@@ -6107,7 +6145,7 @@ impl Timeline {
        layers_to_remove: &[Layer],
    ) -> Result<(), CompactionError> {
        let mut guard = tokio::select! {
-            guard = self.layers.write() => guard,
+            guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard,
            _ = self.cancel.cancelled() => {
                return Err(CompactionError::ShuttingDown);
            }
@@ -6156,7 +6194,7 @@ impl Timeline {
        self.remote_client
            .schedule_compaction_update(&remove_layers, new_deltas)?;

-        drop_wlock(guard);
+        drop_layer_manager_wlock(guard);

        Ok(())
    }
@@ -6166,7 +6204,7 @@ impl Timeline {
        mut replace_layers: Vec<(Layer, ResidentLayer)>,
        mut drop_layers: Vec<Layer>,
    ) -> Result<(), CompactionError> {
-        let mut guard = self.layers.write().await;
+        let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;

        // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want
        // to avoid double-removing, and avoid rewriting something that was removed.
@@ -6517,7 +6555,10 @@ impl Timeline {
        // 5. newer on-disk image layers cover the layer's whole key range
        //
        // TODO holding a write lock is too agressive and avoidable
-        let mut guard = self.layers.write().await;
+        let mut guard = self
+            .layers
+            .write(LayerManagerLockHolder::GarbageCollection)
+            .await;
        let layers = guard.layer_map()?;
        'outer: for l in layers.iter_historic_layers() {
            result.layers_total += 1;
@@ -6819,7 +6860,10 @@ impl Timeline {
        use pageserver_api::models::DownloadRemoteLayersTaskState;

        let remaining = {
-            let guard = self.layers.read().await;
+            let guard = self
+                .layers
+                .read(LayerManagerLockHolder::GetLayerMapInfo)
+                .await;
            let Ok(lm) = guard.layer_map() else {
                // technically here we could look into iterating accessible layers, but downloading
                // all layers of a shutdown timeline makes no sense regardless.
@@ -6925,7 +6969,7 @@ impl Timeline {
 impl Timeline {
    /// Returns non-remote layers for eviction.
    pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
-        let guard = self.layers.read().await;
+        let guard = self.layers.read(LayerManagerLockHolder::Eviction).await;
        let mut max_layer_size: Option<u64> = None;

        let resident_layers = guard
@@ -7026,7 +7070,7 @@ impl Timeline {
        let image_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
        info!("force created image layer {}", image_layer.local_path());
        {
-            let mut guard = self.layers.write().await;
+            let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
            guard
                .open_mut()
                .unwrap()
@@ -7089,7 +7133,7 @@ impl Timeline {
        let delta_layer = Layer::finish_creating(self.conf, self, desc, &path)?;
        info!("force created delta layer {}", delta_layer.local_path());
        {
-            let mut guard = self.layers.write().await;
+            let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
            guard
                .open_mut()
                .unwrap()
@@ -7184,7 +7228,7 @@ impl Timeline {

        // Link the layer to the layer map
        {
-            let mut guard = self.layers.write().await;
+            let mut guard = self.layers.write(LayerManagerLockHolder::Testing).await;
            let layer_map = guard.open_mut().unwrap();
            layer_map.force_insert_in_memory_layer(Arc::new(layer));
        }
@@ -7201,7 +7245,7 @@ impl Timeline {
        io_concurrency: IoConcurrency,
    ) -> anyhow::Result<Vec<(Key, Bytes)>> {
        let mut all_data = Vec::new();
-        let guard = self.layers.read().await;
+        let guard = self.layers.read(LayerManagerLockHolder::Testing).await;
        for layer in guard.layer_map()?.iter_historic_layers() {
            if !layer.is_delta() && layer.image_layer_lsn() == lsn {
                let layer = guard.get_from_desc(&layer);
@@ -7230,7 +7274,7 @@ impl Timeline {
        self: &Arc<Timeline>,
    ) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
        let mut layers = Vec::new();
-        let guard = self.layers.read().await;
+        let guard = self.layers.read(LayerManagerLockHolder::Testing).await;
        for layer in guard.layer_map()?.iter_historic_layers() {
            layers.push(layer.key());
        }
@@ -7342,7 +7386,7 @@ impl TimelineWriter<'_> {
        let l0_count = self
            .tl
            .layers
-            .read()
+            .read(LayerManagerLockHolder::GetLayerMapInfo)
            .await
            .layer_map()?
            .level0_deltas()
@@ -7561,6 +7605,7 @@ mod tests {
    use crate::tenant::harness::{TenantHarness, test_img};
    use crate::tenant::layer_map::LayerMap;
    use crate::tenant::storage_layer::{Layer, LayerName, LayerVisibilityHint};
+    use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
    use crate::tenant::timeline::{DeltaLayerTestDesc, EvictionError};
    use crate::tenant::{PreviousHeatmap, Timeline};

@@ -7668,7 +7713,7 @@ mod tests {
        // Evict all the layers and stash the old heatmap in the timeline.
        // This simulates a migration to a cold secondary location.

-        let guard = timeline.layers.read().await;
+        let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await;
        let mut all_layers = Vec::new();
        let forever = std::time::Duration::from_secs(120);
        for layer in guard.likely_resident_layers() {
@@ -7790,7 +7835,7 @@ mod tests {
            })));

        // Evict all the layers in the previous heatmap
-        let guard = timeline.layers.read().await;
+        let guard = timeline.layers.read(LayerManagerLockHolder::Testing).await;
        let forever = std::time::Duration::from_secs(120);
        for layer in guard.likely_resident_layers() {
            layer.evict_and_wait(forever).await.unwrap();
@@ -7853,7 +7898,10 @@ mod tests {
    }

    async fn find_some_layer(timeline: &Timeline) -> Layer {
-        let layers = timeline.layers.read().await;
+        let layers = timeline
+            .layers
+            .read(LayerManagerLockHolder::GetLayerMapInfo)
+            .await;
        let desc = layers
            .layer_map()
            .unwrap()
--- a/pageserver/src/tenant/timeline/analysis.rs
+++ b/pageserver/src/tenant/timeline/analysis.rs
@@ -4,6 +4,7 @@ use std::ops::Range;
 use utils::lsn::Lsn;

 use super::Timeline;
+use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;

 #[derive(serde::Serialize)]
 pub(crate) struct RangeAnalysis {
@@ -24,7 +25,10 @@ impl Timeline {

        let num_of_l0;
        let all_layer_files = {
-            let guard = self.layers.read().await;
+            let guard = self
+                .layers
+                .read(LayerManagerLockHolder::GetLayerMapInfo)
+                .await;
            num_of_l0 = guard.layer_map().unwrap().level0_deltas().len();
            guard.all_persistent_layers()
        };
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -9,7 +9,7 @@ use std::ops::{Deref, Range};
 use std::sync::Arc;
 use std::time::{Duration, Instant};

-use super::layer_manager::LayerManager;
+use super::layer_manager::{LayerManagerLockHolder, LayerManagerReadGuard};
 use super::{
    CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder,
    GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration,
@@ -62,7 +62,7 @@ use crate::tenant::storage_layer::{
 use crate::tenant::tasks::log_compaction_error;
 use crate::tenant::timeline::{
    DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
-    ResidentLayer, drop_rlock,
+    ResidentLayer, drop_layer_manager_rlock,
 };
 use crate::tenant::{DeltaLayer, MaybeOffloaded};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -314,7 +314,10 @@ impl GcCompactionQueue {
            .unwrap_or(Lsn::INVALID);

        let layers = {
-            let guard = timeline.layers.read().await;
+            let guard = timeline
+                .layers
+                .read(LayerManagerLockHolder::GetLayerMapInfo)
+                .await;
            let layer_map = guard.layer_map()?;
            layer_map.iter_historic_layers().collect_vec()
        };
@@ -408,7 +411,10 @@ impl GcCompactionQueue {
        timeline: &Arc<Timeline>,
        lsn: Lsn,
    ) -> Result<u64, CompactionError> {
-        let guard = timeline.layers.read().await;
+        let guard = timeline
+            .layers
+            .read(LayerManagerLockHolder::GetLayerMapInfo)
+            .await;
        let layer_map = guard.layer_map()?;
        let layers = layer_map.iter_historic_layers().collect_vec();
        let mut size = 0;
@@ -851,7 +857,7 @@ impl KeyHistoryRetention {
        }
        let layer_generation;
        {
-            let guard = tline.layers.read().await;
+            let guard = tline.layers.read(LayerManagerLockHolder::Compaction).await;
            if !guard.contains_key(key) {
                return false;
            }
@@ -1282,7 +1288,10 @@ impl Timeline {
            // We do the repartition on the L0-L1 boundary. All data below the boundary
            // are compacted by L0 with low read amplification, thus making the `repartition`
            // function run fast.
-            let guard = self.layers.read().await;
+            let guard = self
+                .layers
+                .read(LayerManagerLockHolder::GetLayerMapInfo)
+                .await;
            guard
                .all_persistent_layers()
                .iter()
@@ -1461,7 +1470,7 @@ impl Timeline {
        let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;

-        let layers = self.layers.read().await;
+        let layers = self.layers.read(LayerManagerLockHolder::Compaction).await;
        let layers_iter = layers.layer_map()?.iter_historic_layers();
        let (layers_total, mut layers_checked) = (layers_iter.len(), 0);
        for layer_desc in layers_iter {
@@ -1722,7 +1731,10 @@ impl Timeline {
        // are implicitly left visible, because LayerVisibilityHint's default is Visible, and we never modify it here.
        // Note that L0 deltas _can_ be covered by image layers, but we consider them 'visible' because we anticipate that
        // they will be subject to L0->L1 compaction in the near future.
-        let layer_manager = self.layers.read().await;
+        let layer_manager = self
+            .layers
+            .read(LayerManagerLockHolder::GetLayerMapInfo)
+            .await;
        let layer_map = layer_manager.layer_map()?;

        let readable_points = {
@@ -1775,7 +1787,7 @@ impl Timeline {
            };

            let begin = tokio::time::Instant::now();
-            let phase1_layers_locked = self.layers.read().await;
+            let phase1_layers_locked = self.layers.read(LayerManagerLockHolder::Compaction).await;
            let now = tokio::time::Instant::now();
            stats.read_lock_acquisition_micros =
                DurationRecorder::Recorded(RecordedDuration(now - begin), now);
@@ -1803,7 +1815,7 @@ impl Timeline {
    /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
    async fn compact_level0_phase1<'a>(
        self: &'a Arc<Self>,
-        guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
+        guard: LayerManagerReadGuard<'a>,
        mut stats: CompactLevel0Phase1StatsBuilder,
        target_file_size: u64,
        force_compaction_ignore_threshold: bool,
@@ -2029,7 +2041,7 @@ impl Timeline {
            holes
        };
        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
-        drop_rlock(guard);
+        drop_layer_manager_rlock(guard);

        if self.cancel.is_cancelled() {
            return Err(CompactionError::ShuttingDown);
@@ -2469,7 +2481,7 @@ impl Timeline {

        // Find the top of the historical layers
        let end_lsn = {
-            let guard = self.layers.read().await;
+            let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
            let layers = guard.layer_map()?;

            let l0_deltas = layers.level0_deltas();
@@ -3008,7 +3020,7 @@ impl Timeline {
        }
        split_key_ranges.sort();
        let all_layers = {
-            let guard = self.layers.read().await;
+            let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
            let layer_map = guard.layer_map()?;
            layer_map.iter_historic_layers().collect_vec()
        };
@@ -3112,12 +3124,12 @@ impl Timeline {
                .await?;
            let jobs_len = jobs.len();
            for (idx, job) in jobs.into_iter().enumerate() {
-                info!(
-                    "running enhanced gc bottom-most compaction, sub-compaction {}/{}",
-                    idx + 1,
-                    jobs_len
-                );
+                let sub_compaction_progress = format!("{}/{}", idx + 1, jobs_len);
                self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0)
+                    .instrument(info_span!(
+                        "sub_compaction",
+                        sub_compaction_progress = sub_compaction_progress
+                    ))
                    .await?;
            }
            if jobs_len == 0 {
@@ -3185,7 +3197,10 @@ impl Timeline {
        // 1. If a layer is in the selection, all layers below it are in the selection.
        // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
        let job_desc = {
-            let guard = self.layers.read().await;
+            let guard = self
+                .layers
+                .read(LayerManagerLockHolder::GarbageCollection)
+                .await;
            let layers = guard.layer_map()?;
            let gc_info = self.gc_info.read().unwrap();
            let mut retain_lsns_below_horizon = Vec::new();
@@ -3956,7 +3971,10 @@ impl Timeline {

        // First, do a sanity check to ensure the newly-created layer map does not contain overlaps.
        let all_layers = {
-            let guard = self.layers.read().await;
+            let guard = self
+                .layers
+                .read(LayerManagerLockHolder::GarbageCollection)
+                .await;
            let layer_map = guard.layer_map()?;
            layer_map.iter_historic_layers().collect_vec()
        };
@@ -4020,7 +4038,10 @@ impl Timeline {
            let update_guard = self.gc_compaction_layer_update_lock.write().await;
            // Acquiring the update guard ensures current read operations end and new read operations are blocked.
            // TODO: can we use `latest_gc_cutoff` Rcu to achieve the same effect?
-            let mut guard = self.layers.write().await;
+            let mut guard = self
+                .layers
+                .write(LayerManagerLockHolder::GarbageCollection)
+                .await;
            guard
                .open_mut()?
                .finish_gc_compaction(&layer_selection, &compact_to, &self.metrics);
@@ -4088,7 +4109,11 @@ impl TimelineAdaptor {

    pub async fn flush_updates(&mut self) -> Result<(), CompactionError> {
        let layers_to_delete = {
-            let guard = self.timeline.layers.read().await;
+            let guard = self
+                .timeline
+                .layers
+                .read(LayerManagerLockHolder::Compaction)
+                .await;
            self.layers_to_delete
                .iter()
                .map(|x| guard.get_from_desc(x))
@@ -4133,7 +4158,11 @@ impl CompactionJobExecutor for TimelineAdaptor {
    ) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
        self.flush_updates().await?;

-        let guard = self.timeline.layers.read().await;
+        let guard = self
+            .timeline
+            .layers
+            .read(LayerManagerLockHolder::Compaction)
+            .await;
        let layer_map = guard.layer_map()?;

        let result = layer_map
@@ -4172,7 +4201,11 @@ impl CompactionJobExecutor for TimelineAdaptor {
        // this is a lot more complex than a simple downcast...
        if layer.is_delta() {
            let l = {
-                let guard = self.timeline.layers.read().await;
+                let guard = self
+                    .timeline
+                    .layers
+                    .read(LayerManagerLockHolder::Compaction)
+                    .await;
                guard.get_from_desc(layer)
            };
            let result = l.download_and_keep_resident(ctx).await?;
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -19,7 +19,7 @@ use utils::id::TimelineId;
 use utils::lsn::Lsn;
 use utils::sync::gate::GateError;

-use super::layer_manager::LayerManager;
+use super::layer_manager::{LayerManager, LayerManagerLockHolder};
 use super::{FlushLayerError, Timeline};
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::TaskKind;
@@ -199,7 +199,10 @@ pub(crate) async fn generate_tombstone_image_layer(
    let image_lsn = ancestor_lsn;

    {
-        let layers = detached.layers.read().await;
+        let layers = detached
+            .layers
+            .read(LayerManagerLockHolder::DetachAncestor)
+            .await;
        for layer in layers.all_persistent_layers() {
            if !layer.is_delta
                && layer.lsn_range.start == image_lsn
@@ -423,7 +426,7 @@ pub(super) async fn prepare(
        // we do not need to start from our layers, because they can only be layers that come
        // *after* ancestor_lsn
        let layers = tokio::select! {
-            guard = ancestor.layers.read() => guard,
+            guard = ancestor.layers.read(LayerManagerLockHolder::DetachAncestor) => guard,
            _ = detached.cancel.cancelled() => {
                return Err(ShuttingDown);
            }
@@ -869,7 +872,12 @@ async fn remote_copy(

                // Double check that the file is orphan (probably from an earlier attempt), then delete it
                let key = file_name.clone().into();
-                if adoptee.layers.read().await.contains_key(&key) {
+                if adoptee
+                    .layers
+                    .read(LayerManagerLockHolder::DetachAncestor)
+                    .await
+                    .contains_key(&key)
+                {
                    // We are supposed to filter out such cases before coming to this function
                    return Err(Error::Prepare(anyhow::anyhow!(
                        "layer file {file_name} already present and inside layer map"
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -33,6 +33,7 @@ use crate::tenant::size::CalculateSyntheticSizeError;
 use crate::tenant::storage_layer::LayerVisibilityHint;
 use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random};
 use crate::tenant::timeline::EvictionError;
+use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
 use crate::tenant::{LogicalSizeCalculationCause, TenantShard};

 #[derive(Default)]
@@ -208,7 +209,7 @@ impl Timeline {

        let mut js = tokio::task::JoinSet::new();
        {
-            let guard = self.layers.read().await;
+            let guard = self.layers.read(LayerManagerLockHolder::Eviction).await;

            guard
                .likely_resident_layers()
--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -15,6 +15,7 @@ use super::{Timeline, TimelineDeleteProgress};
 use crate::context::RequestContext;
 use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient};
 use crate::tenant::metadata::TimelineMetadata;
+use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;

 mod flow;
 mod importbucket_client;
@@ -163,7 +164,10 @@ async fn prepare_import(
    info!("wipe the slate clean");
    {
        // TODO: do we need to hold GC lock for this?
-        let mut guard = timeline.layers.write().await;
+        let mut guard = timeline
+            .layers
+            .write(LayerManagerLockHolder::ImportPgData)
+            .await;
        assert!(
            guard.layer_map()?.open_layer.is_none(),
            "while importing, there should be no in-memory layer" // this just seems like a good place to assert it
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -56,6 +56,7 @@ use crate::pgdatadir_mapping::{
 };
 use crate::task_mgr::TaskKind;
 use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer};
+use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;

 pub async fn run(
    timeline: Arc<Timeline>,
@@ -984,7 +985,10 @@ impl ChunkProcessingJob {
            let (desc, path) = writer.finish(ctx).await?;

            {
-                let guard = timeline.layers.read().await;
+                let guard = timeline
+                    .layers
+                    .read(LayerManagerLockHolder::ImportPgData)
+                    .await;
                let existing_layer = guard.try_get_from_key(&desc.key());
                if let Some(layer) = existing_layer {
                    if layer.metadata().generation == timeline.generation {
@@ -1007,7 +1011,10 @@ impl ChunkProcessingJob {
        // certain that the existing layer is identical to the new one, so in that case
        // we replace the old layer with the one we just generated.

-        let mut guard = timeline.layers.write().await;
+        let mut guard = timeline
+            .layers
+            .write(LayerManagerLockHolder::ImportPgData)
+            .await;

        let existing_layer = guard
            .try_get_from_key(&resident_layer.layer_desc().key())
@@ -1036,7 +1043,7 @@ impl ChunkProcessingJob {
            }
        }

-        crate::tenant::timeline::drop_wlock(guard);
+        crate::tenant::timeline::drop_layer_manager_wlock(guard);

        timeline
            .remote_client
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,5 +1,8 @@
 use std::collections::HashMap;
+use std::mem::ManuallyDrop;
+use std::ops::{Deref, DerefMut};
 use std::sync::Arc;
+use std::time::Duration;

 use anyhow::{Context, bail, ensure};
 use itertools::Itertools;
@@ -20,6 +23,155 @@ use crate::tenant::storage_layer::{
    PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
 };

+/// Warn if the lock was held for longer than this threshold.
+/// It's very generous and we should bring this value down over time.
+const LAYER_MANAGER_LOCK_WARN_THRESHOLD: Duration = Duration::from_secs(5);
+const LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD: Duration = Duration::from_secs(30);
+
+/// Describes the operation that is holding the layer manager lock
+#[derive(Debug, Clone, Copy, strum_macros::Display)]
+#[strum(serialize_all = "kebab_case")]
+pub(crate) enum LayerManagerLockHolder {
+    GetLayerMapInfo,
+    GenerateHeatmap,
+    GetPage,
+    Init,
+    LoadLayerMap,
+    GetLayerForWrite,
+    TryFreezeLayer,
+    FlushFrozenLayer,
+    FlushLoop,
+    Compaction,
+    GarbageCollection,
+    Shutdown,
+    ImportPgData,
+    DetachAncestor,
+    Eviction,
+    #[cfg(test)]
+    Testing,
+}
+
+/// Wrapper for the layer manager that tracks the amount of time during which
+/// it was held under read or write lock
+#[derive(Default)]
+pub(crate) struct LockedLayerManager {
+    locked: tokio::sync::RwLock<LayerManager>,
+}
+
+pub(crate) struct LayerManagerReadGuard<'a> {
+    guard: ManuallyDrop<tokio::sync::RwLockReadGuard<'a, LayerManager>>,
+    acquired_at: std::time::Instant,
+    holder: LayerManagerLockHolder,
+}
+
+pub(crate) struct LayerManagerWriteGuard<'a> {
+    guard: ManuallyDrop<tokio::sync::RwLockWriteGuard<'a, LayerManager>>,
+    acquired_at: std::time::Instant,
+    holder: LayerManagerLockHolder,
+}
+
+impl Drop for LayerManagerReadGuard<'_> {
+    fn drop(&mut self) {
+        // Drop the lock first, before potentially warning if it was held for too long.
+        // SAFETY: ManuallyDrop in Drop implementation
+        unsafe { ManuallyDrop::drop(&mut self.guard) };
+
+        let held_for = self.acquired_at.elapsed();
+        if held_for >= LAYER_MANAGER_LOCK_READ_WARN_THRESHOLD {
+            tracing::warn!(
+                holder=%self.holder,
+                "Layer manager read lock held for {}s",
+                held_for.as_secs_f64(),
+            );
+        }
+    }
+}
+
+impl Drop for LayerManagerWriteGuard<'_> {
+    fn drop(&mut self) {
+        // Drop the lock first, before potentially warning if it was held for too long.
+        // SAFETY: ManuallyDrop in Drop implementation
+        unsafe { ManuallyDrop::drop(&mut self.guard) };
+
+        let held_for = self.acquired_at.elapsed();
+        if held_for >= LAYER_MANAGER_LOCK_WARN_THRESHOLD {
+            tracing::warn!(
+                holder=%self.holder,
+                "Layer manager write lock held for {}s",
+                held_for.as_secs_f64(),
+            );
+        }
+    }
+}
+
+impl Deref for LayerManagerReadGuard<'_> {
+    type Target = LayerManager;
+
+    fn deref(&self) -> &Self::Target {
+        self.guard.deref()
+    }
+}
+
+impl Deref for LayerManagerWriteGuard<'_> {
+    type Target = LayerManager;
+
+    fn deref(&self) -> &Self::Target {
+        self.guard.deref()
+    }
+}
+
+impl DerefMut for LayerManagerWriteGuard<'_> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.guard.deref_mut()
+    }
+}
+
+impl LockedLayerManager {
+    pub(crate) async fn read(&self, holder: LayerManagerLockHolder) -> LayerManagerReadGuard {
+        let guard = ManuallyDrop::new(self.locked.read().await);
+        LayerManagerReadGuard {
+            guard,
+            acquired_at: std::time::Instant::now(),
+            holder,
+        }
+    }
+
+    pub(crate) fn try_read(
+        &self,
+        holder: LayerManagerLockHolder,
+    ) -> Result<LayerManagerReadGuard, tokio::sync::TryLockError> {
+        let guard = ManuallyDrop::new(self.locked.try_read()?);
+
+        Ok(LayerManagerReadGuard {
+            guard,
+            acquired_at: std::time::Instant::now(),
+            holder,
+        })
+    }
+
+    pub(crate) async fn write(&self, holder: LayerManagerLockHolder) -> LayerManagerWriteGuard {
+        let guard = ManuallyDrop::new(self.locked.write().await);
+        LayerManagerWriteGuard {
+            guard,
+            acquired_at: std::time::Instant::now(),
+            holder,
+        }
+    }
+
+    pub(crate) fn try_write(
+        &self,
+        holder: LayerManagerLockHolder,
+    ) -> Result<LayerManagerWriteGuard, tokio::sync::TryLockError> {
+        let guard = ManuallyDrop::new(self.locked.try_write()?);
+
+        Ok(LayerManagerWriteGuard {
+            guard,
+            acquired_at: std::time::Instant::now(),
+            holder,
+        })
+    }
+}
+
 /// Provides semantic APIs to manipulate the layer map.
 pub(crate) enum LayerManager {
    /// Open as in not shutdown layer manager; we still have in-memory layers and we can manipulate
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -21,7 +21,7 @@ OBJS = \
 	unstable_extensions.o \
 	walproposer.o \
 	walproposer_pg.o \
-	control_plane_connector.o \
+	neon_ddl_handler.o \
 	walsender_hooks.o

 PG_CPPFLAGS = -I$(libpq_srcdir)
--- a/pgxn/neon/control_plane_connector.h
+++ b/pgxn/neon/control_plane_connector.h
@@ -1,6 +0,0 @@
-#ifndef CONTROL_PLANE_CONNECTOR_H
-#define CONTROL_PLANE_CONNECTOR_H
-
-void		InitControlPlaneConnector(void);
-
-#endif
--- a/pgxn/neon/neon--1.6--1.5.sql
+++ b/pgxn/neon/neon--1.6--1.5.sql
@@ -2,6 +2,6 @@ DROP FUNCTION IF EXISTS get_prewarm_info(out total_pages integer, out prewarmed_

 DROP FUNCTION IF EXISTS get_local_cache_state(max_chunks integer);

-DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer default 1);
+DROP FUNCTION IF EXISTS prewarm_local_cache(state bytea, n_workers integer);


--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -33,9 +33,9 @@
 #include "extension_server.h"
 #include "file_cache.h"
 #include "neon.h"
+#include "neon_ddl_handler.h"
 #include "neon_lwlsncache.h"
 #include "neon_perf_counters.h"
-#include "control_plane_connector.h"
 #include "logical_replication_monitor.h"
 #include "unstable_extensions.h"
 #include "walsender_hooks.h"
@@ -454,7 +454,7 @@ _PG_init(void)

 	InitUnstableExtensionsSupport();
 	InitLogicalReplicationMonitor();
-	InitControlPlaneConnector();
+	InitDDLHandler();

 	pg_init_extension_server();

--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -1,6 +1,6 @@
 # neon extension
 comment = 'cloud storage for PostgreSQL'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/neon'
 relocatable = true
 trusted = true
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -1,6 +1,6 @@
 /*-------------------------------------------------------------------------
 *
- * control_plane_connector.c
+ * neon_ddl_handler.c
 *	  Captures updates to roles/databases using ProcessUtility_hook and
 *        sends them to the control ProcessUtility_hook. The changes are sent
 *        via HTTP to the URL specified by the GUC neon.console_url when the
@@ -13,18 +13,30 @@
 *        accumulate changes. On subtransaction commit, the top of the stack
 *        is merged with the table below it.
 *
+ *    Support event triggers for neon_superuser
+ *
+ * IDENTIFICATION
+ *	 contrib/neon/neon_dll_handler.c
+ *
 *-------------------------------------------------------------------------
 */

 #include "postgres.h"

 #include <curl/curl.h>
+#include <unistd.h>

 #include "access/xact.h"
+#include "catalog/pg_authid.h"
+#include "catalog/pg_proc.h"
 #include "commands/defrem.h"
+#include "commands/event_trigger.h"
+#include "commands/user.h"
 #include "fmgr.h"
 #include "libpq/crypt.h"
 #include "miscadmin.h"
+#include "nodes/makefuncs.h"
+#include "parser/parse_func.h"
 #include "tcop/pquery.h"
 #include "tcop/utility.h"
 #include "utils/acl.h"
@@ -32,11 +44,16 @@
 #include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "utils/jsonb.h"
+#include <utils/lsyscache.h>
+#include <utils/syscache.h>

-#include "control_plane_connector.h"
+#include "neon_ddl_handler.h"
 #include "neon_utils.h"

 static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
+static fmgr_hook_type next_fmgr_hook = NULL;
+static needs_fmgr_hook_type next_needs_fmgr_hook = NULL;
+static bool neon_event_triggers = true;

 static const char *jwt_token = NULL;

@@ -773,6 +790,7 @@ HandleDropRole(DropRoleStmt *stmt)
 	}
 }

+
 static void
 HandleRename(RenameStmt *stmt)
 {
@@ -782,6 +800,460 @@ HandleRename(RenameStmt *stmt)
 		return HandleRoleRename(stmt);
 }

+
+/*
+ * Support for Event Triggers.
+ *
+ * In vanilla only superuser can create Event Triggers.
+ *
+ * We allow it for neon_superuser by temporary switching to superuser. But as
+ * far as event trigger can fire in superuser context we should protect
+ * superuser from execution of arbitrary user's code.
+ *
+ * The idea was taken from Supabase PR series starting at
+ *   https://github.com/supabase/supautils/pull/98
+ */
+
+static bool
+neon_needs_fmgr_hook(Oid functionId) {
+
+	return (next_needs_fmgr_hook && (*next_needs_fmgr_hook) (functionId))
+		|| get_func_rettype(functionId) == EVENT_TRIGGEROID;
+}
+
+static void
+LookupFuncOwnerSecDef(Oid functionId, Oid *funcOwner, bool *is_secdef)
+{
+	Form_pg_proc procForm;
+	HeapTuple proc_tup = SearchSysCache1(PROCOID, ObjectIdGetDatum(functionId));
+
+	if (!HeapTupleIsValid(proc_tup))
+		ereport(ERROR,
+				(errmsg("cache lookup failed for function %u", functionId)));
+
+	procForm = (Form_pg_proc) GETSTRUCT(proc_tup);
+
+	*funcOwner = procForm->proowner;
+	*is_secdef = procForm->prosecdef;
+
+	ReleaseSysCache(proc_tup);
+}
+
+
+PG_FUNCTION_INFO_V1(noop);
+Datum noop(__attribute__ ((unused)) PG_FUNCTION_ARGS) { PG_RETURN_VOID();}
+
+static void
+force_noop(FmgrInfo *finfo)
+{
+    finfo->fn_addr   = (PGFunction) noop;
+    finfo->fn_oid    = InvalidOid;           /* not a known function OID anymore */
+    finfo->fn_nargs  = 0;                    /* no arguments for noop */
+    finfo->fn_strict = false;
+    finfo->fn_retset = false;
+    finfo->fn_stats  = 0;                    /* no stats collection */
+    finfo->fn_extra  = NULL;                 /* clear out old context data */
+    finfo->fn_mcxt   = CurrentMemoryContext;
+    finfo->fn_expr   = NULL;                 /* no parse tree */
+}
+
+
+/*
+ * Skip executing Event Triggers execution for superusers, because Event
+ * Triggers are SECURITY DEFINER and user provided code could then attempt
+ * privilege escalation.
+ *
+ * Also skip executing Event Triggers when GUC neon.event_triggers has been
+ * set to false. This might be necessary to be able to connect again after a
+ * LOGIN Event Trigger has been installed that would prevent connections as
+ * neon_superuser.
+ */
+static void
+neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
+{
+	/*
+	 * It can be other needs_fmgr_hook which cause our hook to be invoked for
+	 * non-trigger function, so recheck that is is trigger function.
+	 */
+	if (flinfo->fn_oid != InvalidOid &&
+		get_func_rettype(flinfo->fn_oid) != EVENT_TRIGGEROID)
+	{
+		if (next_fmgr_hook)
+			(*next_fmgr_hook) (event, flinfo, private);
+
+		return;
+	}
+
+	/*
+	 * The neon_superuser role can use the GUC neon.event_triggers to disable
+	 * firing Event Trigger.
+	 *
+	 *   SET neon.event_triggers TO false;
+	 *
+	 * This only applies to the neon_superuser role though, and only allows
+	 * skipping Event Triggers owned by neon_superuser, which we check by
+	 * proxy of the Event Trigger function being owned by neon_superuser.
+	 *
+	 * A role that is created in role neon_superuser should be allowed to also
+	 * benefit from the neon_event_triggers GUC, and will be considered the
+	 * same as the neon_superuser role.
+	 */
+	if (event == FHET_START
+		&& !neon_event_triggers
+		&& is_neon_superuser())
+	{
+		Oid neon_superuser_oid = get_role_oid("neon_superuser", false);
+
+		/* Find the Function Attributes (owner Oid, security definer) */
+		const char *fun_owner_name = NULL;
+		Oid fun_owner = InvalidOid;
+		bool fun_is_secdef = false;
+
+		LookupFuncOwnerSecDef(flinfo->fn_oid, &fun_owner, &fun_is_secdef);
+		fun_owner_name = GetUserNameFromId(fun_owner, false);
+
+		if (RoleIsNeonSuperuser(fun_owner_name)
+			|| has_privs_of_role(fun_owner, neon_superuser_oid))
+		{
+			elog(WARNING,
+				 "Skipping Event Trigger: neon.event_triggers is false");
+
+			/*
+			 * we can't skip execution directly inside the fmgr_hook so instead we
+			 * change the event trigger function to a noop function.
+			 */
+			force_noop(flinfo);
+		}
+	}
+
+	/*
+	 * Fire Event Trigger if both function owner and current user are
+	 * superuser, or none of them are.
+	 */
+    else if (event == FHET_START
+		/* still enable it to pass pg_regress tests */
+		&& !RegressTestMode)
+	{
+		/*
+		 * Get the current user oid as of before SECURITY DEFINER change of
+		 * CurrentUserId, and that would be SessionUserId.
+		 */
+		Oid current_role_oid = GetSessionUserId();
+		bool role_is_super = superuser_arg(current_role_oid);
+
+		/* Find the Function Attributes (owner Oid, security definer) */
+		Oid function_owner = InvalidOid;
+		bool function_is_secdef = false;
+		bool function_is_owned_by_super = false;
+
+		LookupFuncOwnerSecDef(flinfo->fn_oid, &function_owner, &function_is_secdef);
+
+		function_is_owned_by_super = superuser_arg(function_owner);
+
+		/*
+		 * 1. Refuse to run SECURITY DEFINER function that belongs to a
+		 * superuser when the current user is not a superuser itself.
+		 */
+		if (!role_is_super
+			&& function_is_owned_by_super
+			&& function_is_secdef)
+		{
+			char *func_name = get_func_name(flinfo->fn_oid);
+
+			ereport(WARNING,
+					(errmsg("Skipping Event Trigger"),
+					 errdetail("Event Trigger function \"%s\" is owned by \"%s\" "
+							   "and is SECURITY DEFINER",
+							   func_name,
+							   GetUserNameFromId(function_owner, false))));
+
+			/*
+			 * we can't skip execution directly inside the fmgr_hook so
+			 * instead we change the event trigger function to a noop
+			 * function.
+			 */
+			force_noop(flinfo);
+		}
+
+		/*
+		 * 2. Refuse to run functions that belongs to a non-superuser when the
+		 * current user is a superuser.
+		 *
+		 * We could run a SECURITY DEFINER user-function here and be safe with
+		 * privilege escalation risks, but superuser roles are only used for
+		 * infrastructure maintenance operations, where we prefer to skip
+		 * running user-defined code.
+		 */
+		else if (role_is_super && !function_is_owned_by_super)
+		{
+			char *func_name = get_func_name(flinfo->fn_oid);
+
+			ereport(WARNING,
+					(errmsg("Skipping Event Trigger"),
+					 errdetail("Event Trigger function \"%s\" "
+							   "is owned by non-superuser role \"%s\", "
+							   "and current_user \"%s\" is superuser",
+							   func_name,
+							   GetUserNameFromId(function_owner, false),
+							   GetUserNameFromId(current_role_oid, false))));
+
+			/*
+			 * we can't skip execution directly inside the fmgr_hook so
+			 * instead we change the event trigger function to a noop
+			 * function.
+			 */
+			force_noop(flinfo);
+		}
+
+	}
+
+	if (next_fmgr_hook)
+		(*next_fmgr_hook) (event, flinfo, private);
+}
+
+static Oid prev_role_oid = 0;
+static int prev_role_sec_context = 0;
+static bool switched_to_superuser = false;
+
+/*
+ * Switch tp superuser if not yet superuser.
+ * Returns false if already switched to superuser.
+ */
+static bool
+switch_to_superuser(void)
+{
+    Oid superuser_oid;
+
+	if (switched_to_superuser)
+		return false;
+	switched_to_superuser = true;
+
+	superuser_oid = get_role_oid("cloud_admin", true /*missing_ok*/);
+	if (superuser_oid == InvalidOid)
+		superuser_oid = BOOTSTRAP_SUPERUSERID;
+
+    GetUserIdAndSecContext(&prev_role_oid, &prev_role_sec_context);
+    SetUserIdAndSecContext(superuser_oid, prev_role_sec_context |
+                                              SECURITY_LOCAL_USERID_CHANGE |
+                                              SECURITY_RESTRICTED_OPERATION);
+	return true;
+}
+
+static void
+switch_to_original_role(void)
+{
+    SetUserIdAndSecContext(prev_role_oid, prev_role_sec_context);
+    switched_to_superuser = false;
+}
+
+/*
+ * ALTER ROLE ... SUPERUSER;
+ *
+ * Used internally to give superuser to a non-privileged role to allow
+ * ownership of superuser-only objects such as Event Trigger.
+ *
+ *   ALTER ROLE foo SUPERUSER;
+ *   ALTER EVENT TRIGGER ... OWNED BY foo;
+ *   ALTER ROLE foo NOSUPERUSER;
+ *
+ * Now the EVENT TRIGGER is owned by foo, who can DROP it without having to be
+ * superuser again.
+ */
+static void
+alter_role_super(const char* rolename, bool make_super)
+{
+	AlterRoleStmt *alter_stmt = makeNode(AlterRoleStmt);
+
+	DefElem *defel_superuser =
+#if PG_MAJORVERSION_NUM <= 14
+		makeDefElem("superuser", (Node *) makeInteger(make_super), -1);
+#else
+		makeDefElem("superuser", (Node *) makeBoolean(make_super), -1);
+#endif
+
+	RoleSpec *rolespec   = makeNode(RoleSpec);
+	rolespec->roletype   = ROLESPEC_CSTRING;
+	rolespec->rolename   = pstrdup(rolename);
+	rolespec->location   = -1;
+
+	alter_stmt->role = rolespec;
+	alter_stmt->options = list_make1(defel_superuser);
+
+#if PG_MAJORVERSION_NUM < 15
+	AlterRole(alter_stmt);
+#else
+	/* ParseState *pstate, AlterRoleStmt *stmt */
+	AlterRole(NULL, alter_stmt);
+#endif
+
+	CommandCounterIncrement();
+}
+
+
+/*
+ * Changes the OWNER of an Event Trigger.
+ *
+ * Event Triggers can only be owned by superusers, so this ALTER ROLE with
+ * SUPERUSER and then removes the property.
+ */
+static void
+alter_event_trigger_owner(const char *obj_name, Oid role_oid)
+{
+	char* role_name = GetUserNameFromId(role_oid, false);
+
+	alter_role_super(role_name, true);
+
+	AlterEventTriggerOwner(obj_name, role_oid);
+	CommandCounterIncrement();
+
+	alter_role_super(role_name, false);
+}
+
+
+/*
+ * Neon processing of the CREATE EVENT TRIGGER requires special attention and
+ * is worth having its own ProcessUtility_hook for that.
+ */
+static void
+ProcessCreateEventTrigger(
+				   PlannedStmt *pstmt,
+				   const char *queryString,
+				   bool readOnlyTree,
+				   ProcessUtilityContext context,
+				   ParamListInfo params,
+				   QueryEnvironment *queryEnv,
+				   DestReceiver *dest,
+				   QueryCompletion *qc)
+{
+	Node	   *parseTree = pstmt->utilityStmt;
+	bool		sudo = false;
+
+	/* We double-check that after local variable declaration block */
+	CreateEventTrigStmt *stmt = (CreateEventTrigStmt *) parseTree;
+
+	/*
+	 * We are going to change the current user privileges (sudo) and might
+	 * need after execution cleanup. For that we want to capture the UserId
+	 * before changing it for our sudo implementation.
+	 */
+	const Oid current_user_id = GetUserId();
+	bool current_user_is_super = superuser_arg(current_user_id);
+
+	if (nodeTag(parseTree) != T_CreateEventTrigStmt)
+	{
+		ereport(ERROR,
+				errcode(ERRCODE_INTERNAL_ERROR),
+				errmsg("ProcessCreateEventTrigger called for the wrong command"));
+	}
+
+	/*
+	 * Allow neon_superuser to create Event Trigger, while keeping the
+	 * ownership of the object.
+	 *
+	 * For that we give superuser membership to the role for the execution of
+	 * the command.
+	 */
+	if (IsTransactionState() && is_neon_superuser())
+	{
+		/* Find the Event Trigger function Oid */
+		Oid func_oid = LookupFuncName(stmt->funcname, 0, NULL, false);
+
+		/* Find the Function Owner Oid */
+		Oid func_owner = InvalidOid;
+		bool is_secdef = false;
+		bool function_is_owned_by_super = false;
+
+		LookupFuncOwnerSecDef(func_oid, &func_owner, &is_secdef);
+
+		function_is_owned_by_super = superuser_arg(func_owner);
+
+		if(!current_user_is_super && function_is_owned_by_super)
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+					 errmsg("Permission denied to execute "
+							"a function owned by a superuser role"),
+					 errdetail("current user \"%s\" is not a superuser "
+							   "and Event Trigger function \"%s\" "
+							   "is owned by a superuser",
+							   GetUserNameFromId(current_user_id, false),
+							   NameListToString(stmt->funcname))));
+		}
+
+		if(current_user_is_super && !function_is_owned_by_super)
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+					 errmsg("Permission denied to execute "
+							"a function owned by a non-superuser role"),
+					 errdetail("current user \"%s\" is a superuser "
+							   "and function \"%s\" is "
+							   "owned by a non-superuser",
+							   GetUserNameFromId(current_user_id, false),
+							   NameListToString(stmt->funcname))));
+		}
+
+		sudo = switch_to_superuser();
+	}
+
+	PG_TRY();
+	{
+		if (PreviousProcessUtilityHook)
+		{
+			PreviousProcessUtilityHook(
+				pstmt,
+				queryString,
+				readOnlyTree,
+				context,
+				params,
+				queryEnv,
+				dest,
+				qc);
+		}
+		else
+		{
+			standard_ProcessUtility(
+				pstmt,
+				queryString,
+				readOnlyTree,
+				context,
+				params,
+				queryEnv,
+				dest,
+				qc);
+		}
+
+		/*
+		 * Now that the Event Trigger has been installed via our sudo
+		 * mechanism, if the original role was not a superuser then change
+		 * the event trigger ownership back to the original role.
+		 *
+		 * That way [ ALTER | DROP ] EVENT TRIGGER commands just work.
+		 */
+		if (IsTransactionState() && is_neon_superuser())
+		{
+			if (!current_user_is_super)
+			{
+				/*
+				 * Change event trigger owner to the current role (making
+				 * it a privileged role during the ALTER OWNER command).
+				 */
+				alter_event_trigger_owner(stmt->trigname, current_user_id);
+			}
+		}
+	}
+	PG_FINALLY();
+	{
+		if (sudo)
+			switch_to_original_role();
+	}
+	PG_END_TRY();
+}
+
+
+/*
+ * Neon hooks for DDLs (handling privileges, limiting features, etc).
+ */
 static void
 NeonProcessUtility(
 				   PlannedStmt *pstmt,
@@ -795,6 +1267,27 @@ NeonProcessUtility(
 {
 	Node	   *parseTree = pstmt->utilityStmt;

+	/*
+	 * The process utility hook for CREATE EVENT TRIGGER is its own
+	 * implementation and warrant being addressed separately from here.
+	 */
+	if (nodeTag(parseTree) == T_CreateEventTrigStmt)
+	{
+		ProcessCreateEventTrigger(
+				pstmt,
+				queryString,
+				readOnlyTree,
+				context,
+				params,
+				queryEnv,
+				dest,
+				qc);
+		return;
+	}
+
+	/*
+	 * Other commands that need Neon specific implementations are handled here:
+	 */
 	switch (nodeTag(parseTree))
 	{
 		case T_CreatedbStmt:
@@ -833,37 +1326,82 @@ NeonProcessUtility(
 	if (PreviousProcessUtilityHook)
 	{
 		PreviousProcessUtilityHook(
-								   pstmt,
-								   queryString,
-								   readOnlyTree,
-								   context,
-								   params,
-								   queryEnv,
-								   dest,
-								   qc);
+			pstmt,
+			queryString,
+			readOnlyTree,
+			context,
+			params,
+			queryEnv,
+			dest,
+			qc);
 	}
 	else
 	{
 		standard_ProcessUtility(
-								pstmt,
-								queryString,
-								readOnlyTree,
-								context,
-								params,
-								queryEnv,
-								dest,
-								qc);
+			pstmt,
+			queryString,
+			readOnlyTree,
+			context,
+			params,
+			queryEnv,
+			dest,
+			qc);
 	}
 }

+/*
+ * Only neon_superuser is granted privilege to edit neon.event_triggers GUC.
+ */
+static void
+neon_event_triggers_assign_hook(bool newval, void *extra)
+{
+	/* MyDatabaseId == InvalidOid || !OidIsValid(GetUserId())	 */
+
+	if (IsTransactionState() && !is_neon_superuser())
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("permission denied to set neon.event_triggers"),
+				 errdetail("Only \"neon_superuser\" is allowed to set the GUC")));
+	}
+}
+
+
 void
-InitControlPlaneConnector()
+InitDDLHandler()
 {
 	PreviousProcessUtilityHook = ProcessUtility_hook;
 	ProcessUtility_hook = NeonProcessUtility;
+
+    next_needs_fmgr_hook = needs_fmgr_hook;
+	needs_fmgr_hook = neon_needs_fmgr_hook;
+
+	next_fmgr_hook = fmgr_hook;
+	fmgr_hook = neon_fmgr_hook;
+
 	RegisterXactCallback(NeonXactCallback, NULL);
 	RegisterSubXactCallback(NeonSubXactCallback, NULL);

+	/*
+	 * The GUC neon.event_triggers should provide the same effect as the
+	 * Postgres GUC event_triggers, but the neon one is PGC_USERSET.
+	 *
+	 * This allows using the GUC in the connection string and work out of a
+	 * LOGIN Event Trigger that would break database access, all without
+	 * having to edit and reload the Postgres configuration file.
+	 */
+	DefineCustomBoolVariable(
+							 "neon.event_triggers",
+							 "Enable firing of event triggers",
+							 NULL,
+							 &neon_event_triggers,
+							 true,
+							 PGC_USERSET,
+							 0,
+							 NULL,
+							 neon_event_triggers_assign_hook,
+							 NULL);
+
 	DefineCustomStringVariable(
 							   "neon.console_url",
 							   "URL of the Neon Console, which will be forwarded changes to dbs and roles",
--- a/pgxn/neon/neon_ddl_handler.h
+++ b/pgxn/neon/neon_ddl_handler.h
@@ -0,0 +1,6 @@
+#ifndef CONTROL_DDL_HANDLER_H
+#define CONTROL_DDL_HANDLER_H
+
+void		InitDDLHandler(void);
+
+#endif
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -679,8 +679,7 @@ typedef struct walproposer_api
 	 * Finish sync safekeepers with the given LSN. This function should not
 	 * return and should exit the program.
 	 */
-	void		(*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn);
-
+	void		(*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn) __attribute__((noreturn)) ;
 	/*
 	 * Called after every AppendResponse from the safekeeper. Used to
 	 * propagate backpressure feedback and to confirm WAL persistence (has
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1890,7 +1890,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
 	return rc;
 }

-static void
+static void __attribute__((noreturn))
 walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
 {
 	fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(lsn));
--- a/poetry.lock
+++ b/poetry.lock
@@ -3051,19 +3051,19 @@ files = [

 [[package]]
 name = "requests"
-version = "2.32.3"
+version = "2.32.4"
 description = "Python HTTP for Humans."
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"},
-    {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"},
+    {file = "requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c"},
+    {file = "requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422"},
 ]

 [package.dependencies]
 certifi = ">=2017.4.17"
-charset-normalizer = ">=2,<4"
+charset_normalizer = ">=2,<4"
 idna = ">=2.5,<4"
 urllib3 = ">=1.21.1,<3"

@@ -3846,4 +3846,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.11"
-content-hash = "7ab1e7b975af34b3271b7c6018fa22a261d3f73c7c0a0403b6b2bb86b5fbd36e"
+content-hash = "bd93313f110110aa53b24a3ed47ba2d7f60e2c658a79cdff7320fed1bb1b57b5"
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -138,3 +138,29 @@ Now from client you can start a new session:
 ```sh
 PGSSLROOTCERT=./server.crt psql  "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full"
 ```
+
+## auth broker hacky setup:
+
+```sh
+docker run \
+  --detach \
+  --name proxy-postgres \
+  --env POSTGRES_HOST_AUTH_METHOD=trust \
+  --env POSTGRES_USER=authenticated \
+  --env POSTGRES_DB=database \
+  --publish 5432:5432 \
+  postgres:17-bookworm
+```
+
+```sh
+cargo run --bin proxy -- --is-auth-broker true -c server.crt -k server.key --wss 0.0.0.0:8080 --http 0.0.0.0:7002 --auth-backend cplane-v1
+```
+
+```sh
+cargo run --bin local_proxy -- --http 0.0.0.0:7432
+```
+
+```sh
+export NEON_JWT="..."
+curl -k "https://127.0.0.1:8080/sql" -H "Authorization: Bearer $NEON_JWT" -H "neon-connection-string: postgresql://authenticated@foo.local.neon.build/database" -d '{"query":"select 1","params":[]}'
+```
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -4,6 +4,8 @@ use std::sync::Arc;
 use std::time::{Duration, SystemTime};

 use arc_swap::ArcSwapOption;
+use base64::Engine as _;
+use base64::prelude::BASE64_URL_SAFE_NO_PAD;
 use clashmap::ClashMap;
 use jose_jwk::crypto::KeyInfo;
 use reqwest::{Client, redirect};
@@ -347,17 +349,17 @@ impl JwkCacheEntryLock {
            .split_once('.')
            .ok_or(JwtEncodingError::InvalidCompactForm)?;

-        let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?;
+        let header = BASE64_URL_SAFE_NO_PAD.decode(header)?;
        let header = serde_json::from_slice::<JwtHeader<'_>>(&header)?;

-        let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?;
+        let payloadb = BASE64_URL_SAFE_NO_PAD.decode(payload)?;
        let payload = serde_json::from_slice::<JwtPayload<'_>>(&payloadb)?;

        if let Some(iss) = &payload.issuer {
            ctx.set_jwt_issuer(iss.as_ref().to_owned());
        }

-        let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?;
+        let sig = BASE64_URL_SAFE_NO_PAD.decode(signature)?;

        let kid = header.key_id.ok_or(JwtError::MissingKeyId)?;

@@ -796,7 +798,6 @@ mod tests {
    use std::net::SocketAddr;
    use std::time::SystemTime;

-    use base64::URL_SAFE_NO_PAD;
    use bytes::Bytes;
    use http::Response;
    use http_body_util::Full;
@@ -871,9 +872,8 @@ mod tests {
            key_id: Some(Cow::Owned(kid)),
        };

-        let header =
-            base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD);
-        let body = base64::encode_config(serde_json::to_string(&body).unwrap(), URL_SAFE_NO_PAD);
+        let header = BASE64_URL_SAFE_NO_PAD.encode(serde_json::to_string(&header).unwrap());
+        let body = BASE64_URL_SAFE_NO_PAD.encode(serde_json::to_string(&body).unwrap());

        format!("{header}.{body}")
    }
@@ -883,7 +883,7 @@ mod tests {

        let payload = build_jwt_payload(kid, jose_jwa::Signing::Es256);
        let sig: Signature = SigningKey::from(key).sign(payload.as_bytes());
-        let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
+        let sig = BASE64_URL_SAFE_NO_PAD.encode(sig.to_bytes());

        format!("{payload}.{sig}")
    }
@@ -893,7 +893,7 @@ mod tests {

        let payload = build_custom_jwt_payload(kid, body, jose_jwa::Signing::Es256);
        let sig: Signature = SigningKey::from(key).sign(payload.as_bytes());
-        let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
+        let sig = BASE64_URL_SAFE_NO_PAD.encode(sig.to_bytes());

        format!("{payload}.{sig}")
    }
@@ -904,7 +904,7 @@ mod tests {

        let payload = build_jwt_payload(kid, jose_jwa::Signing::Rs256);
        let sig = SigningKey::<sha2::Sha256>::new(key).sign(payload.as_bytes());
-        let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD);
+        let sig = BASE64_URL_SAFE_NO_PAD.encode(sig.to_bytes());

        format!("{payload}.{sig}")
    }
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -11,11 +11,13 @@ use anyhow::Context;
 use anyhow::{bail, ensure};
 use arc_swap::ArcSwapOption;
 use futures::future::Either;
+use itertools::{Itertools, Position};
+use rand::{Rng, thread_rng};
 use remote_storage::RemoteStorageConfig;
 use tokio::net::TcpListener;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
-use tracing::{Instrument, info, warn};
+use tracing::{Instrument, error, info, warn};
 use utils::sentry_init::init_sentry;
 use utils::{project_build_tag, project_git_version};

@@ -314,7 +316,7 @@ pub async fn run() -> anyhow::Result<()> {
    let jemalloc = match crate::jemalloc::MetricRecorder::new() {
        Ok(t) => Some(t),
        Err(e) => {
-            tracing::error!(error = ?e, "could not start jemalloc metrics loop");
+            error!(error = ?e, "could not start jemalloc metrics loop");
            None
        }
    };
@@ -520,23 +522,44 @@ pub async fn run() -> anyhow::Result<()> {
                }
            }

+            // Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
+            // This prevents immediate exit and pod restart,
+            // which can cause hammering of the redis in case of connection issues.
            if let Some(mut redis_kv_client) = redis_kv_client {
-                maintenance_tasks.spawn(async move {
-                    redis_kv_client.try_connect().await?;
-                    handle_cancel_messages(
-                        &mut redis_kv_client,
-                        rx_cancel,
-                        args.cancellation_batch_size,
-                    )
-                    .await?;
+                for attempt in (0..3).with_position() {
+                    match redis_kv_client.try_connect().await {
+                        Ok(()) => {
+                            info!("Connected to Redis KV client");
+                            maintenance_tasks.spawn(async move {
+                                handle_cancel_messages(
+                                    &mut redis_kv_client,
+                                    rx_cancel,
+                                    args.cancellation_batch_size,
+                                )
+                                .await?;

-                    drop(redis_kv_client);
+                                drop(redis_kv_client);

-                    // `handle_cancel_messages` was terminated due to the tx_cancel
-                    // being dropped. this is not worthy of an error, and this task can only return `Err`,
-                    // so let's wait forever instead.
-                    std::future::pending().await
-                });
+                                // `handle_cancel_messages` was terminated due to the tx_cancel
+                                // being dropped. this is not worthy of an error, and this task can only return `Err`,
+                                // so let's wait forever instead.
+                                std::future::pending().await
+                            });
+                            break;
+                        }
+                        Err(e) => {
+                            error!("Failed to connect to Redis KV client: {e}");
+                            if matches!(attempt, Position::Last(_)) {
+                                bail!(
+                                    "Failed to connect to Redis KV client after {} attempts",
+                                    attempt.into_inner()
+                                );
+                            }
+                            let jitter = thread_rng().gen_range(0..100);
+                            tokio::time::sleep(Duration::from_millis(1000 + jitter)).await;
+                        }
+                    }
+                }
            }

            if let Some(regional_redis_client) = regional_redis_client {
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -1,6 +1,6 @@
 //! Production console backend.

-use std::net::IpAddr;
+use std::net::{IpAddr, Ipv4Addr};
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
@@ -17,20 +17,21 @@ use tracing::{Instrument, debug, info, info_span, warn};
 use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute};
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::backend::jwt::AuthRule;
+use crate::compute::ConnectInfo;
 use crate::context::RequestContext;
 use crate::control_plane::caches::ApiCaches;
 use crate::control_plane::errors::{
    ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
 };
 use crate::control_plane::locks::ApiLocks;
-use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
+use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo, Reason};
 use crate::control_plane::{
    AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, EndpointAccessControl, NodeInfo,
    RoleAccessControl,
 };
 use crate::metrics::Metrics;
 use crate::rate_limiter::WakeComputeRateLimiter;
-use crate::types::{EndpointCacheKey, EndpointId, RoleName};
+use crate::types::{BranchId, EndpointCacheKey, EndpointId, ProjectId, RoleName};
 use crate::{compute, http, scram};

 pub(crate) const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
@@ -388,6 +389,17 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
        ctx: &RequestContext,
        endpoint: &EndpointId,
    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
+        if true {
+            return Ok(vec![AuthRule {
+                id: "1".into(),
+                jwks_url: "https://adapted-gorilla-88.clerk.accounts.dev/.well-known/jwks.json"
+                    .parse()
+                    .expect("url is valid"),
+                audience: None,
+                role_names: vec![(&RoleName::from("authenticated")).into()],
+            }]);
+        }
+
        self.do_get_endpoint_jwks(ctx, endpoint).await
    }

@@ -397,6 +409,24 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
        ctx: &RequestContext,
        user_info: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
+        if true {
+            return Ok(CachedNodeInfo::new_uncached(NodeInfo {
+                conn_info: ConnectInfo {
+                    host_addr: Some(IpAddr::V4(Ipv4Addr::LOCALHOST)),
+                    host: "localhost".into(),
+                    port: 7432,
+                    ssl_mode: SslMode::Disable,
+                },
+                aux: MetricsAuxInfo {
+                    endpoint_id: EndpointId::from("foo").into(),
+                    project_id: ProjectId::from("foo").into(),
+                    branch_id: BranchId::from("foo").into(),
+                    compute_id: "foo".into(),
+                    cold_start_info: ColdStartInfo::Warm,
+                },
+            }));
+        }
+
        let key = user_info.endpoint_cache_key();

        macro_rules! check_cache {
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`pub const PGBOUNCER_PIDFILE: &str = "/tmp/pgbouncer.pid";`