Restart already running rsyslog to update configuration

spawn rsyslog from sysvInit
Fix start rsyslog code to run for read endpoints too
2026-07-08 14:40:37 +00:00 · 2025-03-05 16:55:57 +00:00 · 2025-03-05 16:50:31 +00:00 · 2025-03-05 15:13:37 +00:00 · 2025-03-05 12:56:32 +00:00 · 2025-03-05 12:27:39 +00:00
132 changed files with 4864 additions and 2180 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -32,3 +32,4 @@ config-variables:
  - NEON_DEV_AWS_ACCOUNT_ID
  - NEON_PROD_AWS_ACCOUNT_ID
  - AWS_ECR_REGION
+  - BENCHMARK_LARGE_OLTP_PROJECTID
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -84,7 +84,13 @@ runs:
          --header "Authorization: Bearer ${API_KEY}"
          )

-        role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
+        role_name=$(echo "$roles" | jq --raw-output '
+          (.roles | map(select(.protected == false))) as $roles |
+          if any($roles[]; .name == "neondb_owner")
+          then "neondb_owner"
+          else $roles[0].name
+          end
+        ')
        echo "role_name=${role_name}" >> $GITHUB_OUTPUT
      env:
        API_HOST: ${{ inputs.api_host }}
@@ -107,13 +113,13 @@ runs:
            )

          if [ -z "${reset_password}" ]; then
-            sleep 1
+            sleep $i
            continue
          fi

          password=$(echo $reset_password | jq --raw-output '.role.password')
          if [ "${password}" == "null" ]; then
-            sleep 1
+            sleep $i # increasing backoff
            continue
          fi

--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -44,6 +44,11 @@ inputs:
    description: 'Postgres version to use for tests'
    required: false
    default: 'v16'
+  sanitizers:
+    description: 'enabled or disabled'
+    required: false
+    default: 'disabled'
+    type: string
  benchmark_durations:
    description: 'benchmark durations JSON'
    required: false
@@ -59,7 +64,7 @@ runs:
      if: inputs.build_type != 'remote'
      uses: ./.github/actions/download
      with:
-        name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
+        name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
        path: /tmp/neon
        aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}

@@ -112,6 +117,7 @@ runs:
        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
        RERUN_FAILED: ${{ inputs.rerun_failed }}
        PG_VERSION: ${{ inputs.pg_version }}
+        SANITIZERS: ${{ inputs.sanitizers }}
      shell: bash -euxo pipefail {0}
      run: |
        # PLATFORM will be embedded in the perf test report
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -280,7 +280,7 @@ jobs:
      - name: Upload Neon artifact
        uses: ./.github/actions/upload
        with:
-          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
          path: /tmp/neon
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

@@ -347,6 +347,7 @@ jobs:
          real_s3_region: eu-central-1
          rerun_failed: true
          pg_version: ${{ matrix.pg_version }}
+          sanitizers: ${{ inputs.sanitizers }}
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
          # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
          # Attempt to stop tests gracefully to generate test reports
@@ -359,7 +360,6 @@ jobs:
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
          USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
-          SANITIZERS: ${{ inputs.sanitizers }}

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
--- a/.github/workflows/_meta.yml
+++ b/.github/workflows/_meta.yml
@@ -46,7 +46,7 @@ jobs:
        env:
          RUN_KIND: >-
            ${{
-              'storage-rc-pr'
+              false
              || (inputs.github-event-name == 'push'         && github.ref_name == 'main')            && 'push-main'
              || (inputs.github-event-name == 'push'         && github.ref_name == 'release')         && 'storage-release'
              || (inputs.github-event-name == 'push'         && github.ref_name == 'release-compute') && 'compute-release'
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -141,6 +141,8 @@ jobs:
          --ignore test_runner/performance/test_physical_replication.py
          --ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py
          --ignore test_runner/performance/test_cumulative_statistics_persistence.py
+          --ignore test_runner/performance/test_perf_many_relations.py
+          --ignore test_runner/performance/test_perf_oltp_large_tenant.py
      env:
        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -692,15 +692,15 @@ jobs:
                                             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
                                             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64

-  vm-compute-node-image:
+  vm-compute-node-image-arch:
    needs: [ check-permissions, meta, compute-node-image ]
    if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
-    runs-on: [ self-hosted, large ]
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
    strategy:
      fail-fast: false
      matrix:
+        arch: [ amd64, arm64 ]
        version:
-          # see the comment for `compute-node-image-arch` job
          - pg: v14
            debian: bullseye
          - pg: v15
@@ -717,7 +717,7 @@ jobs:

      - name: Downloading vm-builder
        run: |
-          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-amd64 -o vm-builder
+          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder
          chmod +x vm-builder

      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
@@ -738,12 +738,37 @@ jobs:
            -size=2G \
            -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
            -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-            -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-            -target-arch=linux/amd64
+            -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \
+            -target-arch=linux/${{ matrix.arch }}

      - name: Pushing vm-compute-node image
        run: |
-          docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}
+          docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }}
+
+  vm-compute-node-image:
+    needs: [ vm-compute-node-image-arch, meta ]
+    if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        version:
+          # see the comment for `compute-node-image-arch` job
+          - pg: v14
+          - pg: v15
+          - pg: v16
+          - pg: v17
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch compute-node image
+        run: |
+          docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
+                                             neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \
+                                             neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64
+

  test-images:
    needs: [ check-permissions, meta, neon-image, compute-node-image ]
--- a/.github/workflows/force-test-extensions-upgrade.yml
+++ b/.github/workflows/force-test-extensions-upgrade.yml
@@ -52,8 +52,9 @@ jobs:
      - name: Test extension upgrade
        timeout-minutes: 20
        env:
-          NEWTAG: latest
-          OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
+          NEW_COMPUTE_TAG: latest
+          OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
+          TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
          PG_VERSION: ${{ matrix.pg-version }}
          FORCE_ALL_UPGRADE_TESTS: true
        run: ./docker-compose/test_extensions_upgrade.sh
--- a/.github/workflows/large_oltp_benchmark.yml
+++ b/.github/workflows/large_oltp_benchmark.yml
@@ -0,0 +1,147 @@
+name: large oltp benchmark
+
+on:
+  # uncomment to run on push for debugging your PR
+  push:
+    branches: [ bodobolero/synthetic_oltp_workload ]
+
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │  ┌───────────── day of the month (1 - 31)
+    #          │ │  │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │  │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:   '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
+  workflow_dispatch: # adds ability to run this manually
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow globally because we need dedicated resources which only exist once
+  group: large-oltp-bench-workflow
+  cancel-in-progress: true
+
+jobs:
+  oltp:
+    strategy:
+      fail-fast: false # allow other variants to continue even if one fails
+      matrix:
+        include:
+          - target: new_branch 
+            custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 
+          - target: reuse_branch 
+            custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 
+      max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
+    env:
+      TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # todo update to > 1 h 
+      TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }}
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      PG_VERSION: 16 # pre-determined by pre-determined project
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
+      PLATFORM: ${{ matrix.target }}
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    # Increase timeout to 8h, default timeout is 6h
+    timeout-minutes: 480
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Configure AWS credentials # necessary to download artefacts
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+    - name: Create Neon Branch for large tenant
+      if: ${{ matrix.target == 'new_branch' }}
+      id: create-neon-branch-oltp-target
+      uses: ./.github/actions/neon-branch-create
+      with:
+          project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
+          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Set up Connection String
+      id: set-up-connstr
+      run: |
+          case "${{ matrix.target }}" in
+              new_branch)
+              CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }}
+              ;;
+              reuse_branch)
+              CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}
+              ;;
+              *)
+              echo >&2 "Unknown target=${{ matrix.target }}"
+              exit 1
+              ;;
+          esac
+
+          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
+
+    - name: Benchmark pgbench with custom-scripts
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
+        pg_version: ${{ env.PG_VERSION }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Delete Neon Branch for large tenant
+      if: ${{ always() && matrix.target == 'new_branch' }}
+      uses: ./.github/actions/neon-branch-delete
+      with:
+        project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
+        branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Create Allure report
+      id: create-allure-report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+      with:
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+  
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
+        slack-message: |
+          Periodic large oltp perf testing: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -78,8 +78,10 @@ jobs:
      run: |
        if [ -z "$INPUT_COMMIT_HASH" ]; then
          echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
+          echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
        else
          echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
+          echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
        fi

    - name: Start Bench with run_id
@@ -89,7 +91,7 @@ jobs:
        -H 'accept: application/json' \
        -H 'Content-Type: application/json' \
        -H "Authorization: Bearer $API_KEY" \
-        -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
+        -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"

    - name: Poll Test Status
      id: poll_step
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -783,6 +783,28 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "axum-extra"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
+dependencies = [
+ "axum",
+ "axum-core",
+ "bytes",
+ "futures-util",
+ "headers",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "serde",
+ "tower 0.5.2",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "azure_core"
 version = "0.21.0"
@@ -925,9 +947,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"

 [[package]]
 name = "base64"
-version = "0.21.1"
+version = "0.21.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105"
+checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"

 [[package]]
 name = "base64"
@@ -1305,6 +1327,7 @@ dependencies = [
 "aws-sdk-s3",
 "aws-smithy-types",
 "axum",
+ "axum-extra",
 "base64 0.13.1",
 "bytes",
 "camino",
@@ -1316,6 +1339,7 @@ dependencies = [
 "flate2",
 "futures",
 "http 1.1.0",
+ "jsonwebtoken",
 "metrics",
 "nix 0.27.1",
 "notify",
@@ -2297,7 +2321,7 @@ name = "framed-websockets"
 version = "0.1.0"
 source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
 "bytemuck",
 "bytes",
 "futures-core",
@@ -2410,9 +2434,9 @@ checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"

 [[package]]
 name = "futures-timer"
-version = "3.0.2"
+version = "3.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
+checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24"

 [[package]]
 name = "futures-util"
@@ -2515,6 +2539,27 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"

+[[package]]
+name = "governor"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0"
+dependencies = [
+ "cfg-if",
+ "dashmap 6.1.0",
+ "futures-sink",
+ "futures-timer",
+ "futures-util",
+ "no-std-compat",
+ "nonzero_ext",
+ "parking_lot 0.12.1",
+ "portable-atomic",
+ "quanta",
+ "rand 0.8.5",
+ "smallvec",
+ "spinning_top",
+]
+
 [[package]]
 name = "group"
 version = "0.12.1"
@@ -2632,7 +2677,7 @@ version = "7.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
 "byteorder",
 "crossbeam-channel",
 "flate2",
@@ -2640,6 +2685,30 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "headers"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9"
+dependencies = [
+ "base64 0.21.7",
+ "bytes",
+ "headers-core",
+ "http 1.1.0",
+ "httpdate",
+ "mime",
+ "sha1",
+]
+
+[[package]]
+name = "headers-core"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4"
+dependencies = [
+ "http 1.1.0",
+]
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -2777,12 +2846,9 @@ name = "http-utils"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "backtrace",
 "bytes",
 "fail",
- "flate2",
 "hyper 0.14.30",
- "inferno 0.12.0",
 "itertools 0.10.5",
 "jemalloc_pprof",
 "metrics",
@@ -3281,9 +3347,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"

 [[package]]
 name = "jemalloc_pprof"
-version = "0.6.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb"
+checksum = "5622af6d21ff86ed7797ef98e11b8f302da25ec69a7db9f6cde8e2e1c8df9992"
 dependencies = [
 "anyhow",
 "libc",
@@ -3367,7 +3433,7 @@ version = "9.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
 "js-sys",
 "pem",
 "ring",
@@ -3482,9 +3548,9 @@ dependencies = [

 [[package]]
 name = "mappings"
-version = "0.6.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e"
+checksum = "e434981a332777c2b3062652d16a55f8e74fa78e6b1882633f0d77399c84fc2a"
 dependencies = [
 "anyhow",
 "libc",
@@ -3725,6 +3791,12 @@ dependencies = [
 "memoffset 0.9.0",
 ]

+[[package]]
+name = "no-std-compat"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c"
+
 [[package]]
 name = "nom"
 version = "7.1.3"
@@ -3735,6 +3807,12 @@ dependencies = [
 "minimal-lexical",
 ]

+[[package]]
+name = "nonzero_ext"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21"
+
 [[package]]
 name = "notify"
 version = "8.0.0"
@@ -4307,9 +4385,9 @@ dependencies = [

 [[package]]
 name = "papaya"
-version = "0.1.8"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
+checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd"
 dependencies = [
 "equivalent",
 "seize",
@@ -4437,7 +4515,7 @@ version = "3.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
 "serde",
 ]

@@ -4591,6 +4669,12 @@ dependencies = [
 "never-say-never",
 ]

+[[package]]
+name = "portable-atomic"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
+
 [[package]]
 name = "postgres"
 version = "0.19.7"
@@ -4755,12 +4839,14 @@ dependencies = [

 [[package]]
 name = "pprof_util"
-version = "0.6.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781"
+checksum = "9fa015c78eed2130951e22c58d2095849391e73817ab2e74f71b0b9f63dd8416"
 dependencies = [
 "anyhow",
+ "backtrace",
 "flate2",
+ "inferno 0.12.0",
 "num",
 "paste",
 "prost",
@@ -5052,6 +5138,21 @@ dependencies = [
 "zerocopy",
 ]

+[[package]]
+name = "quanta"
+version = "0.12.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e"
+dependencies = [
+ "crossbeam-utils",
+ "libc",
+ "once_cell",
+ "raw-cpuid",
+ "wasi 0.11.0+wasi-snapshot-preview1",
+ "web-sys",
+ "winapi",
+]
+
 [[package]]
 name = "quick-xml"
 version = "0.26.0"
@@ -5182,6 +5283,15 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "raw-cpuid"
+version = "11.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e"
+dependencies = [
+ "bitflags 2.8.0",
+]
+
 [[package]]
 name = "rayon"
 version = "1.7.0"
@@ -5752,7 +5862,7 @@ version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
 ]

 [[package]]
@@ -5761,7 +5871,7 @@ version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
 "rustls-pki-types",
 ]

@@ -6000,9 +6110,9 @@ dependencies = [

 [[package]]
 name = "seize"
-version = "0.4.9"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
+checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7"
 dependencies = [
 "libc",
 "windows-sys 0.52.0",
@@ -6395,6 +6505,15 @@ version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"

+[[package]]
+name = "spinning_top"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300"
+dependencies = [
+ "lock_api",
+]
+
 [[package]]
 name = "spki"
 version = "0.6.0"
@@ -6471,6 +6590,7 @@ dependencies = [
 "diesel_migrations",
 "fail",
 "futures",
+ "governor",
 "hex",
 "http-utils",
 "humantime",
@@ -7285,10 +7405,12 @@ version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
 dependencies = [
+ "base64 0.22.1",
 "bitflags 2.8.0",
 "bytes",
 "http 1.1.0",
 "http-body 1.0.0",
+ "mime",
 "pin-project-lite",
 "tower-layer",
 "tower-service",
@@ -7642,7 +7764,6 @@ dependencies = [
 "anyhow",
 "arc-swap",
 "async-compression",
- "backtrace",
 "bincode",
 "byteorder",
 "bytes",
@@ -8196,7 +8317,7 @@ dependencies = [
 "ahash",
 "anyhow",
 "base64 0.13.1",
- "base64 0.21.1",
+ "base64 0.21.7",
 "base64ct",
 "bytes",
 "camino",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,7 +53,6 @@ anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
-backtrace = "0.3.74"
 flate2 = "1.0.26"
 assert-json-diff = "2"
 async-stream = "0.3"
@@ -68,6 +67,7 @@ aws-credential-types = "1.2.0"
 aws-sigv4 = { version = "1.2", features = ["sign-http"] }
 aws-types = "1.3"
 axum = { version = "0.8.1", features = ["ws"] }
+axum-extra = { version = "0.10.0", features = ["typed-header"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.71"
@@ -95,6 +95,7 @@ futures = "0.3"
 futures-core = "0.3"
 futures-util = "0.3"
 git-version = "0.3"
+governor = "0.8"
 hashbrown = "0.14"
 hashlink = "0.9.1"
 hdrhistogram = "7.5.2"
@@ -113,11 +114,10 @@ hyper-util = "0.1"
 tokio-tungstenite = "0.21.0"
 indexmap = "2"
 indoc = "2"
-inferno = "0.12.0"
 ipnet = "2.10.0"
 itertools = "0.10"
 itoa = "1.0.11"
-jemalloc_pprof = "0.6"
+jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
 jsonwebtoken = "9"
 lasso = "0.7"
 libc = "0.2"
@@ -192,7 +192,7 @@ toml = "0.8"
 toml_edit = "0.22"
 tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
 tower = { version = "0.5.2", default-features = false }
-tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
+tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }

 # This revision uses opentelemetry 0.27. There's no tag for it.
 tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" }
--- a/7
+++ b/7
@@ -11,15 +11,16 @@ ICU_PREFIX_DIR := /usr/local/icu
 #
 BUILD_TYPE ?= debug
 WITH_SANITIZERS ?= no
+PG_CFLAGS = -fsigned-char
 ifeq ($(BUILD_TYPE),release)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl
-	PG_CFLAGS = -O2 -g3 $(CFLAGS)
+	PG_CFLAGS += -O2 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 	# Unfortunately, `--profile=...` is a nightly feature
 	CARGO_BUILD_FLAGS += --release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
-	PG_CFLAGS = -O0 -g3 $(CFLAGS)
+	PG_CFLAGS += -O0 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
@@ -159,6 +160,8 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install
 	+@echo "Compiling pageinspect $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+	+@echo "Compiling pg_trgm $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install
 	+@echo "Compiling amcheck $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
 	+@echo "Compiling test_decoding $*"
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -162,7 +162,7 @@ FROM build-deps AS pg-build
 ARG PG_VERSION
 COPY vendor/postgres-${PG_VERSION:?} postgres
 RUN cd postgres && \
-    export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
+    export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \
    --with-icu --with-libxml --with-libxslt --with-lz4" && \
    if [ "${PG_VERSION:?}" != "v14" ]; then \
        # zstd is available only from PG15
@@ -1933,6 +1933,7 @@ RUN apt update && \
        locales \
        procps \
        ca-certificates \
+        rsyslog \
        $VERSION_INSTALLS && \
    apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
@@ -1978,6 +1979,15 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo
 # Make the libraries we built available
 RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig

+# rsyslog config permissions
+RUN chown postgres:postgres /etc/rsyslog.conf && \
+    touch /etc/compute_rsyslog.conf && \
+    chown -R postgres:postgres /etc/compute_rsyslog.conf && \
+    # directory for rsyslogd pid file
+    mkdir /var/run/rsyslogd && \
+    chown -R postgres:postgres /var/run/rsyslogd
+
+
 ENV LANG=en_US.utf8
 USER postgres
 ENTRYPOINT ["/usr/local/bin/compute_ctl"]
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -39,6 +39,10 @@ commands:
    user: nobody
    sysvInitAction: respawn
    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
+  - name: rsyslogd
+    user: postgres
+    sysvInitAction: respawn
+    shell: '/usr/sbin/rsyslogd -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
 shutdownHook: |
  su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
 files:
@@ -54,7 +58,7 @@ files:
      # regardless of hostname (ALL)
      #
      # Also allow it to shut down the VM. The fast_import job does that when it's finished.
-      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
  - filename: cgconfig.conf
    content: |
      # Configuration for cgroups in VM compute nodes
@@ -69,6 +73,11 @@ files:
          }
          memory {}
      }
+# Create dummy rsyslog config, because it refuses to start without at least one action configured.
+# compute_ctl will rewrite this file with the actual configuration, if needed.
+  - filename: compute_rsyslog.conf
+    content: |
+      *.*    /dev/null
 build: |
  # Build cgroup-tools
  #
@@ -132,6 +141,10 @@ merge: |
  RUN set -e \
      && chmod 0644 /etc/cgconfig.conf

+  COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
+  RUN set -e \
+      && chmod 0644 /etc/compute_rsyslog.conf
+
  COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
  COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
  COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -39,6 +39,10 @@ commands:
    user: nobody
    sysvInitAction: respawn
    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
+  - name: rsyslogd
+    user: postgres
+    sysvInitAction: respawn
+    shell: '/usr/sbin/rsyslogd -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
 shutdownHook: |
  su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
 files:
@@ -54,7 +58,7 @@ files:
      # regardless of hostname (ALL)
      #
      # Also allow it to shut down the VM. The fast_import job does that when it's finished.
-      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
  - filename: cgconfig.conf
    content: |
      # Configuration for cgroups in VM compute nodes
@@ -69,6 +73,11 @@ files:
          }
          memory {}
      }
+# Create dummy rsyslog config, because it refuses to start without at least one action configured.
+# compute_ctl will rewrite this file with the actual configuration, if needed.
+  - filename: compute_rsyslog.conf
+    content: |
+      *.*    /dev/null
 build: |
  # Build cgroup-tools
  #
@@ -128,6 +137,10 @@ merge: |
  RUN set -e \
      && chmod 0644 /etc/cgconfig.conf

+  COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
+  RUN set -e \
+      && chmod 0644 /etc/compute_rsyslog.conf
+
  COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
  COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
  COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -17,6 +17,7 @@ aws-sdk-kms.workspace = true
 aws-smithy-types.workspace = true
 anyhow.workspace = true
 axum = { workspace = true, features = [] }
+axum-extra.workspace = true
 camino.workspace = true
 chrono.workspace = true
 cfg-if.workspace = true
@@ -25,6 +26,7 @@ fail.workspace = true
 flate2.workspace = true
 futures.workspace = true
 http.workspace = true
+jsonwebtoken.workspace = true
 metrics.workspace = true
 nix.workspace = true
 notify.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -33,39 +33,27 @@
 //!             -b /usr/local/bin/postgres \
 //!             -r http://pg-ext-s3-gateway \
 //! ```
-use std::collections::HashMap;
 use std::ffi::OsString;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
-use std::str::FromStr;
-use std::sync::atomic::Ordering;
-use std::sync::{Arc, Condvar, Mutex, RwLock, mpsc};
+use std::sync::mpsc;
 use std::thread;
 use std::time::Duration;

 use anyhow::{Context, Result};
-use chrono::Utc;
 use clap::Parser;
-use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
+use compute_api::responses::ComputeCtlConfig;
 use compute_api::spec::ComputeSpec;
-use compute_tools::compute::{
-    ComputeNode, ComputeState, PG_PID, ParsedSpec, forward_termination_signal,
-};
-use compute_tools::configurator::launch_configurator;
-use compute_tools::disk_quota::set_disk_quota;
+use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal};
 use compute_tools::extension_server::get_pg_version_string;
-use compute_tools::http::server::Server;
 use compute_tools::logger::*;
-use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
-use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
-use compute_tools::swap::resize_swap;
 use rlimit::{Resource, setrlimit};
 use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
 use signal_hook::iterator::Signals;
-use tracing::{error, info, warn};
+use tracing::{error, info};
 use url::Url;
 use utils::failpoint_support;

@@ -164,29 +152,41 @@ fn main() -> Result<()> {
    // enable core dumping for all child processes
    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;

-    let (pg_handle, start_pg_result) = {
-        // Enter startup tracing context
-        let _startup_context_guard = startup_context_from_env();
+    let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;

-        let cli_spec = try_spec_from_cli(&cli)?;
+    let cli_spec = try_spec_from_cli(&cli)?;

-        let compute = wait_spec(build_tag, &cli, cli_spec)?;
+    let compute_node = ComputeNode::new(
+        ComputeNodeParams {
+            compute_id: cli.compute_id,
+            connstr,
+            pgdata: cli.pgdata.clone(),
+            pgbin: cli.pgbin.clone(),
+            pgversion: get_pg_version_string(&cli.pgbin),
+            external_http_port: cli.external_http_port,
+            internal_http_port: cli.internal_http_port,
+            ext_remote_storage: cli.remote_ext_config.clone(),
+            resize_swap_on_bind: cli.resize_swap_on_bind,
+            set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
+            #[cfg(target_os = "linux")]
+            filecache_connstr: cli.filecache_connstr,
+            #[cfg(target_os = "linux")]
+            cgroup: cli.cgroup,
+            #[cfg(target_os = "linux")]
+            vm_monitor_addr: cli.vm_monitor_addr,
+            build_tag,

-        start_postgres(&cli, compute)?
+            live_config_allowed: cli_spec.live_config_allowed,
+        },
+        cli_spec.spec,
+        cli_spec.compute_ctl_config,
+    )?;

-        // Startup is finished, exit the startup tracing span
-    };
-
-    // PostgreSQL is now running, if startup was successful. Wait until it exits.
-    let wait_pg_result = wait_postgres(pg_handle)?;
-
-    let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
-
-    maybe_delay_exit(delay_exit);
+    let exit_code = compute_node.run()?;

    scenario.teardown();

-    deinit_and_exit(wait_pg_result);
+    deinit_and_exit(exit_code);
 }

 async fn init() -> Result<String> {
@@ -207,56 +207,6 @@ async fn init() -> Result<String> {
    Ok(build_tag)
 }

-fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
-    // Extract OpenTelemetry context for the startup actions from the
-    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
-    // tracing context.
-    //
-    // This is used to propagate the context for the 'start_compute' operation
-    // from the neon control plane. This allows linking together the wider
-    // 'start_compute' operation that creates the compute container, with the
-    // startup actions here within the container.
-    //
-    // There is no standard for passing context in env variables, but a lot of
-    // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
-    // https://github.com/open-telemetry/opentelemetry-specification/issues/740
-    //
-    // Switch to the startup context here, and exit it once the startup has
-    // completed and Postgres is up and running.
-    //
-    // If this pod is pre-created without binding it to any particular endpoint
-    // yet, this isn't the right place to enter the startup context. In that
-    // case, the control plane should pass the tracing context as part of the
-    // /configure API call.
-    //
-    // NOTE: This is supposed to only cover the *startup* actions. Once
-    // postgres is configured and up-and-running, we exit this span. Any other
-    // actions that are performed on incoming HTTP requests, for example, are
-    // performed in separate spans.
-    //
-    // XXX: If the pod is restarted, we perform the startup actions in the same
-    // context as the original startup actions, which probably doesn't make
-    // sense.
-    let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
-    if let Ok(val) = std::env::var("TRACEPARENT") {
-        startup_tracing_carrier.insert("traceparent".to_string(), val);
-    }
-    if let Ok(val) = std::env::var("TRACESTATE") {
-        startup_tracing_carrier.insert("tracestate".to_string(), val);
-    }
-    if !startup_tracing_carrier.is_empty() {
-        use opentelemetry::propagation::TextMapPropagator;
-        use opentelemetry_sdk::propagation::TraceContextPropagator;
-        let guard = TraceContextPropagator::new()
-            .extract(&startup_tracing_carrier)
-            .attach();
-        info!("startup tracing context attached");
-        Some(guard)
-    } else {
-        None
-    }
-}
-
 fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
    // First, try to get cluster spec from the cli argument
    if let Some(ref spec_json) = cli.spec_json {
@@ -307,357 +257,7 @@ struct CliSpecParams {
    live_config_allowed: bool,
 }

-fn wait_spec(
-    build_tag: String,
-    cli: &Cli,
-    CliSpecParams {
-        spec,
-        live_config_allowed,
-        compute_ctl_config: _,
-    }: CliSpecParams,
-) -> Result<Arc<ComputeNode>> {
-    let mut new_state = ComputeState::new();
-    let spec_set;
-
-    if let Some(spec) = spec {
-        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
-        info!("new pspec.spec: {:?}", pspec.spec);
-        new_state.pspec = Some(pspec);
-        spec_set = true;
-    } else {
-        spec_set = false;
-    }
-    let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
-    let conn_conf = postgres::config::Config::from_str(connstr.as_str())
-        .context("cannot build postgres config from connstr")?;
-    let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
-        .context("cannot build tokio postgres config from connstr")?;
-    let compute_node = ComputeNode {
-        compute_id: cli.compute_id.clone(),
-        connstr,
-        conn_conf,
-        tokio_conn_conf,
-        pgdata: cli.pgdata.clone(),
-        pgbin: cli.pgbin.clone(),
-        pgversion: get_pg_version_string(&cli.pgbin),
-        external_http_port: cli.external_http_port,
-        internal_http_port: cli.internal_http_port,
-        live_config_allowed,
-        state: Mutex::new(new_state),
-        state_changed: Condvar::new(),
-        ext_remote_storage: cli.remote_ext_config.clone(),
-        ext_download_progress: RwLock::new(HashMap::new()),
-        build_tag,
-    };
-    let compute = Arc::new(compute_node);
-
-    // If this is a pooled VM, prewarm before starting HTTP server and becoming
-    // available for binding. Prewarming helps Postgres start quicker later,
-    // because QEMU will already have its memory allocated from the host, and
-    // the necessary binaries will already be cached.
-    if !spec_set {
-        compute.prewarm_postgres()?;
-    }
-
-    // Launch the external HTTP server first, so that we can serve control plane
-    // requests while configuration is still in progress.
-    Server::External(cli.external_http_port).launch(&compute);
-
-    // The internal HTTP server could be launched later, but there isn't much
-    // sense in waiting.
-    Server::Internal(cli.internal_http_port).launch(&compute);
-
-    if !spec_set {
-        // No spec provided, hang waiting for it.
-        info!("no compute spec provided, waiting");
-
-        let mut state = compute.state.lock().unwrap();
-        while state.status != ComputeStatus::ConfigurationPending {
-            state = compute.state_changed.wait(state).unwrap();
-
-            if state.status == ComputeStatus::ConfigurationPending {
-                info!("got spec, continue configuration");
-                // Spec is already set by the http server handler.
-                break;
-            }
-        }
-
-        // Record for how long we slept waiting for the spec.
-        let now = Utc::now();
-        state.metrics.wait_for_spec_ms = now
-            .signed_duration_since(state.start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-
-        // Reset start time, so that the total startup time that is calculated later will
-        // not include the time that we waited for the spec.
-        state.start_time = now;
-    }
-
-    launch_lsn_lease_bg_task_for_static(&compute);
-
-    Ok(compute)
-}
-
-fn start_postgres(
-    cli: &Cli,
-    compute: Arc<ComputeNode>,
-) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
-    // We got all we need, update the state.
-    let mut state = compute.state.lock().unwrap();
-
-    // Create a tracing span for the startup operation.
-    //
-    // We could otherwise just annotate the function with #[instrument], but if
-    // we're being configured from a /configure HTTP request, we want the
-    // startup to be considered part of the /configure request.
-    let _this_entered = {
-        // Temporarily enter the /configure request's span, so that the new span
-        // becomes its child.
-        let _parent_entered = state.startup_span.take().map(|p| p.entered());
-
-        tracing::info_span!("start_postgres")
-    }
-    .entered();
-
-    state.set_status(ComputeStatus::Init, &compute.state_changed);
-
-    info!(
-        "running compute with features: {:?}",
-        state.pspec.as_ref().unwrap().spec.features
-    );
-    // before we release the mutex, fetch some parameters for later.
-    let &ComputeSpec {
-        swap_size_bytes,
-        disk_quota_bytes,
-        #[cfg(target_os = "linux")]
-        disable_lfc_resizing,
-        ..
-    } = &state.pspec.as_ref().unwrap().spec;
-    drop(state);
-
-    // Launch remaining service threads
-    let _monitor_handle = launch_monitor(&compute);
-    let _configurator_handle = launch_configurator(&compute);
-
-    let mut prestartup_failed = false;
-    let mut delay_exit = false;
-
-    // Resize swap to the desired size if the compute spec says so
-    if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) {
-        // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
-        // *before* starting postgres.
-        //
-        // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
-        // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
-        // OOM-killed during startup because swap wasn't available yet.
-        match resize_swap(size_bytes) {
-            Ok(()) => {
-                let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
-                info!(%size_bytes, %size_mib, "resized swap");
-            }
-            Err(err) => {
-                let err = err.context("failed to resize swap");
-                error!("{err:#}");
-
-                // Mark compute startup as failed; don't try to start postgres, and report this
-                // error to the control plane when it next asks.
-                prestartup_failed = true;
-                compute.set_failed_status(err);
-                delay_exit = true;
-            }
-        }
-    }
-
-    // Set disk quota if the compute spec says so
-    if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
-        (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref())
-    {
-        match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) {
-            Ok(()) => {
-                let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
-                info!(%disk_quota_bytes, %size_mib, "set disk quota");
-            }
-            Err(err) => {
-                let err = err.context("failed to set disk quota");
-                error!("{err:#}");
-
-                // Mark compute startup as failed; don't try to start postgres, and report this
-                // error to the control plane when it next asks.
-                prestartup_failed = true;
-                compute.set_failed_status(err);
-                delay_exit = true;
-            }
-        }
-    }
-
-    // Start Postgres
-    let mut pg = None;
-    if !prestartup_failed {
-        pg = match compute.start_compute() {
-            Ok(pg) => {
-                info!(postmaster_pid = %pg.0.id(), "Postgres was started");
-                Some(pg)
-            }
-            Err(err) => {
-                error!("could not start the compute node: {:#}", err);
-                compute.set_failed_status(err);
-                delay_exit = true;
-                None
-            }
-        };
-    } else {
-        warn!("skipping postgres startup because pre-startup step failed");
-    }
-
-    // Start the vm-monitor if directed to. The vm-monitor only runs on linux
-    // because it requires cgroups.
-    cfg_if::cfg_if! {
-        if #[cfg(target_os = "linux")] {
-            use std::env;
-            use tokio_util::sync::CancellationToken;
-
-            // This token is used internally by the monitor to clean up all threads
-            let token = CancellationToken::new();
-
-            // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC
-            let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
-                None
-            } else {
-                Some(cli.filecache_connstr.clone())
-            };
-
-            let vm_monitor = if env::var_os("AUTOSCALING").is_some() {
-                let vm_monitor = tokio::spawn(vm_monitor::start(
-                    Box::leak(Box::new(vm_monitor::Args {
-                        cgroup: Some(cli.cgroup.clone()),
-                        pgconnstr,
-                        addr: cli.vm_monitor_addr.clone(),
-                    })),
-                    token.clone(),
-                ));
-                Some(vm_monitor)
-            } else {
-                None
-            };
-        }
-    }
-
-    Ok((
-        pg,
-        StartPostgresResult {
-            delay_exit,
-            compute,
-            #[cfg(target_os = "linux")]
-            token,
-            #[cfg(target_os = "linux")]
-            vm_monitor,
-        },
-    ))
-}
-
-type PostgresHandle = (std::process::Child, tokio::task::JoinHandle<Result<()>>);
-
-struct StartPostgresResult {
-    delay_exit: bool,
-    // passed through from WaitSpecResult
-    compute: Arc<ComputeNode>,
-
-    #[cfg(target_os = "linux")]
-    token: tokio_util::sync::CancellationToken,
-    #[cfg(target_os = "linux")]
-    vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
-}
-
-fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
-    // Wait for the child Postgres process forever. In this state Ctrl+C will
-    // propagate to Postgres and it will be shut down as well.
-    let mut exit_code = None;
-    if let Some((mut pg, logs_handle)) = pg {
-        info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
-
-        let ecode = pg
-            .wait()
-            .expect("failed to start waiting on Postgres process");
-        PG_PID.store(0, Ordering::SeqCst);
-
-        // Process has exited. Wait for the log collecting task to finish.
-        let _ = tokio::runtime::Handle::current()
-            .block_on(logs_handle)
-            .map_err(|e| tracing::error!("log task panicked: {:?}", e));
-
-        info!("Postgres exited with code {}, shutting down", ecode);
-        exit_code = ecode.code()
-    }
-
-    Ok(WaitPostgresResult { exit_code })
-}
-
-struct WaitPostgresResult {
-    exit_code: Option<i32>,
-}
-
-fn cleanup_after_postgres_exit(
-    StartPostgresResult {
-        mut delay_exit,
-        compute,
-        #[cfg(target_os = "linux")]
-        vm_monitor,
-        #[cfg(target_os = "linux")]
-        token,
-    }: StartPostgresResult,
-) -> Result<bool> {
-    // Terminate the vm_monitor so it releases the file watcher on
-    // /sys/fs/cgroup/neon-postgres.
-    // Note: the vm-monitor only runs on linux because it requires cgroups.
-    cfg_if::cfg_if! {
-        if #[cfg(target_os = "linux")] {
-            if let Some(handle) = vm_monitor {
-                // Kills all threads spawned by the monitor
-                token.cancel();
-                // Kills the actual task running the monitor
-                handle.abort();
-            }
-        }
-    }
-
-    // Maybe sync safekeepers again, to speed up next startup
-    let compute_state = compute.state.lock().unwrap().clone();
-    let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-    if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
-        info!("syncing safekeepers on shutdown");
-        let storage_auth_token = pspec.storage_auth_token.clone();
-        let lsn = compute.sync_safekeepers(storage_auth_token)?;
-        info!("synced safekeepers at lsn {lsn}");
-    }
-
-    let mut state = compute.state.lock().unwrap();
-    if state.status == ComputeStatus::TerminationPending {
-        state.status = ComputeStatus::Terminated;
-        compute.state_changed.notify_all();
-        // we were asked to terminate gracefully, don't exit to avoid restart
-        delay_exit = true
-    }
-    drop(state);
-
-    if let Err(err) = compute.check_for_core_dumps() {
-        error!("error while checking for core dumps: {err:?}");
-    }
-
-    Ok(delay_exit)
-}
-
-fn maybe_delay_exit(delay_exit: bool) {
-    // If launch failed, keep serving HTTP requests for a while, so the cloud
-    // control plane can get the actual error.
-    if delay_exit {
-        info!("giving control plane 30s to collect the error before shutdown");
-        thread::sleep(Duration::from_secs(30));
-    }
-}
-
-fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
+fn deinit_and_exit(exit_code: Option<i32>) -> ! {
    // Shutdown trace pipeline gracefully, so that it has a chance to send any
    // pending traces before we exit. Shutting down OTEL tracing provider may
    // hang for quite some time, see, for example:
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -58,14 +58,14 @@ pub async fn get_database_schema(
    compute: &Arc<ComputeNode>,
    dbname: &str,
 ) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>> + use<>, SchemaDumpError> {
-    let pgbin = &compute.pgbin;
+    let pgbin = &compute.params.pgbin;
    let basepath = Path::new(pgbin).parent().unwrap();
    let pgdump = basepath.join("pg_dump");

    // Replace the DB in the connection string and disable it to parts.
    // This is the only option to handle DBs with special characters.
-    let conf =
-        postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?;
+    let conf = postgres_conf_for_db(&compute.params.connstr, dbname)
+        .map_err(|_| SchemaDumpError::Unexpected)?;
    let host = conf
        .get_hosts()
        .first()
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -1,12 +1,16 @@
+use anyhow::Result;
+use std::fmt::Write as FmtWrite;
 use std::fs::{File, OpenOptions};
 use std::io;
+use std::io::Write;
 use std::io::prelude::*;
 use std::path::Path;

-use anyhow::Result;
-use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
+use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};

-use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize, escape_conf_value};
+use crate::pg_helpers::{
+    GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
+};

 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.
@@ -55,10 +59,20 @@ pub fn write_postgres_conf(
        writeln!(file, "neon.stripe_size={stripe_size}")?;
    }
    if !spec.safekeeper_connstrings.is_empty() {
+        let mut neon_safekeepers_value = String::new();
+        tracing::info!(
+            "safekeepers_connstrings is not zero, gen: {:?}",
+            spec.safekeepers_generation
+        );
+        // If generation is given, prepend sk list with g#number:
+        if let Some(generation) = spec.safekeepers_generation {
+            write!(neon_safekeepers_value, "g#{}:", generation)?;
+        }
+        neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(","));
        writeln!(
            file,
            "neon.safekeepers={}",
-            escape_conf_value(&spec.safekeeper_connstrings.join(","))
+            escape_conf_value(&neon_safekeepers_value)
        )?;
    }
    if let Some(s) = &spec.tenant_id {
@@ -126,6 +140,54 @@ pub fn write_postgres_conf(
        writeln!(file, "# Managed by compute_ctl: end")?;
    }

+    // If audit logging is enabled, configure pgaudit.
+    //
+    // Note, that this is called after the settings from spec are written.
+    // This way we always override the settings from the spec
+    // and don't allow the user or the control plane admin to change them.
+    if let ComputeAudit::Hipaa = spec.audit_log_level {
+        writeln!(file, "# Managed by compute_ctl audit settings: begin")?;
+        // This log level is very verbose
+        // but this is necessary for HIPAA compliance.
+        writeln!(file, "pgaudit.log='all'")?;
+        writeln!(file, "pgaudit.log_parameter=on")?;
+        // Disable logging of catalog queries
+        // The catalog doesn't contain sensitive data, so we don't need to audit it.
+        writeln!(file, "pgaudit.log_catalog=off")?;
+        // Set log rotation to 5 minutes
+        // TODO: tune this after performance testing
+        writeln!(file, "pgaudit.log_rotation_age=5")?;
+
+        // Add audit shared_preload_libraries, if they are not present.
+        //
+        // The caller who sets the flag is responsible for ensuring that the necessary
+        // shared_preload_libraries are present in the compute image,
+        // otherwise the compute start will fail.
+        if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+            let mut extra_shared_preload_libraries = String::new();
+            if !libs.contains("pgaudit") {
+                extra_shared_preload_libraries.push_str(",pgaudit");
+            }
+            if !libs.contains("pgauditlogtofile") {
+                extra_shared_preload_libraries.push_str(",pgauditlogtofile");
+            }
+            writeln!(
+                file,
+                "shared_preload_libraries='{}{}'",
+                libs, extra_shared_preload_libraries
+            )?;
+        } else {
+            // Typically, this should be unreacheable,
+            // because we always set at least some shared_preload_libraries in the spec
+            // but let's handle it explicitly anyway.
+            writeln!(
+                file,
+                "shared_preload_libraries='neon,pgaudit,pgauditlogtofile'"
+            )?;
+        }
+        writeln!(file, "# Managed by compute_ctl audit settings: end")?;
+    }
+
    writeln!(file, "neon.extension_server_port={}", extension_server_port)?;

    if spec.drop_subscriptions_before_start {
--- a/compute_tools/src/config_template/compute_rsyslog_template.conf
+++ b/compute_tools/src/config_template/compute_rsyslog_template.conf
@@ -0,0 +1,10 @@
+# Load imfile module to read log files
+module(load="imfile")
+
+# Input configuration for log files in the specified directory
+# Replace {log_directory} with the directory containing the log files
+input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
+global(workDirectory="/var/log")
+
+# Forward logs to remote syslog server
+*.* @@{remote_endpoint}
--- a/compute_tools/src/http/extract/mod.rs
+++ b/compute_tools/src/http/extract/mod.rs
@@ -1,7 +1,9 @@
 pub(crate) mod json;
 pub(crate) mod path;
 pub(crate) mod query;
+pub(crate) mod request_id;

 pub(crate) use json::Json;
 pub(crate) use path::Path;
 pub(crate) use query::Query;
+pub(crate) use request_id::RequestId;
--- a/compute_tools/src/http/extract/request_id.rs
+++ b/compute_tools/src/http/extract/request_id.rs
@@ -0,0 +1,86 @@
+use std::{
+    fmt::Display,
+    ops::{Deref, DerefMut},
+};
+
+use axum::{extract::FromRequestParts, response::IntoResponse};
+use http::{StatusCode, request::Parts};
+
+use crate::http::{JsonResponse, headers::X_REQUEST_ID};
+
+/// Extract the request ID from the `X-Request-Id` header.
+#[derive(Debug, Clone, Default)]
+pub(crate) struct RequestId(pub String);
+
+#[derive(Debug)]
+/// Rejection used for [`RequestId`].
+///
+/// Contains one variant for each way the [`RequestId`] extractor can
+/// fail.
+pub(crate) enum RequestIdRejection {
+    /// The request is missing the header.
+    MissingRequestId,
+
+    /// The value of the header is invalid UTF-8.
+    InvalidUtf8,
+}
+
+impl RequestIdRejection {
+    pub fn status(&self) -> StatusCode {
+        match self {
+            RequestIdRejection::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR,
+            RequestIdRejection::InvalidUtf8 => StatusCode::BAD_REQUEST,
+        }
+    }
+
+    pub fn message(&self) -> String {
+        match self {
+            RequestIdRejection::MissingRequestId => "request ID is missing",
+            RequestIdRejection::InvalidUtf8 => "request ID is invalid UTF-8",
+        }
+        .to_string()
+    }
+}
+
+impl IntoResponse for RequestIdRejection {
+    fn into_response(self) -> axum::response::Response {
+        JsonResponse::error(self.status(), self.message())
+    }
+}
+
+impl<S> FromRequestParts<S> for RequestId
+where
+    S: Send + Sync,
+{
+    type Rejection = RequestIdRejection;
+
+    async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
+        match parts.headers.get(X_REQUEST_ID) {
+            Some(value) => match value.to_str() {
+                Ok(request_id) => Ok(Self(request_id.to_string())),
+                Err(_) => Err(RequestIdRejection::InvalidUtf8),
+            },
+            None => Err(RequestIdRejection::MissingRequestId),
+        }
+    }
+}
+
+impl Deref for RequestId {
+    type Target = String;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl DerefMut for RequestId {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+impl Display for RequestId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(&self.0)
+    }
+}
--- a/compute_tools/src/http/headers.rs
+++ b/compute_tools/src/http/headers.rs
@@ -0,0 +1,2 @@
+/// Constant for `X-Request-Id` header.
+pub const X_REQUEST_ID: &str = "x-request-id";
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -0,0 +1,145 @@
+use std::{collections::HashSet, net::SocketAddr};
+
+use anyhow::{Result, anyhow};
+use axum::{RequestExt, body::Body, extract::ConnectInfo};
+use axum_extra::{
+    TypedHeader,
+    headers::{Authorization, authorization::Bearer},
+};
+use futures::future::BoxFuture;
+use http::{Request, Response, StatusCode};
+use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
+use serde::Deserialize;
+use tower_http::auth::AsyncAuthorizeRequest;
+use tracing::warn;
+
+use crate::http::{JsonResponse, extract::RequestId};
+
+#[derive(Clone, Debug, Deserialize)]
+pub(in crate::http) struct Claims {
+    compute_id: String,
+}
+
+#[derive(Clone, Debug)]
+pub(in crate::http) struct Authorize {
+    compute_id: String,
+    jwks: JwkSet,
+    validation: Validation,
+}
+
+impl Authorize {
+    pub fn new(compute_id: String, jwks: JwkSet) -> Self {
+        let mut validation = Validation::new(Algorithm::EdDSA);
+        // Nothing is currently required
+        validation.required_spec_claims = HashSet::new();
+        validation.validate_exp = true;
+        // Unused by the control plane
+        validation.validate_aud = false;
+        // Unused by the control plane
+        validation.validate_nbf = false;
+
+        Self {
+            compute_id,
+            jwks,
+            validation,
+        }
+    }
+}
+
+impl AsyncAuthorizeRequest<Body> for Authorize {
+    type RequestBody = Body;
+    type ResponseBody = Body;
+    type Future = BoxFuture<'static, Result<Request<Body>, Response<Self::ResponseBody>>>;
+
+    fn authorize(&mut self, mut request: Request<Body>) -> Self::Future {
+        let compute_id = self.compute_id.clone();
+        let jwks = self.jwks.clone();
+        let validation = self.validation.clone();
+
+        Box::pin(async move {
+            let request_id = request.extract_parts::<RequestId>().await.unwrap();
+
+            // TODO: Remove this check after a successful rollout
+            if jwks.keys.is_empty() {
+                warn!(%request_id, "Authorization has not been configured");
+
+                return Ok(request);
+            }
+
+            let connect_info = request
+                .extract_parts::<ConnectInfo<SocketAddr>>()
+                .await
+                .unwrap();
+
+            // In the event the request is coming from the loopback interface,
+            // allow all requests
+            if connect_info.ip().is_loopback() {
+                warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
+
+                return Ok(request);
+            }
+
+            let TypedHeader(Authorization(bearer)) = request
+                .extract_parts::<TypedHeader<Authorization<Bearer>>>()
+                .await
+                .map_err(|_| {
+                    JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token")
+                })?;
+
+            let data = match Self::verify(&jwks, bearer.token(), &validation) {
+                Ok(claims) => claims,
+                Err(e) => return Err(JsonResponse::error(StatusCode::UNAUTHORIZED, e)),
+            };
+
+            if data.claims.compute_id != compute_id {
+                return Err(JsonResponse::error(
+                    StatusCode::UNAUTHORIZED,
+                    "invalid claims in authorization token",
+                ));
+            }
+
+            // Make claims available to any subsequent middleware or request
+            // handlers
+            request.extensions_mut().insert(data.claims);
+
+            Ok(request)
+        })
+    }
+}
+
+impl Authorize {
+    /// Verify the token using the JSON Web Key set and return the token data.
+    fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
+        debug_assert!(!jwks.keys.is_empty());
+
+        for jwk in jwks.keys.iter() {
+            let decoding_key = match DecodingKey::from_jwk(jwk) {
+                Ok(key) => key,
+                Err(e) => {
+                    warn!(
+                        "Failed to construct decoding key from {}: {}",
+                        jwk.common.key_id.as_ref().unwrap(),
+                        e
+                    );
+
+                    continue;
+                }
+            };
+
+            match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
+                Ok(data) => return Ok(data),
+                Err(e) => {
+                    warn!(
+                        "Failed to decode authorization token using {}: {}",
+                        jwk.common.key_id.as_ref().unwrap(),
+                        e
+                    );
+
+                    continue;
+                }
+            }
+        }
+
+        Err(anyhow!("Failed to verify authorization token"))
+    }
+}
--- a/compute_tools/src/http/middleware/mod.rs
+++ b/compute_tools/src/http/middleware/mod.rs
@@ -0,0 +1 @@
+pub(in crate::http) mod authorize;
--- a/compute_tools/src/http/mod.rs
+++ b/compute_tools/src/http/mod.rs
@@ -7,6 +7,8 @@ use serde::Serialize;
 use tracing::error;

 mod extract;
+mod headers;
+mod middleware;
 mod routes;
 pub mod server;

--- a/compute_tools/src/http/routes/configure.rs
+++ b/compute_tools/src/http/routes/configure.rs
@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
    State(compute): State<Arc<ComputeNode>>,
    request: Json<ConfigurationRequest>,
 ) -> Response {
-    if !compute.live_config_allowed {
+    if !compute.params.live_config_allowed {
        return JsonResponse::error(
            StatusCode::PRECONDITION_FAILED,
            "live configuration is not allowed for this compute node".to_string(),
--- a/compute_tools/src/http/routes/extension_server.rs
+++ b/compute_tools/src/http/routes/extension_server.rs
@@ -18,11 +18,11 @@ pub(in crate::http) struct ExtensionServerParams {
 /// Download a remote extension.
 pub(in crate::http) async fn download_extension(
    Path(filename): Path<String>,
-    params: Query<ExtensionServerParams>,
+    ext_server_params: Query<ExtensionServerParams>,
    State(compute): State<Arc<ComputeNode>>,
 ) -> Response {
    // Don't even try to download extensions if no remote storage is configured
-    if compute.ext_remote_storage.is_none() {
+    if compute.params.ext_remote_storage.is_none() {
        return JsonResponse::error(
            StatusCode::PRECONDITION_FAILED,
            "remote storage is not configured",
@@ -46,9 +46,9 @@ pub(in crate::http) async fn download_extension(

        remote_extensions.get_ext(
            &filename,
-            params.is_library,
-            &compute.build_tag,
-            &compute.pgversion,
+            ext_server_params.is_library,
+            &compute.params.build_tag,
+            &compute.params.pgversion,
        )
    };

--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -10,48 +10,58 @@ use axum::middleware::{self, Next};
 use axum::response::{IntoResponse, Response};
 use axum::routing::{get, post};
 use http::StatusCode;
+use jsonwebtoken::jwk::JwkSet;
 use tokio::net::TcpListener;
 use tower::ServiceBuilder;
-use tower_http::request_id::PropagateRequestIdLayer;
-use tower_http::trace::TraceLayer;
-use tracing::{Span, debug, error, info};
+use tower_http::{
+    auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer,
+};
+use tracing::{Span, error, info};
 use uuid::Uuid;

-use super::routes::{
-    check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
-    grants, insights, metrics, metrics_json, status, terminate,
+use super::{
+    headers::X_REQUEST_ID,
+    middleware::authorize::Authorize,
+    routes::{
+        check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
+        grants, insights, metrics, metrics_json, status, terminate,
+    },
 };
 use crate::compute::ComputeNode;

-const X_REQUEST_ID: &str = "x-request-id";
-
 /// `compute_ctl` has two servers: internal and external. The internal server
 /// binds to the loopback interface and handles communication from clients on
 /// the compute. The external server is what receives communication from the
 /// control plane, the metrics scraper, etc. We make the distinction because
 /// certain routes in `compute_ctl` only need to be exposed to local processes
 /// like Postgres via the neon extension and local_proxy.
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Debug)]
 pub enum Server {
-    Internal(u16),
-    External(u16),
+    Internal {
+        port: u16,
+    },
+    External {
+        port: u16,
+        jwks: JwkSet,
+        compute_id: String,
+    },
 }

 impl Display for Server {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
-            Server::Internal(_) => f.write_str("internal"),
-            Server::External(_) => f.write_str("external"),
+            Server::Internal { .. } => f.write_str("internal"),
+            Server::External { .. } => f.write_str("external"),
        }
    }
 }

-impl From<Server> for Router<Arc<ComputeNode>> {
-    fn from(server: Server) -> Self {
+impl From<&Server> for Router<Arc<ComputeNode>> {
+    fn from(server: &Server) -> Self {
        let mut router = Router::<Arc<ComputeNode>>::new();

        router = match server {
-            Server::Internal(_) => {
+            Server::Internal { .. } => {
                router = router
                    .route(
                        "/extension_server/{*filename}",
@@ -69,59 +79,71 @@ impl From<Server> for Router<Arc<ComputeNode>> {

                router
            }
-            Server::External(_) => router
-                .route("/check_writability", post(check_writability::is_writable))
-                .route("/configure", post(configure::configure))
-                .route("/database_schema", get(database_schema::get_schema_dump))
-                .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
-                .route("/insights", get(insights::get_insights))
-                .route("/metrics", get(metrics::get_metrics))
-                .route("/metrics.json", get(metrics_json::get_metrics))
-                .route("/status", get(status::get_status))
-                .route("/terminate", post(terminate::terminate)),
+            Server::External {
+                jwks, compute_id, ..
+            } => {
+                let unauthenticated_router =
+                    Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
+
+                let authenticated_router = Router::<Arc<ComputeNode>>::new()
+                    .route("/check_writability", post(check_writability::is_writable))
+                    .route("/configure", post(configure::configure))
+                    .route("/database_schema", get(database_schema::get_schema_dump))
+                    .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
+                    .route("/insights", get(insights::get_insights))
+                    .route("/metrics.json", get(metrics_json::get_metrics))
+                    .route("/status", get(status::get_status))
+                    .route("/terminate", post(terminate::terminate))
+                    .layer(AsyncRequireAuthorizationLayer::new(Authorize::new(
+                        compute_id.clone(),
+                        jwks.clone(),
+                    )));
+
+                router
+                    .merge(unauthenticated_router)
+                    .merge(authenticated_router)
+            }
        };

-        router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
-            ServiceBuilder::new()
-                // Add this middleware since we assume the request ID exists
-                .layer(middleware::from_fn(maybe_add_request_id_header))
-                .layer(
-                    TraceLayer::new_for_http()
-                        .on_request(|request: &http::Request<_>, _span: &Span| {
-                            let request_id = request
-                                .headers()
-                                .get(X_REQUEST_ID)
-                                .unwrap()
-                                .to_str()
-                                .unwrap();
-
-                            match request.uri().path() {
-                                "/metrics" => {
-                                    debug!(%request_id, "{} {}", request.method(), request.uri())
-                                }
-                                _ => info!(%request_id, "{} {}", request.method(), request.uri()),
-                            };
-                        })
-                        .on_response(
-                            |response: &http::Response<_>, latency: Duration, _span: &Span| {
-                                let request_id = response
+        router
+            .fallback(Server::handle_404)
+            .method_not_allowed_fallback(Server::handle_405)
+            .layer(
+                ServiceBuilder::new()
+                    .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
+                    // Add this middleware since we assume the request ID exists
+                    .layer(middleware::from_fn(maybe_add_request_id_header))
+                    .layer(
+                        TraceLayer::new_for_http()
+                            .on_request(|request: &http::Request<_>, _span: &Span| {
+                                let request_id = request
                                    .headers()
                                    .get(X_REQUEST_ID)
                                    .unwrap()
                                    .to_str()
                                    .unwrap();

-                                info!(
-                                    %request_id,
-                                    code = response.status().as_u16(),
-                                    latency = latency.as_millis()
-                                )
-                            },
-                        ),
-                )
-                .layer(PropagateRequestIdLayer::x_request_id()),
-        )
-            .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
+                                info!(%request_id, "{} {}", request.method(), request.uri());
+                            })
+                            .on_response(
+                                |response: &http::Response<_>, latency: Duration, _span: &Span| {
+                                    let request_id = response
+                                        .headers()
+                                        .get(X_REQUEST_ID)
+                                        .unwrap()
+                                        .to_str()
+                                        .unwrap();
+
+                                    info!(
+                                        %request_id,
+                                        code = response.status().as_u16(),
+                                        latency = latency.as_millis()
+                                    );
+                                },
+                            ),
+                    )
+                    .layer(PropagateRequestIdLayer::x_request_id()),
+            )
    }
 }

@@ -145,15 +167,15 @@ impl Server {
        match self {
            // TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
            // allow binding to localhost
-            Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
-            Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
+            Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
+            Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
        }
    }

-    fn port(self) -> u16 {
+    fn port(&self) -> u16 {
        match self {
-            Server::Internal(port) => port,
-            Server::External(port) => port,
+            Server::Internal { port, .. } => *port,
+            Server::External { port, .. } => *port,
        }
    }

@@ -180,7 +202,9 @@ impl Server {
            );
        }

-        let router = Router::from(self).with_state(compute);
+        let router = Router::from(&self)
+            .with_state(compute)
+            .into_make_service_with_connect_info::<SocketAddr>();

        if let Err(e) = axum::serve(listener, router).await {
            error!("compute_ctl {} HTTP server error: {}", self, e);
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -21,6 +21,7 @@ mod migration;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
+pub mod rsyslog;
 pub mod spec;
 mod spec_apply;
 pub mod swap;
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -1,3 +1,5 @@
+use std::collections::HashMap;
+use tracing::info;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::prelude::*;

@@ -42,3 +44,50 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
 pub fn inlinify(s: &str) -> String {
    s.replace('\n', "\u{200B}")
 }
+
+pub fn startup_context_from_env() -> Option<opentelemetry::Context> {
+    // Extract OpenTelemetry context for the startup actions from the
+    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
+    // tracing context.
+    //
+    // This is used to propagate the context for the 'start_compute' operation
+    // from the neon control plane. This allows linking together the wider
+    // 'start_compute' operation that creates the compute container, with the
+    // startup actions here within the container.
+    //
+    // There is no standard for passing context in env variables, but a lot of
+    // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
+    // https://github.com/open-telemetry/opentelemetry-specification/issues/740
+    //
+    // Switch to the startup context here, and exit it once the startup has
+    // completed and Postgres is up and running.
+    //
+    // If this pod is pre-created without binding it to any particular endpoint
+    // yet, this isn't the right place to enter the startup context. In that
+    // case, the control plane should pass the tracing context as part of the
+    // /configure API call.
+    //
+    // NOTE: This is supposed to only cover the *startup* actions. Once
+    // postgres is configured and up-and-running, we exit this span. Any other
+    // actions that are performed on incoming HTTP requests, for example, are
+    // performed in separate spans.
+    //
+    // XXX: If the pod is restarted, we perform the startup actions in the same
+    // context as the original startup actions, which probably doesn't make
+    // sense.
+    let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
+    if let Ok(val) = std::env::var("TRACEPARENT") {
+        startup_tracing_carrier.insert("traceparent".to_string(), val);
+    }
+    if let Ok(val) = std::env::var("TRACESTATE") {
+        startup_tracing_carrier.insert("tracestate".to_string(), val);
+    }
+    if !startup_tracing_carrier.is_empty() {
+        use opentelemetry::propagation::TextMapPropagator;
+        use opentelemetry_sdk::propagation::TraceContextPropagator;
+        info!("got startup tracing context from env variables");
+        Some(TraceContextPropagator::new().extract(&startup_tracing_carrier))
+    } else {
+        None
+    }
+}
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -18,7 +18,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
 // should be handled gracefully.
 fn watch_compute_activity(compute: &ComputeNode) {
    // Suppose that `connstr` doesn't change
-    let connstr = compute.connstr.clone();
+    let connstr = compute.params.connstr.clone();
    let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));

    // During startup and configuration we connect to every Postgres database,
--- a/compute_tools/src/rsyslog.rs
+++ b/compute_tools/src/rsyslog.rs
@@ -0,0 +1,91 @@
+use std::process::Command;
+use std::{fs::OpenOptions, io::Write};
+
+use anyhow::{Context, Result};
+use tracing::info;
+
+fn get_rsyslog_pid() -> Option<String> {
+    let output = Command::new("pgrep")
+        .arg("rsyslogd")
+        .output()
+        .expect("Failed to execute pgrep");
+
+    if !output.stdout.is_empty() {
+        let pid = std::str::from_utf8(&output.stdout)
+            .expect("Invalid UTF-8 in process output")
+            .trim()
+            .to_string();
+        Some(pid)
+    } else {
+        None
+    }
+}
+
+// Start rsyslogd with the specified configuration file
+// If it is already running - restart it.
+// This is necessary, because there is no other way to reload the rsyslog configuration.
+//
+// Rsyslogd shouldn't lose any messages, because of the restart,
+// because it tracks the last read position in the log files
+// and will continue reading from that position.
+// TODO: test it properly
+//
+fn start_rsyslog(rsyslog_conf_path: &str) -> Result<()> {
+    let old_pid = get_rsyslog_pid();
+    if let Some(pid) = old_pid {
+        info!("rsyslogd is already running with pid: {}, restart it", pid);
+        // kill it to restart
+        let _ = Command::new("pkill")
+            .arg("rsyslogd")
+            .output()
+            .context("Failed to stop rsyslogd")?;
+    }
+
+    let _ = Command::new("/usr/sbin/rsyslogd")
+        .arg("-f")
+        .arg(rsyslog_conf_path)
+        .arg("-i")
+        .arg("/var/run/rsyslogd/rsyslogd.pid")
+        .output()
+        .context("Failed to start rsyslogd")?;
+
+    // Check that rsyslogd is running
+    if let Some(pid) = get_rsyslog_pid() {
+        info!("rsyslogd started successfully with pid: {}", pid);
+    } else {
+        return Err(anyhow::anyhow!("Failed to start rsyslogd"));
+    }
+
+    Ok(())
+}
+
+pub fn configure_and_start_rsyslog(
+    log_directory: &str,
+    tag: &str,
+    remote_endpoint: &str,
+) -> Result<()> {
+    let config_content: String = format!(
+        include_str!("config_template/compute_rsyslog_template.conf"),
+        log_directory = log_directory,
+        tag = tag,
+        remote_endpoint = remote_endpoint
+    );
+
+    info!("rsyslog config_content: {}", config_content);
+
+    let rsyslog_conf_path = "/etc/compute_rsyslog.conf";
+    let mut file = OpenOptions::new()
+        .create(true)
+        .write(true)
+        .truncate(true)
+        .open(rsyslog_conf_path)?;
+
+    file.write_all(config_content.as_bytes())?;
+
+    info!("rsyslog configuration added successfully. Starting rsyslogd");
+
+    // start the service, using the configuration
+    start_rsyslog(rsyslog_conf_path)?;
+
+    Ok(())
+}
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -6,7 +6,7 @@ use std::sync::Arc;

 use anyhow::{Context, Result};
 use compute_api::responses::ComputeStatus;
-use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role};
+use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role};
 use futures::future::join_all;
 use tokio::sync::RwLock;
 use tokio_postgres::Client;
@@ -19,10 +19,10 @@ use crate::pg_helpers::{
    get_existing_roles_async,
 };
 use crate::spec_apply::ApplySpecPhase::{
-    CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon,
-    CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
-    HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
-    RunInEachDatabase,
+    CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension,
+    CreatePgauditlogtofileExtension, CreateSchemaNeon, CreateSuperUser, DisablePostgresDBPgAudit,
+    DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, HandleNeonExtension,
+    HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
 };
 use crate::spec_apply::PerDatabasePhase::{
    ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension,
@@ -277,6 +277,19 @@ impl ComputeNode {
                phases.push(FinalizeDropLogicalSubscriptions);
            }

+            // Keep DisablePostgresDBPgAudit phase at the end,
+            // so that all config operations are audit logged.
+            match spec.audit_log_level
+            {
+                ComputeAudit::Hipaa => {
+                    phases.push(CreatePgauditExtension);
+                    phases.push(CreatePgauditlogtofileExtension);
+                    phases.push(DisablePostgresDBPgAudit);
+                }
+                ComputeAudit::Log => { /* not implemented yet */ }
+                ComputeAudit::Disabled => {}
+            }
+
            for phase in phases {
                debug!("Applying phase {:?}", &phase);
                apply_operations(
@@ -463,6 +476,9 @@ pub enum ApplySpecPhase {
    CreateAndAlterDatabases,
    CreateSchemaNeon,
    RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
+    CreatePgauditExtension,
+    CreatePgauditlogtofileExtension,
+    DisablePostgresDBPgAudit,
    HandleOtherExtensions,
    HandleNeonExtension,
    CreateAvailabilityCheck,
@@ -1098,6 +1114,25 @@ async fn get_operations<'a>(
            }
            Ok(Box::new(empty()))
        }
+        ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
+            query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
+            comment: Some(String::from("create pgaudit extensions")),
+        }))),
+        ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
+            query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
+            comment: Some(String::from("create pgauditlogtofile extensions")),
+        }))),
+        // Disable pgaudit logging for postgres database.
+        // Postgres is neon system database used by monitors
+        // and compute_ctl tuning functions and thus generates a lot of noise.
+        // We do not consider data stored in this database as sensitive.
+        ApplySpecPhase::DisablePostgresDBPgAudit => {
+            let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'";
+            Ok(Box::new(once(Operation {
+                query: query.to_string(),
+                comment: Some(query.to_string()),
+            })))
+        }
        ApplySpecPhase::HandleNeonExtension => {
            let operations = vec![
                Operation {
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -40,6 +40,7 @@ use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInf
 use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
+use safekeeper_api::membership::SafekeeperGeneration;
 use safekeeper_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -596,7 +597,15 @@ struct EndpointStartCmdArgs {
    #[clap(long = "pageserver-id")]
    endpoint_pageserver_id: Option<NodeId>,

-    #[clap(long)]
+    #[clap(
+        long,
+        help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
+    )]
+    safekeepers_generation: Option<u32>,
+    #[clap(
+        long,
+        help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
+    )]
    safekeepers: Option<String>,

    #[clap(
@@ -617,9 +626,9 @@ struct EndpointStartCmdArgs {
    )]
    allow_multiple: bool,

-    #[clap(short = 't', long, help = "timeout until we fail the command")]
-    #[arg(default_value = "10s")]
-    start_timeout: humantime::Duration,
+    #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
+    #[arg(default_value = "90s")]
+    start_timeout: Duration,
 }

 #[derive(clap::Args)]
@@ -1350,6 +1359,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
            let pageserver_id = args.endpoint_pageserver_id;
            let remote_ext_config = &args.remote_ext_config;

+            let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new);
            // If --safekeepers argument is given, use only the listed
            // safekeeper nodes; otherwise all from the env.
            let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? {
@@ -1425,11 +1435,13 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
            endpoint
                .start(
                    &auth_token,
+                    safekeepers_generation,
                    safekeepers,
                    pageservers,
                    remote_ext_config.as_ref(),
                    stripe_size.0 as usize,
                    args.create_test_user,
+                    args.start_timeout,
                )
                .await?;
        }
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -42,17 +42,19 @@ use std::path::PathBuf;
 use std::process::Command;
 use std::str::FromStr;
 use std::sync::Arc;
-use std::time::{Duration, SystemTime, UNIX_EPOCH};
+use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};

 use anyhow::{Context, Result, anyhow, bail};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
 use compute_api::spec::{
-    Cluster, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role,
+    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
+    RemoteExtSpec, Role,
 };
 use nix::sys::signal::{Signal, kill};
 use pageserver_api::shard::ShardStripeSize;
 use reqwest::header::CONTENT_TYPE;
+use safekeeper_api::membership::SafekeeperGeneration;
 use serde::{Deserialize, Serialize};
 use tracing::debug;
 use url::Host;
@@ -576,14 +578,17 @@ impl Endpoint {
        Ok(safekeeper_connstrings)
    }

+    #[allow(clippy::too_many_arguments)]
    pub async fn start(
        &self,
        auth_token: &Option<String>,
+        safekeepers_generation: Option<SafekeeperGeneration>,
        safekeepers: Vec<NodeId>,
        pageservers: Vec<(Host, u16)>,
        remote_ext_config: Option<&String>,
        shard_stripe_size: usize,
        create_test_user: bool,
+        start_timeout: Duration,
    ) -> Result<()> {
        if self.status() == EndpointStatus::Running {
            anyhow::bail!("The endpoint is already running");
@@ -655,6 +660,7 @@ impl Endpoint {
            timeline_id: Some(self.timeline_id),
            mode: self.mode,
            pageserver_connstring: Some(pageserver_connstring),
+            safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
            remote_extensions,
@@ -663,6 +669,7 @@ impl Endpoint {
            local_proxy_config: None,
            reconfigure_concurrency: self.reconfigure_concurrency,
            drop_subscriptions_before_start: self.drop_subscriptions_before_start,
+            audit_log_level: ComputeAudit::Disabled,
        };

        // this strange code is needed to support respec() in tests
@@ -770,17 +777,18 @@ impl Endpoint {
        std::fs::write(pidfile_path, pid.to_string())?;

        // Wait for it to start
-        let mut attempt = 0;
        const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
-        const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min
+        let start_at = Instant::now();
        loop {
-            attempt += 1;
            match self.get_status().await {
                Ok(state) => {
                    match state.status {
                        ComputeStatus::Init => {
-                            if attempt == MAX_ATTEMPTS {
-                                bail!("compute startup timed out; still in Init state");
+                            if Instant::now().duration_since(start_at) > start_timeout {
+                                bail!(
+                                    "compute startup timed out {:?}; still in Init state",
+                                    start_timeout
+                                );
                            }
                            // keep retrying
                        }
@@ -807,8 +815,11 @@ impl Endpoint {
                    }
                }
                Err(e) => {
-                    if attempt == MAX_ATTEMPTS {
-                        return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
+                    if Instant::now().duration_since(start_at) > start_timeout {
+                        return Err(e).context(format!(
+                            "timed out {:?} waiting to connect to compute_ctl HTTP",
+                            start_timeout,
+                        ));
                    }
                }
            }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -165,6 +165,8 @@ pub struct NeonStorageControllerConf {

    #[serde(with = "humantime_serde")]
    pub long_reconcile_threshold: Option<Duration>,
+
+    pub load_safekeepers: bool,
 }

 impl NeonStorageControllerConf {
@@ -188,6 +190,7 @@ impl Default for NeonStorageControllerConf {
            max_secondary_lag_bytes: None,
            heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
            long_reconcile_threshold: None,
+            load_safekeepers: true,
        }
    }
 }
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -537,6 +537,10 @@ impl StorageController {
            args.push("--start-as-candidate".to_string());
        }

+        if self.config.load_safekeepers {
+            args.push("--load-safekeepers".to_string());
+        }
+
        if let Some(private_key) = &self.private_key {
            let claims = Claims::new(None, Scope::PageServerApi);
            let jwt_token =
--- a/docker-compose/test_extensions_upgrade.sh
+++ b/docker-compose/test_extensions_upgrade.sh
@@ -6,8 +6,11 @@ generate_id() {
    local -n resvar=$1
    printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
 }
-if [ -z ${OLD_COMPUTE_TAG+x} ] || [ -z ${NEW_COMPUTE_TAG+x} ] || [ -z "${OLD_COMPUTE_TAG}" ] || [ -z "${NEW_COMPUTE_TAG}" ]; then
-  echo OLD_COMPUTE_TAG and NEW_COMPUTE_TAG must be defined
+echo "${OLD_COMPUTE_TAG}"
+echo "${NEW_COMPUTE_TAG}"
+echo "${TEST_EXTENSIONS_TAG}"
+if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then
+  echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set
  exit 1
 fi
 export PG_VERSION=${PG_VERSION:-16}
@@ -82,7 +85,7 @@ EXTENSIONS='[
 {"extname": "pg_repack", "extdir": "pg_repack-src"}
 ]'
 EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
-COMPUTE_TAG=${NEW_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
+COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
 wait_for_ready
 docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
 docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
@@ -90,7 +93,7 @@ create_extensions "${EXTNAMES}"
 query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')"
 new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
 docker compose --profile test-extensions down
-COMPUTE_TAG=${OLD_COMPUTE_TAG} TEST_EXTENSIONS_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
+COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
 wait_for_ready
 docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
 docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -134,8 +134,10 @@ pub struct CatalogObjects {
    pub databases: Vec<Database>,
 }

-#[derive(Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct ComputeCtlConfig {
+    /// Set of JSON web keys that the compute can use to authenticate
+    /// communication from the control plane.
    pub jwks: JwkSet,
 }

--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -101,6 +101,17 @@ pub struct ComputeSpec {
    pub timeline_id: Option<TimelineId>,
    pub pageserver_connstring: Option<String>,

+    /// Safekeeper membership config generation. It is put in
+    /// neon.safekeepers GUC and serves two purposes:
+    /// 1) Non zero value forces walproposer to use membership configurations.
+    /// 2) If walproposer wants to update list of safekeepers to connect to
+    ///    taking them from some safekeeper mconf, it should check what value
+    ///    is newer by comparing the generation.
+    ///
+    /// Note: it could be SafekeeperGeneration, but this needs linking
+    /// compute_ctl with postgres_ffi.
+    #[serde(default)]
+    pub safekeepers_generation: Option<u32>,
    #[serde(default)]
    pub safekeeper_connstrings: Vec<String>,

@@ -144,6 +155,16 @@ pub struct ComputeSpec {
    /// over the same replication content from publisher.
    #[serde(default)] // Default false
    pub drop_subscriptions_before_start: bool,
+
+    /// Log level for audit logging:
+    ///
+    /// Disabled - no audit logging. This is the default.
+    /// log - log masked statements to the postgres log using pgaudit extension
+    /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
+    ///
+    /// Extensions should be present in shared_preload_libraries
+    #[serde(default)]
+    pub audit_log_level: ComputeAudit,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -251,6 +272,17 @@ pub enum ComputeMode {
    Replica,
 }

+/// Log level for audit logging
+/// Disabled, log, hipaa
+/// Default is Disabled
+#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
+pub enum ComputeAudit {
+    #[default]
+    Disabled,
+    Log,
+    Hipaa,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
 pub struct Cluster {
    pub cluster_id: Option<String>,
--- a/libs/http-utils/Cargo.toml
+++ b/libs/http-utils/Cargo.toml
@@ -6,11 +6,8 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
-backtrace.workspace = true
 bytes.workspace = true
-inferno.workspace = true
 fail.workspace = true
-flate2.workspace = true
 hyper0.workspace = true
 itertools.workspace = true
 jemalloc_pprof.workspace = true
--- a/libs/http-utils/src/endpoint.rs
+++ b/libs/http-utils/src/endpoint.rs
@@ -3,8 +3,6 @@ use std::io::Write as _;
 use std::str::FromStr;
 use std::time::Duration;

-use ::pprof::ProfilerGuardBuilder;
-use ::pprof::protos::Message as _;
 use anyhow::{Context, anyhow};
 use bytes::{Bytes, BytesMut};
 use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName};
@@ -12,7 +10,8 @@ use hyper::http::HeaderValue;
 use hyper::{Body, Method, Request, Response};
 use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter};
 use once_cell::sync::Lazy;
-use regex::Regex;
+use pprof::ProfilerGuardBuilder;
+use pprof::protos::Message as _;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
 use tokio::sync::{Mutex, Notify, mpsc};
@@ -22,7 +21,6 @@ use tracing::{Instrument, debug, info, info_span, warn};
 use utils::auth::{AuthError, Claims, SwappableJwtAuth};

 use crate::error::{ApiError, api_error_handler, route_error_handler};
-use crate::pprof;
 use crate::request::{get_query_param, parse_query_param};

 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -449,20 +447,6 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
        Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
    };

-    // Functions and mappings to strip when symbolizing pprof profiles. If true,
-    // also remove child frames.
-    static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
-        vec![
-            (Regex::new("^__rust").unwrap(), false),
-            (Regex::new("^_start$").unwrap(), false),
-            (Regex::new("^irallocx_prof").unwrap(), true),
-            (Regex::new("^prof_alloc_prep").unwrap(), true),
-            (Regex::new("^std::rt::lang_start").unwrap(), false),
-            (Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
-        ]
-    });
-    const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
-
    // Obtain profiler handle.
    let mut prof_ctl = jemalloc_pprof::PROF_CTL
        .as_ref()
@@ -495,45 +479,27 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
        }

        Format::Pprof => {
-            let data = tokio::task::spawn_blocking(move || {
-                let bytes = prof_ctl.dump_pprof()?;
-                // Symbolize the profile.
-                // TODO: consider moving this upstream to jemalloc_pprof and avoiding the
-                // serialization roundtrip.
-                let profile = pprof::decode(&bytes)?;
-                let profile = pprof::symbolize(profile)?;
-                let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
-                pprof::encode(&profile)
-            })
-            .await
-            .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
-            .map_err(ApiError::InternalServerError)?;
+            let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof())
+                .await
+                .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
+                .map_err(ApiError::InternalServerError)?;
            Response::builder()
                .status(200)
                .header(CONTENT_TYPE, "application/octet-stream")
-                .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"")
+                .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb.gz\"")
                .body(Body::from(data))
                .map_err(|err| ApiError::InternalServerError(err.into()))
        }

        Format::Svg => {
-            let body = tokio::task::spawn_blocking(move || {
-                let bytes = prof_ctl.dump_pprof()?;
-                let profile = pprof::decode(&bytes)?;
-                let profile = pprof::symbolize(profile)?;
-                let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
-                let mut opts = inferno::flamegraph::Options::default();
-                opts.title = "Heap inuse".to_string();
-                opts.count_name = "bytes".to_string();
-                pprof::flamegraph(profile, &mut opts)
-            })
-            .await
-            .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
-            .map_err(ApiError::InternalServerError)?;
+            let svg = tokio::task::spawn_blocking(move || prof_ctl.dump_flamegraph())
+                .await
+                .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
+                .map_err(ApiError::InternalServerError)?;
            Response::builder()
                .status(200)
                .header(CONTENT_TYPE, "image/svg+xml")
-                .body(Body::from(body))
+                .body(Body::from(svg))
                .map_err(|err| ApiError::InternalServerError(err.into()))
        }
    }
--- a/libs/http-utils/src/lib.rs
+++ b/libs/http-utils/src/lib.rs
@@ -2,7 +2,6 @@ pub mod endpoint;
 pub mod error;
 pub mod failpoints;
 pub mod json;
-pub mod pprof;
 pub mod request;

 extern crate hyper0 as hyper;
--- a/libs/http-utils/src/pprof.rs
+++ b/libs/http-utils/src/pprof.rs
@@ -1,238 +0,0 @@
-use std::borrow::Cow;
-use std::collections::{HashMap, HashSet};
-use std::ffi::c_void;
-use std::io::Write as _;
-
-use anyhow::bail;
-use flate2::Compression;
-use flate2::write::{GzDecoder, GzEncoder};
-use itertools::Itertools as _;
-use pprof::protos::{Function, Line, Location, Message as _, Profile};
-use regex::Regex;
-
-/// Decodes a gzip-compressed Protobuf-encoded pprof profile.
-pub fn decode(bytes: &[u8]) -> anyhow::Result<Profile> {
-    let mut gz = GzDecoder::new(Vec::new());
-    gz.write_all(bytes)?;
-    Ok(Profile::parse_from_bytes(&gz.finish()?)?)
-}
-
-/// Encodes a pprof profile as gzip-compressed Protobuf.
-pub fn encode(profile: &Profile) -> anyhow::Result<Vec<u8>> {
-    let mut gz = GzEncoder::new(Vec::new(), Compression::default());
-    profile.write_to_writer(&mut gz)?;
-    Ok(gz.finish()?)
-}
-
-/// Symbolizes a pprof profile using the current binary.
-pub fn symbolize(mut profile: Profile) -> anyhow::Result<Profile> {
-    if !profile.function.is_empty() {
-        return Ok(profile); // already symbolized
-    }
-
-    // Collect function names.
-    let mut functions: HashMap<String, Function> = HashMap::new();
-    let mut strings: HashMap<String, i64> = profile
-        .string_table
-        .into_iter()
-        .enumerate()
-        .map(|(i, s)| (s, i as i64))
-        .collect();
-
-    // Helper to look up or register a string.
-    let mut string_id = |s: &str| -> i64 {
-        // Don't use .entry() to avoid unnecessary allocations.
-        if let Some(id) = strings.get(s) {
-            return *id;
-        }
-        let id = strings.len() as i64;
-        strings.insert(s.to_string(), id);
-        id
-    };
-
-    for loc in &mut profile.location {
-        if !loc.line.is_empty() {
-            continue;
-        }
-
-        // Resolve the line and function for each location.
-        backtrace::resolve(loc.address as *mut c_void, |symbol| {
-            let Some(symbol_name) = symbol.name() else {
-                return;
-            };
-
-            let function_name = format!("{symbol_name:#}");
-            let functions_len = functions.len();
-            let function_id = functions
-                .entry(function_name)
-                .or_insert_with_key(|function_name| {
-                    let function_id = functions_len as u64 + 1;
-                    let system_name = String::from_utf8_lossy(symbol_name.as_bytes());
-                    let filename = symbol
-                        .filename()
-                        .map(|path| path.to_string_lossy())
-                        .unwrap_or(Cow::Borrowed(""));
-                    Function {
-                        id: function_id,
-                        name: string_id(function_name),
-                        system_name: string_id(&system_name),
-                        filename: string_id(&filename),
-                        ..Default::default()
-                    }
-                })
-                .id;
-            loc.line.push(Line {
-                function_id,
-                line: symbol.lineno().unwrap_or(0) as i64,
-                ..Default::default()
-            });
-        });
-    }
-
-    // Store the resolved functions, and mark the mapping as resolved.
-    profile.function = functions.into_values().sorted_by_key(|f| f.id).collect();
-    profile.string_table = strings
-        .into_iter()
-        .sorted_by_key(|(_, i)| *i)
-        .map(|(s, _)| s)
-        .collect();
-
-    for mapping in &mut profile.mapping {
-        mapping.has_functions = true;
-        mapping.has_filenames = true;
-    }
-
-    Ok(profile)
-}
-
-/// Strips locations (stack frames) matching the given mappings (substring) or function names
-/// (regex). The function bool specifies whether child frames should be stripped as well.
-///
-/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all
-/// string references.
-pub fn strip_locations(
-    mut profile: Profile,
-    mappings: &[&str],
-    functions: &[(Regex, bool)],
-) -> Profile {
-    // Strip mappings.
-    let mut strip_mappings: HashSet<u64> = HashSet::new();
-
-    profile.mapping.retain(|mapping| {
-        let Some(name) = profile.string_table.get(mapping.filename as usize) else {
-            return true;
-        };
-        if mappings.iter().any(|substr| name.contains(substr)) {
-            strip_mappings.insert(mapping.id);
-            return false;
-        }
-        true
-    });
-
-    // Strip functions.
-    let mut strip_functions: HashMap<u64, bool> = HashMap::new();
-
-    profile.function.retain(|function| {
-        let Some(name) = profile.string_table.get(function.name as usize) else {
-            return true;
-        };
-        for (regex, strip_children) in functions {
-            if regex.is_match(name) {
-                strip_functions.insert(function.id, *strip_children);
-                return false;
-            }
-        }
-        true
-    });
-
-    // Strip locations. The bool specifies whether child frames should be stripped too.
-    let mut strip_locations: HashMap<u64, bool> = HashMap::new();
-
-    profile.location.retain(|location| {
-        for line in &location.line {
-            if let Some(strip_children) = strip_functions.get(&line.function_id) {
-                strip_locations.insert(location.id, *strip_children);
-                return false;
-            }
-        }
-        if strip_mappings.contains(&location.mapping_id) {
-            strip_locations.insert(location.id, false);
-            return false;
-        }
-        true
-    });
-
-    // Strip sample locations.
-    for sample in &mut profile.sample {
-        // First, find the uppermost function with child removal and truncate the stack.
-        if let Some(truncate) = sample
-            .location_id
-            .iter()
-            .rposition(|id| strip_locations.get(id) == Some(&true))
-        {
-            sample.location_id.drain(..=truncate);
-        }
-        // Next, strip any individual frames without child removal.
-        sample
-            .location_id
-            .retain(|id| !strip_locations.contains_key(id));
-    }
-
-    profile
-}
-
-/// Generates an SVG flamegraph from a symbolized pprof profile.
-pub fn flamegraph(
-    profile: Profile,
-    opts: &mut inferno::flamegraph::Options,
-) -> anyhow::Result<Vec<u8>> {
-    if profile.mapping.iter().any(|m| !m.has_functions) {
-        bail!("profile not symbolized");
-    }
-
-    // Index locations, functions, and strings.
-    let locations: HashMap<u64, Location> =
-        profile.location.into_iter().map(|l| (l.id, l)).collect();
-    let functions: HashMap<u64, Function> =
-        profile.function.into_iter().map(|f| (f.id, f)).collect();
-    let strings = profile.string_table;
-
-    // Resolve stacks as function names, and sum sample values per stack. Also reverse the stack,
-    // since inferno expects it bottom-up.
-    let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
-    for sample in profile.sample {
-        let mut stack = Vec::with_capacity(sample.location_id.len());
-        for location in sample.location_id.into_iter().rev() {
-            let Some(location) = locations.get(&location) else {
-                bail!("missing location {location}");
-            };
-            for line in location.line.iter().rev() {
-                let Some(function) = functions.get(&line.function_id) else {
-                    bail!("missing function {}", line.function_id);
-                };
-                let Some(name) = strings.get(function.name as usize) else {
-                    bail!("missing string {}", function.name);
-                };
-                stack.push(name.as_str());
-            }
-        }
-        let Some(&value) = sample.value.first() else {
-            bail!("missing value");
-        };
-        *stacks.entry(stack).or_default() += value;
-    }
-
-    // Construct stack lines for inferno.
-    let lines = stacks
-        .into_iter()
-        .map(|(stack, value)| (stack.into_iter().join(";"), value))
-        .map(|(stack, value)| format!("{stack} {value}"))
-        .sorted()
-        .collect_vec();
-
-    // Construct the flamegraph.
-    let mut bytes = Vec::new();
-    let lines = lines.iter().map(|line| line.as_str());
-    inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
-    Ok(bytes)
-}
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1165,6 +1165,21 @@ pub struct OffloadedTimelineInfo {
    pub archived_at: chrono::DateTime<chrono::Utc>,
 }

+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub enum RelSizeMigration {
+    /// The tenant is using the old rel_size format.
+    /// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
+    /// `None` is the same as `Some(RelSizeMigration::Legacy)`.
+    Legacy,
+    /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
+    /// persisted in the index part. The read path will read both formats and merge them.
+    Migrating,
+    /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
+    /// in the index part, and the read path will not read the old format.
+    Migrated,
+}
+
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
@@ -1243,7 +1258,11 @@ pub struct TimelineInfo {
    // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
    // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
    // read.
+    /// Whether the timeline is archived.
    pub is_archived: Option<bool>,
+
+    /// The status of the rel_size migration.
+    pub rel_size_migration: Option<RelSizeMigration>,
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
--- a/libs/proxy/tokio-postgres2/src/cancel_query.rs
+++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs
@@ -34,8 +34,13 @@ where
        .make_tls_connect(hostname)
        .map_err(|e| Error::tls(e.into()))?;

-    let socket =
-        connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?;
+    let socket = connect_socket::connect_socket(
+        config.host_addr,
+        &config.host,
+        config.port,
+        config.connect_timeout,
+    )
+    .await?;

    cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await
 }
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
 use std::fmt;
+use std::net::IpAddr;
 use std::sync::Arc;
 use std::task::{Context, Poll};
 use std::time::Duration;
@@ -137,6 +138,7 @@ impl InnerClient {

 #[derive(Clone, Serialize, Deserialize)]
 pub struct SocketConfig {
+    pub host_addr: Option<IpAddr>,
    pub host: Host,
    pub port: u16,
    pub connect_timeout: Option<Duration>,
--- a/libs/proxy/tokio-postgres2/src/config.rs
+++ b/libs/proxy/tokio-postgres2/src/config.rs
@@ -1,5 +1,6 @@
 //! Connection configuration.

+use std::net::IpAddr;
 use std::time::Duration;
 use std::{fmt, str};

@@ -65,6 +66,7 @@ pub enum AuthKeys {
 /// Connection configuration.
 #[derive(Clone, PartialEq, Eq)]
 pub struct Config {
+    pub(crate) host_addr: Option<IpAddr>,
    pub(crate) host: Host,
    pub(crate) port: u16,

@@ -83,6 +85,7 @@ impl Config {
    /// Creates a new configuration.
    pub fn new(host: String, port: u16) -> Config {
        Config {
+            host_addr: None,
            host: Host::Tcp(host),
            port,
            password: None,
@@ -163,6 +166,15 @@ impl Config {
        self
    }

+    pub fn set_host_addr(&mut self, addr: IpAddr) -> &mut Config {
+        self.host_addr = Some(addr);
+        self
+    }
+
+    pub fn get_host_addr(&self) -> Option<IpAddr> {
+        self.host_addr
+    }
+
    /// Sets the SSL configuration.
    ///
    /// Defaults to `prefer`.
--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -1,3 +1,5 @@
+use std::net::IpAddr;
+
 use postgres_protocol2::message::backend::Message;
 use tokio::net::TcpStream;
 use tokio::sync::mpsc;
@@ -25,13 +27,14 @@ where
        .make_tls_connect(hostname)
        .map_err(|e| Error::tls(e.into()))?;

-    match connect_once(&config.host, config.port, tls, config).await {
+    match connect_once(config.host_addr, &config.host, config.port, tls, config).await {
        Ok((client, connection)) => Ok((client, connection)),
        Err(e) => Err(e),
    }
 }

 async fn connect_once<T>(
+    host_addr: Option<IpAddr>,
    host: &Host,
    port: u16,
    tls: T,
@@ -40,7 +43,7 @@ async fn connect_once<T>(
 where
    T: TlsConnect<TcpStream>,
 {
-    let socket = connect_socket(host, port, config.connect_timeout).await?;
+    let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
    let RawConnection {
        stream,
        parameters,
@@ -50,6 +53,7 @@ where
    } = connect_raw(socket, tls, config).await?;

    let socket_config = SocketConfig {
+        host_addr,
        host: host.clone(),
        port,
        connect_timeout: config.connect_timeout,
--- a/libs/proxy/tokio-postgres2/src/connect_socket.rs
+++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs
@@ -1,5 +1,6 @@
 use std::future::Future;
 use std::io;
+use std::net::{IpAddr, SocketAddr};
 use std::time::Duration;

 use tokio::net::{self, TcpStream};
@@ -9,15 +10,20 @@ use crate::Error;
 use crate::config::Host;

 pub(crate) async fn connect_socket(
+    host_addr: Option<IpAddr>,
    host: &Host,
    port: u16,
    connect_timeout: Option<Duration>,
 ) -> Result<TcpStream, Error> {
    match host {
        Host::Tcp(host) => {
-            let addrs = net::lookup_host((&**host, port))
-                .await
-                .map_err(Error::connect)?;
+            let addrs = match host_addr {
+                Some(addr) => vec![SocketAddr::new(addr, port)],
+                None => net::lookup_host((&**host, port))
+                    .await
+                    .map_err(Error::connect)?
+                    .collect(),
+            };

            let mut last_err = None;

--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -15,7 +15,6 @@ arc-swap.workspace = true
 sentry.workspace = true
 async-compression.workspace = true
 anyhow.workspace = true
-backtrace.workspace = true
 bincode.workspace = true
 bytes.workspace = true
 camino.workspace = true
--- a/libs/utils/src/sentry_init.rs
+++ b/libs/utils/src/sentry_init.rs
@@ -3,20 +3,24 @@ use std::env;

 use sentry::ClientInitGuard;
 pub use sentry::release_name;
+use tracing::{error, info};

 #[must_use]
 pub fn init_sentry(
    release_name: Option<Cow<'static, str>>,
    extra_options: &[(&str, &str)],
 ) -> Option<ClientInitGuard> {
-    let dsn = env::var("SENTRY_DSN").ok()?;
+    let Ok(dsn) = env::var("SENTRY_DSN") else {
+        info!("not initializing Sentry, no SENTRY_DSN given");
+        return None;
+    };
    let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into());

    let guard = sentry::init((
        dsn,
        sentry::ClientOptions {
-            release: release_name,
-            environment: Some(environment.into()),
+            release: release_name.clone(),
+            environment: Some(environment.clone().into()),
            ..Default::default()
        },
    ));
@@ -25,5 +29,19 @@ pub fn init_sentry(
            scope.set_extra(key, value.into());
        }
    });
+
+    if let Some(dsn) = guard.dsn() {
+        info!(
+            "initialized Sentry for project {}, environment {}, release {} (using API {})",
+            dsn.project_id(),
+            environment,
+            release_name.unwrap_or(Cow::Borrowed("None")),
+            dsn.envelope_api_url(),
+        );
+    } else {
+        // This should panic during sentry::init(), but we may as well cover it.
+        error!("failed to initialize Sentry, invalid DSN");
+    }
+
    Some(guard)
 }
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -7,7 +7,6 @@ use std::time::Instant;

 use criterion::measurement::WallTime;
 use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main};
-use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc};
 use pageserver_api::key::Key;
@@ -72,41 +71,6 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
        .collect()
 }

-// Construct a partitioning for testing get_difficulty map when we
-// don't have an exact result of `collect_keyspace` to work with.
-fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
-    let mut parts = Vec::new();
-
-    // We add a partition boundary at the start of each image layer,
-    // no matter what lsn range it covers. This is just the easiest
-    // thing to do. A better thing to do would be to get a real
-    // partitioning from some database. Even better, remove the need
-    // for key partitions by deciding where to create image layers
-    // directly based on a coverage-based difficulty map.
-    let mut keys: Vec<_> = layer_map
-        .iter_historic_layers()
-        .filter_map(|l| {
-            if l.is_incremental() {
-                None
-            } else {
-                let kr = l.get_key_range();
-                Some(kr.start.next())
-            }
-        })
-        .collect();
-    keys.sort();
-
-    let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
-    for key in keys {
-        parts.push(KeySpace {
-            ranges: vec![current_key..key],
-        });
-        current_key = key;
-    }
-
-    KeyPartitioning { parts }
-}
-
 // Benchmark using metadata extracted from our performance test environment, from
 // a project where we have run pgbench many timmes. The pgbench database was initialized
 // between each test run.
@@ -148,41 +112,6 @@ fn bench_from_real_project(c: &mut Criterion) {
    // Choose uniformly distributed queries
    let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);

-    // Choose inputs for get_difficulty_map
-    let latest_lsn = layer_map
-        .iter_historic_layers()
-        .map(|l| l.get_lsn_range().end)
-        .max()
-        .unwrap();
-    let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
-
-    // Check correctness of get_difficulty_map
-    // TODO put this in a dedicated test outside of this mod
-    {
-        println!("running correctness check");
-
-        let now = Instant::now();
-        let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
-        assert!(result_bruteforce.len() == partitioning.parts.len());
-        println!("Finished bruteforce in {:?}", now.elapsed());
-
-        let now = Instant::now();
-        let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
-        assert!(result_fast.len() == partitioning.parts.len());
-        println!("Finished fast in {:?}", now.elapsed());
-
-        // Assert results are equal. Manually iterate for easier debugging.
-        let zip = std::iter::zip(
-            &partitioning.parts,
-            std::iter::zip(result_bruteforce, result_fast),
-        );
-        for (_part, (bruteforce, fast)) in zip {
-            assert_eq!(bruteforce, fast);
-        }
-
-        println!("No issues found");
-    }
-
    // Define and name the benchmark function
    let mut group = c.benchmark_group("real_map");
    group.bench_function("uniform_queries", |b| {
@@ -192,11 +121,6 @@ fn bench_from_real_project(c: &mut Criterion) {
            }
        });
    });
-    group.bench_function("get_difficulty_map", |b| {
-        b.iter(|| {
-            layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
-        });
-    });
    group.finish();
 }

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -480,6 +480,7 @@ impl Client {
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        concurrency: Option<usize>,
+        recurse: bool,
    ) -> Result<()> {
        let mut path = reqwest::Url::parse(&format!(
            "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers",
@@ -487,6 +488,9 @@ impl Client {
        ))
        .expect("Cannot build URL");

+        path.query_pairs_mut()
+            .append_pair("recurse", &format!("{}", recurse));
+
        if let Some(concurrency) = concurrency {
            path.query_pairs_mut()
                .append_pair("concurrency", &format!("{}", concurrency));
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -33,8 +33,9 @@ use utils::lsn::Lsn;

 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::Version;
-use crate::tenant::Timeline;
 use crate::tenant::storage_layer::IoConcurrency;
+use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::{PageReconstructError, Timeline};

 #[derive(Debug, thiserror::Error)]
 pub enum BasebackupError {
@@ -42,6 +43,26 @@ pub enum BasebackupError {
    Server(#[from] anyhow::Error),
    #[error("basebackup client error {0:#} when {1}")]
    Client(#[source] io::Error, &'static str),
+    #[error("basebackup during shutdown")]
+    Shutdown,
+}
+
+impl From<PageReconstructError> for BasebackupError {
+    fn from(value: PageReconstructError) -> Self {
+        match value {
+            PageReconstructError::Cancelled => BasebackupError::Shutdown,
+            err => BasebackupError::Server(err.into()),
+        }
+    }
+}
+
+impl From<GetVectoredError> for BasebackupError {
+    fn from(value: GetVectoredError) -> Self {
+        match value {
+            GetVectoredError::Cancelled => BasebackupError::Shutdown,
+            err => BasebackupError::Server(err.into()),
+        }
+    }
 }

 /// Create basebackup with non-rel data in it.
@@ -127,7 +148,7 @@ where
            timeline
                .gate
                .enter()
-                .map_err(|e| BasebackupError::Server(e.into()))?,
+                .map_err(|_| BasebackupError::Shutdown)?,
        ),
    };
    basebackup
@@ -323,8 +344,7 @@ where
            let slru_partitions = self
                .timeline
                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?
+                .await?
                .partition(
                    self.timeline.get_shard_identity(),
                    Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
@@ -336,11 +356,10 @@ where
                let blocks = self
                    .timeline
                    .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                    .await?;

                for (key, block) in blocks {
-                    let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
+                    let block = block?;
                    slru_builder.add_block(&key, block).await?;
                }
            }
@@ -349,11 +368,8 @@ where

        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in self
-            .timeline
-            .list_dbdirs(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+        for ((spcnode, dbnode), has_relmap_file) in
+            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
        {
            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;

@@ -362,8 +378,7 @@ where
            let rels = self
                .timeline
                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
                // contents of UNLOGGED relations. Postgres copies it in
@@ -391,8 +406,7 @@ where
        let aux_files = self
            .timeline
            .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone())
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;
        let aux_scan_time = start_time.elapsed();
        let aux_estimated_size = aux_files
            .values()
@@ -451,16 +465,14 @@ where
        for xid in self
            .timeline
            .list_twophase_files(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+            .await?
        {
            self.add_twophase_file(xid).await?;
        }
        let repl_origins = self
            .timeline
            .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone())
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;
        let n_origins = repl_origins.len();
        if n_origins != 0 {
            //
@@ -505,8 +517,7 @@ where
        let nblocks = self
            .timeline
            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;

        // If the relation is empty, create an empty file
        if nblocks == 0 {
@@ -532,8 +543,7 @@ where
                    // TODO: investigate using get_vectored for the entire startblk..endblk range.
                    // But this code path is not on the critical path for most basebackups (?).
                    .get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                    .await?;
                segment_data.extend_from_slice(&img[..]);
            }

@@ -567,8 +577,7 @@ where
            let img = self
                .timeline
                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;

            if img.len()
                != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
@@ -622,8 +631,7 @@ where
                && self
                    .timeline
                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?
+                    .await?
                    .is_empty()
            {
                return Ok(());
@@ -674,8 +682,7 @@ where
        let img = self
            .timeline
            .get_twophase_file(xid, self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;

        let mut buf = BytesMut::new();
        buf.extend_from_slice(&img[..]);
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -842,6 +842,12 @@ paths:
        required: false
        schema:
          type: integer
+      - name: recurse
+        description: When set, will recurse with the downloads into ancestor timelines
+        in: query
+        required: false
+        schema:
+          type: boolean
    post:
      description: |
        Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -481,6 +481,7 @@ async fn build_timeline_info_common(

        state,
        is_archived: Some(is_archived),
+        rel_size_migration: Some(timeline.get_rel_size_v2_status()),

        walreceiver_status,
    };
@@ -1435,6 +1436,7 @@ async fn timeline_download_heatmap_layers_handler(

    let desired_concurrency =
        parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY);
+    let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false);

    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

@@ -1451,9 +1453,7 @@ async fn timeline_download_heatmap_layers_handler(
        .unwrap_or(DEFAULT_MAX_CONCURRENCY);
    let concurrency = std::cmp::min(max_concurrency, desired_concurrency);

-    timeline
-        .start_heatmap_layers_download(concurrency, &ctx)
-        .await?;
+    timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?;

    json_response(StatusCode::ACCEPTED, ())
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -392,10 +392,6 @@ impl TimelineHandles {
            .await
            .map_err(|e| match e {
                timeline::handle::GetError::TenantManager(e) => e,
-                timeline::handle::GetError::TimelineGateClosed => {
-                    trace!("timeline gate closed");
-                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
-                }
                timeline::handle::GetError::PerTimelineStateShutDown => {
                    trace!("per-timeline state shut down");
                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
@@ -422,24 +418,33 @@ pub(crate) struct TenantManagerTypes;
 impl timeline::handle::Types for TenantManagerTypes {
    type TenantManagerError = GetActiveTimelineError;
    type TenantManager = TenantManagerWrapper;
-    type Timeline = Arc<Timeline>;
+    type Timeline = TenantManagerCacheItem;
 }

-impl timeline::handle::ArcTimeline<TenantManagerTypes> for Arc<Timeline> {
-    fn gate(&self) -> &utils::sync::gate::Gate {
-        &self.gate
-    }
+pub(crate) struct TenantManagerCacheItem {
+    pub(crate) timeline: Arc<Timeline>,
+    #[allow(dead_code)] // we store it to keep the gate open
+    pub(crate) gate_guard: GateGuard,
+}

+impl std::ops::Deref for TenantManagerCacheItem {
+    type Target = Arc<Timeline>;
+    fn deref(&self) -> &Self::Target {
+        &self.timeline
+    }
+}
+
+impl timeline::handle::Timeline<TenantManagerTypes> for TenantManagerCacheItem {
    fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId {
-        Timeline::shard_timeline_id(self)
+        Timeline::shard_timeline_id(&self.timeline)
    }

    fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState<TenantManagerTypes> {
-        &self.handles
+        &self.timeline.handles
    }

    fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity {
-        Timeline::get_shard_identity(self)
+        Timeline::get_shard_identity(&self.timeline)
    }
 }

@@ -448,7 +453,7 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
        &self,
        timeline_id: TimelineId,
        shard_selector: ShardSelector,
-    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+    ) -> Result<TenantManagerCacheItem, GetActiveTimelineError> {
        let tenant_id = self.tenant_id.get().expect("we set this in get()");
        let timeout = ACTIVE_TENANT_TIMEOUT;
        let wait_start = Instant::now();
@@ -491,7 +496,20 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
        let timeline = tenant_shard
            .get_timeline(timeline_id, true)
            .map_err(GetActiveTimelineError::Timeline)?;
-        Ok(timeline)
+
+        let gate_guard = match timeline.gate.enter() {
+            Ok(guard) => guard,
+            Err(_) => {
+                return Err(GetActiveTimelineError::Timeline(
+                    GetTimelineError::ShuttingDown,
+                ));
+            }
+        };
+
+        Ok(TenantManagerCacheItem {
+            timeline,
+            gate_guard,
+        })
    }
 }

@@ -2095,6 +2113,7 @@ impl PageServerHandler {
                // TODO: passthrough the error site to the final error message?
                BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
                BasebackupError::Server(e) => QueryError::Other(e),
+                BasebackupError::Shutdown => QueryError::Shutdown,
            }
        }

--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -21,6 +21,7 @@ use pageserver_api::key::{
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
 };
 use pageserver_api::keyspace::SparseKeySpace;
+use pageserver_api::models::RelSizeMigration;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
@@ -492,7 +493,9 @@ impl Timeline {
        // Otherwise, read the old reldir keyspace.
        // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.

-        if self.get_rel_size_v2_enabled() {
+        if let RelSizeMigration::Migrated | RelSizeMigration::Migrating =
+            self.get_rel_size_v2_status()
+        {
            // fetch directory listing (new)
            let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
            let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
@@ -544,7 +547,7 @@ impl Timeline {
                forknum: *forknum,
            }));

-        if !self.get_rel_size_v2_enabled() {
+        if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() {
            return Ok(rels_v1);
        }

@@ -599,28 +602,36 @@ impl Timeline {
        let n_blocks = self
            .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
            .await?;
-        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
-        for blkno in 0..n_blocks {
-            let block = self
-                .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
-                .await?;
-            segment.extend_from_slice(&block[..BLCKSZ as usize]);
-        }
-        Ok(segment.freeze())
-    }

-    /// Look up given SLRU page version.
-    pub(crate) async fn get_slru_page_at_lsn(
-        &self,
-        kind: SlruKind,
-        segno: u32,
-        blknum: BlockNumber,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<Bytes, PageReconstructError> {
-        assert!(self.tenant_shard_id.is_shard_zero());
-        let key = slru_block_to_key(kind, segno, blknum);
-        self.get(key, lsn, ctx).await
+        let keyspace = KeySpace::single(
+            slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks),
+        );
+
+        let batches = keyspace.partition(
+            self.get_shard_identity(),
+            Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+        );
+
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            self.conf,
+            self.gate
+                .enter()
+                .map_err(|_| PageReconstructError::Cancelled)?,
+        );
+
+        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
+        for batch in batches.parts {
+            let blocks = self
+                .get_vectored(batch, lsn, io_concurrency.clone(), ctx)
+                .await?;
+
+            for (_key, block) in blocks {
+                let block = block?;
+                segment.extend_from_slice(&block[..BLCKSZ as usize]);
+            }
+        }
+
+        Ok(segment.freeze())
    }

    /// Get size of an SLRU segment
@@ -829,19 +840,41 @@ impl Timeline {
            let nblocks = self
                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
                .await?;
-            for blknum in (0..nblocks).rev() {
-                let clog_page = self
-                    .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx)
+
+            let keyspace = KeySpace::single(
+                slru_block_to_key(SlruKind::Clog, segno, 0)
+                    ..slru_block_to_key(SlruKind::Clog, segno, nblocks),
+            );
+
+            let batches = keyspace.partition(
+                self.get_shard_identity(),
+                Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+            );
+
+            let io_concurrency = IoConcurrency::spawn_from_conf(
+                self.conf,
+                self.gate
+                    .enter()
+                    .map_err(|_| PageReconstructError::Cancelled)?,
+            );
+
+            for batch in batches.parts.into_iter().rev() {
+                let blocks = self
+                    .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
                    .await?;

-                if clog_page.len() == BLCKSZ as usize + 8 {
-                    let mut timestamp_bytes = [0u8; 8];
-                    timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
-                    let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
+                for (_key, clog_page) in blocks.into_iter().rev() {
+                    let clog_page = clog_page?;

-                    match f(timestamp) {
-                        ControlFlow::Break(b) => return Ok(b),
-                        ControlFlow::Continue(()) => (),
+                    if clog_page.len() == BLCKSZ as usize + 8 {
+                        let mut timestamp_bytes = [0u8; 8];
+                        timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
+                        let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
+
+                        match f(timestamp) {
+                            ControlFlow::Break(b) => return Ok(b),
+                            ControlFlow::Continue(()) => (),
+                        }
                    }
                }
            }
@@ -1052,6 +1085,8 @@ impl Timeline {
    ) -> Result<u64, CalculateLogicalSizeError> {
        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();

+        fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) });
+
        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
        let dbdir = DbDirectory::des(&buf)?;
@@ -1718,6 +1753,35 @@ impl DatadirModification<'_> {
        Ok(())
    }

+    /// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that
+    /// we enable it, we also need to persist it in `index_part.json`.
+    pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result<bool> {
+        let status = self.tline.get_rel_size_v2_status();
+        let config = self.tline.get_rel_size_v2_enabled();
+        match (config, status) {
+            (false, RelSizeMigration::Legacy) => {
+                // tenant config didn't enable it and we didn't write any reldir_v2 key yet
+                Ok(false)
+            }
+            (false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
+                // index_part already persisted that the timeline has enabled rel_size_v2
+                Ok(true)
+            }
+            (true, RelSizeMigration::Legacy) => {
+                // The first time we enable it, we need to persist it in `index_part.json`
+                self.tline
+                    .update_rel_size_v2_status(RelSizeMigration::Migrating)?;
+                tracing::info!("enabled rel_size_v2");
+                Ok(true)
+            }
+            (true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
+                // index_part already persisted that the timeline has enabled rel_size_v2
+                // and we don't need to do anything
+                Ok(true)
+            }
+        }
+    }
+
    /// Store a relmapper file (pg_filenode.map) in the repository
    pub async fn put_relmap_file(
        &mut self,
@@ -1726,6 +1790,8 @@ impl DatadirModification<'_> {
        img: Bytes,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+
        // Add it to the directory (if it doesn't exist already)
        let buf = self.get(DBDIR_KEY, ctx).await?;
        let mut dbdir = DbDirectory::des(&buf)?;
@@ -1746,7 +1812,7 @@ impl DatadirModification<'_> {
            })?;
            self.pending_directory_entries
                .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
-            if self.tline.get_rel_size_v2_enabled() {
+            if v2_enabled {
                self.pending_directory_entries
                    .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
            }
@@ -1903,7 +1969,9 @@ impl DatadirModification<'_> {
            return Err(RelationError::AlreadyExists);
        }

-        if self.tline.get_rel_size_v2_enabled() {
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+
+        if v2_enabled {
            let sparse_rel_dir_key =
                rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
            // check if the rel_dir_key exists in v2
@@ -2029,6 +2097,7 @@ impl DatadirModification<'_> {
        drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;
        for ((spc_node, db_node), rel_tags) in drop_relations {
            let dir_key = rel_dir_to_key(spc_node, db_node);
            let buf = self.get(dir_key, ctx).await?;
@@ -2041,7 +2110,7 @@ impl DatadirModification<'_> {
                        .push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
                    dirty = true;
                    true
-                } else if self.tline.get_rel_size_v2_enabled() {
+                } else if v2_enabled {
                    // The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
                    // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
                    // logic).
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -31,8 +31,8 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use itertools::Itertools as _;
 use once_cell::sync::Lazy;
-use pageserver_api::models;
 pub use pageserver_api::models::TenantState;
+use pageserver_api::models::{self, RelSizeMigration};
 use pageserver_api::models::{
    CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
    WalRedoManagerStatus,
@@ -1123,6 +1123,7 @@ impl Tenant {
            CreateTimelineCause::Load,
            idempotency.clone(),
            index_part.gc_compaction.clone(),
+            index_part.rel_size_migration.clone(),
        )?;
        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
        anyhow::ensure!(
@@ -1153,12 +1154,15 @@ impl Tenant {
            let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn()));
            while let Some((tline, end_lsn)) = tline_ending_at {
                let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await;
-                if !tline.is_previous_heatmap_active() {
+                // Another unearchived timeline might have generated a heatmap for this ancestor.
+                // If the current branch point greater than the previous one use the the heatmap
+                // we just generated - it should include more layers.
+                if !tline.should_keep_previous_heatmap(end_lsn) {
                    tline
                        .previous_heatmap
                        .store(Some(Arc::new(unarchival_heatmap)));
                } else {
-                    tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.")
+                    tracing::info!("Previous heatmap preferred. Dropping unarchival heatmap.")
                }

                match tline.ancestor_timeline() {
@@ -1939,6 +1943,7 @@ impl Tenant {
                hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
                    heatmap: h,
                    read_at: hs.1,
+                    end_lsn: None,
                })
            });
            part_downloads.spawn(
@@ -2497,6 +2502,7 @@ impl Tenant {
        initdb_lsn: Lsn,
        pg_version: u32,
        ctx: &RequestContext,
+        in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
        image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
        end_lsn: Lsn,
@@ -2518,6 +2524,11 @@ impl Tenant {
                .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
                .await?;
        }
+        for in_memory in in_memory_layer_desc {
+            tline
+                .force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx)
+                .await?;
+        }
        let layer_names = tline
            .layers
            .read()
@@ -4118,6 +4129,7 @@ impl Tenant {
        cause: CreateTimelineCause,
        create_idempotency: CreateTimelineIdempotency,
        gc_compaction_state: Option<GcCompactionState>,
+        rel_size_v2_status: Option<RelSizeMigration>,
    ) -> anyhow::Result<Arc<Timeline>> {
        let state = match cause {
            CreateTimelineCause::Load => {
@@ -4150,6 +4162,7 @@ impl Tenant {
            self.attach_wal_lag_cooldown.clone(),
            create_idempotency,
            gc_compaction_state,
+            rel_size_v2_status,
            self.cancel.child_token(),
        );

@@ -5221,6 +5234,7 @@ impl Tenant {
                CreateTimelineCause::Load,
                create_guard.idempotency.clone(),
                None,
+                None,
            )
            .context("Failed to create timeline data structure")?;

@@ -5909,6 +5923,8 @@ mod tests {
    #[cfg(feature = "testing")]
    use timeline::GcInfo;
    #[cfg(feature = "testing")]
+    use timeline::InMemoryLayerTestDesc;
+    #[cfg(feature = "testing")]
    use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
    use timeline::{CompactOptions, DeltaLayerTestDesc};
    use utils::id::TenantId;
@@ -7921,6 +7937,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                Vec::new(), // delta layers
                vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
@@ -8008,6 +8025,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                Vec::new(), // delta layers
                vec![(
                    Lsn(0x20),
@@ -8223,6 +8241,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                // delta layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8303,6 +8322,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                // delta layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8376,6 +8396,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                // delta layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8508,6 +8529,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -8701,6 +8723,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
                    Lsn(0x10)..Lsn(0x40),
                    delta1,
@@ -8757,6 +8780,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                Vec::new(),
                image_layers,
                end_lsn,
@@ -8963,6 +8987,7 @@ mod tests {
                    Lsn(0x08),
                    DEFAULT_PG_VERSION,
                    &ctx,
+                    Vec::new(), // in-memory layers
                    vec![
                        DeltaLayerTestDesc::new_with_inferred_key_range(
                            Lsn(0x08)..Lsn(0x10),
@@ -8981,7 +9006,7 @@ mod tests {
                            delta3,
                        ),
                    ], // delta layers
-                    vec![], // image layers
+                    vec![],     // image layers
                    Lsn(0x50),
                )
                .await?
@@ -8992,6 +9017,7 @@ mod tests {
                    Lsn(0x10),
                    DEFAULT_PG_VERSION,
                    &ctx,
+                    Vec::new(), // in-memory layers
                    vec![
                        DeltaLayerTestDesc::new_with_inferred_key_range(
                            Lsn(0x10)..Lsn(0x48),
@@ -9542,6 +9568,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
@@ -9789,6 +9816,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                Vec::new(), // in-memory layers
                vec![
                    // delta1 and delta 2 only contain a single key but multiple updates
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
@@ -10024,6 +10052,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![],                       // in-memory layers
                vec![],                       // delta layers
                vec![(Lsn(0x18), img_layer)], // image layers
                Lsn(0x18),
@@ -10270,6 +10299,7 @@ mod tests {
                baseline_image_layer_lsn,
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
                    delta_layer_start_lsn..delta_layer_end_lsn,
                    delta_layer_spec,
@@ -10301,6 +10331,158 @@ mod tests {
        Ok(())
    }

+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> {
+        let harness =
+            TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        let will_init_keys = [2, 6];
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let mut expected_key_values = HashMap::new();
+
+        let baseline_image_layer_lsn = Lsn(0x10);
+        let mut baseline_img_layer = Vec::new();
+        for i in 0..5 {
+            let key = get_key(i);
+            let value = format!("value {i}@{baseline_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            baseline_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let nested_image_layer_lsn = Lsn(0x50);
+        let mut nested_img_layer = Vec::new();
+        for i in 5..10 {
+            let key = get_key(i);
+            let value = format!("value {i}@{nested_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            nested_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let frozen_layer = {
+            let lsn_range = Lsn(0x40)..Lsn(0x60);
+            let mut data = Vec::new();
+            for i in 0..10 {
+                let key = get_key(i);
+                let key_in_nested = nested_img_layer
+                    .iter()
+                    .any(|(key_with_img, _)| *key_with_img == key);
+                let lsn = {
+                    if key_in_nested {
+                        Lsn(nested_image_layer_lsn.0 + 5)
+                    } else {
+                        lsn_range.start
+                    }
+                };
+
+                let will_init = will_init_keys.contains(&i);
+                if will_init {
+                    data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
+
+                    expected_key_values.insert(key, "".to_string());
+                } else {
+                    let delta = format!("@{lsn}");
+                    data.push((
+                        key,
+                        lsn,
+                        Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+                    ));
+
+                    expected_key_values
+                        .get_mut(&key)
+                        .expect("An image exists for each key")
+                        .push_str(delta.as_str());
+                }
+            }
+
+            InMemoryLayerTestDesc {
+                lsn_range,
+                is_open: false,
+                data,
+            }
+        };
+
+        let (open_layer, last_record_lsn) = {
+            let start_lsn = Lsn(0x70);
+            let mut data = Vec::new();
+            let mut end_lsn = Lsn(0);
+            for i in 0..10 {
+                let key = get_key(i);
+                let lsn = Lsn(start_lsn.0 + i as u64);
+                let delta = format!("@{lsn}");
+                data.push((
+                    key,
+                    lsn,
+                    Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+                ));
+
+                expected_key_values
+                    .get_mut(&key)
+                    .expect("An image exists for each key")
+                    .push_str(delta.as_str());
+
+                end_lsn = std::cmp::max(end_lsn, lsn);
+            }
+
+            (
+                InMemoryLayerTestDesc {
+                    lsn_range: start_lsn..Lsn::MAX,
+                    is_open: true,
+                    data,
+                },
+                end_lsn,
+            )
+        };
+
+        assert!(
+            nested_image_layer_lsn > frozen_layer.lsn_range.start
+                && nested_image_layer_lsn < frozen_layer.lsn_range.end
+        );
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                baseline_image_layer_lsn,
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![open_layer, frozen_layer], // in-memory layers
+                Vec::new(),                     // delta layers
+                vec![
+                    (baseline_image_layer_lsn, baseline_img_layer),
+                    (nested_image_layer_lsn, nested_img_layer),
+                ], // image layers
+                last_record_lsn,
+            )
+            .await?;
+
+        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let results = tline
+            .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
+            .await
+            .expect("No vectored errors");
+        for (key, res) in results {
+            let value = res.expect("No key errors");
+            let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
+            assert_eq!(value, Bytes::from(expected_value.clone()));
+
+            tracing::info!("key={key} value={expected_value}");
+        }
+
+        Ok(())
+    }
+
    fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
        (
            k1.is_delta,
@@ -10416,6 +10598,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -10800,6 +10983,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![
                    // delta1/2/4 only contain a single key but multiple updates
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
@@ -11051,6 +11235,7 @@ mod tests {
                Lsn(0x10),
                DEFAULT_PG_VERSION,
                &ctx,
+                vec![], // in-memory layers
                vec![
                    // delta1/2/4 only contain a single key but multiple updates
                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -63,6 +63,8 @@ pub struct HistoricLayerCoverage<Value> {
    /// The latest state
    head: LayerCoverageTuple<Value>,

+    /// TODO: this could be an ordered vec using binary search.
+    /// We push into this map everytime we add a layer, so might see some benefit
    /// All previous states
    historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
 }
@@ -419,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage<Value> {
    buffer: BTreeMap<LayerKey, Option<Value>>,

    /// All current layers. This is not used for search. Only to make rebuilds easier.
+    // TODO: This map is never cleared. Rebuilds could use the post-trim last entry of
+    // [`Self::historic_coverage`] instead of doubling memory usage.
+    // [`Self::len`]: can require rebuild and serve from latest historic
+    // [`Self::iter`]: already requires rebuild => can serve from latest historic
    layers: BTreeMap<LayerKey, Value>,
 }

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -194,7 +194,7 @@ pub(crate) use download::{
 };
 use index::GcCompactionState;
 pub(crate) use index::LayerFileMetadata;
-use pageserver_api::models::TimelineArchivalState;
+use pageserver_api::models::{RelSizeMigration, TimelineArchivalState};
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use regex::Regex;
 use remote_storage::{
@@ -900,7 +900,7 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Launch an index-file upload operation in the background, setting `import_pgdata` field.
+    /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
    pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
        self: &Arc<Self>,
        gc_compaction_state: GcCompactionState,
@@ -912,6 +912,21 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field.
+    pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update(
+        self: &Arc<Self>,
+        rel_size_v2_status: RelSizeMigration,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status);
+        // TODO: allow this operation to bypass the validation check because we might upload the index part
+        // with no layers but the flag updated. For now, we just modify the index part in memory and the next
+        // upload will include the flag.
+        // self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
+
    ///
    /// Launch an index-file upload operation in the background, if necessary.
    ///
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -7,6 +7,7 @@ use std::collections::HashMap;

 use chrono::NaiveDateTime;
 use pageserver_api::models::AuxFilePolicy;
+use pageserver_api::models::RelSizeMigration;
 use pageserver_api::shard::ShardIndex;
 use serde::{Deserialize, Serialize};
 use utils::id::TimelineId;
@@ -117,21 +118,6 @@ pub struct GcCompactionState {
    pub(crate) last_completed_lsn: Lsn,
 }

-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
-pub enum RelSizeMigration {
-    /// The tenant is using the old rel_size format.
-    /// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
-    /// `None` is the same as `Some(RelSizeMigration::Legacy)`.
-    Legacy,
-    /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
-    /// persisted in the index part. The read path will read both formats and merge them.
-    Migrating,
-    /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
-    /// in the index part, and the read path will not read the old format.
-    Migrated,
-}
-
 impl IndexPart {
    /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
    /// used to understand later versions.
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -40,6 +40,7 @@ use utils::sync::gate::GateGuard;

 use self::inmemory_layer::InMemoryLayerFileId;
 use super::PageReconstructError;
+use super::layer_map::InMemoryLayerDesc;
 use super::timeline::{GetVectoredError, ReadPath};
 use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
@@ -721,6 +722,12 @@ struct LayerToVisitId {
    lsn_floor: Lsn,
 }

+#[derive(Debug, PartialEq, Eq, Hash)]
+pub enum ReadableLayerWeak {
+    PersistentLayer(Arc<PersistentLayerDesc>),
+    InMemoryLayer(InMemoryLayerDesc),
+}
+
 /// Layer wrapper for the read path. Note that it is valid
 /// to use these layers even after external operations have
 /// been performed on them (compaction, freeze, etc.).
@@ -873,7 +880,7 @@ impl ReadableLayer {
            }
            ReadableLayer::InMemoryLayer(layer) => {
                layer
-                    .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
                    .await
            }
        }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -416,7 +416,7 @@ impl InMemoryLayer {
    pub(crate) async fn get_values_reconstruct_data(
        self: &Arc<InMemoryLayer>,
        keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
@@ -433,8 +433,6 @@ impl InMemoryLayer {
        let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
        let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();

-        let lsn_range = self.start_lsn..end_lsn;
-
        for range in keyspace.ranges.iter() {
            for (key, vec_map) in inner
                .index
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -49,6 +49,7 @@ async fn smoke_test() {
            Lsn(0x10),
            14,
            &ctx,
+            Default::default(), // in-memory layers
            Default::default(),
            image_layers,
            Lsn(0x100),
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -473,21 +473,15 @@ async fn wait_for_active_tenant(
    }

    let mut update_rx = tenant.subscribe_for_state_updates();
-    loop {
-        tokio::select! {
-            _ = cancel.cancelled() => return ControlFlow::Break(()),
-            result = update_rx.changed() => if result.is_err() {
+    tokio::select! {
+        result = update_rx.wait_for(|s| s == &TenantState::Active) => {
+            if result.is_err() {
                return ControlFlow::Break(());
            }
-        }
-
-        match &*update_rx.borrow() {
-            TenantState::Active => {
-                debug!("Tenant state changed to active, continuing the task loop");
-                return ControlFlow::Continue(());
-            }
-            state => debug!("Not running the task loop, tenant is not active: {state:?}"),
-        }
+            debug!("Tenant state changed to active, continuing the task loop");
+            ControlFlow::Continue(())
+        },
+        _ = cancel.cancelled() => ControlFlow::Break(()),
    }
 }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart
 use pageserver_api::models::{
    CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings,
    DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
-    InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState,
+    InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState,
 };
 use pageserver_api::reltag::{BlockNumber, RelTag};
 use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
@@ -436,12 +436,16 @@ pub struct Timeline {
    /// May host a background Tokio task which downloads all the layers from the current
    /// heatmap on demand.
    heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
+
+    pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
 }

 pub(crate) enum PreviousHeatmap {
    Active {
        heatmap: HeatMapTimeline,
        read_at: std::time::Instant,
+        // End LSN covered by the heatmap if known
+        end_lsn: Option<Lsn>,
    },
    Obsolete,
 }
@@ -2366,6 +2370,9 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

+    /// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path
+    /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is
+    /// possible that the index part persists the state while the config doesn't get persisted.
    pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2374,6 +2381,14 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
    }

+    pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration {
+        self.rel_size_v2_status
+            .load()
+            .as_ref()
+            .map(|s| s.as_ref().clone())
+            .unwrap_or(RelSizeMigration::Legacy)
+    }
+
    fn get_compaction_upper_limit(&self) -> usize {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2634,6 +2649,7 @@ impl Timeline {
        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
        create_idempotency: crate::tenant::CreateTimelineIdempotency,
        gc_compaction_state: Option<GcCompactionState>,
+        rel_size_v2_status: Option<RelSizeMigration>,
        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2792,6 +2808,8 @@ impl Timeline {
                previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),

                heatmap_layers_downloader: Mutex::new(None),
+
+                rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
            };

            result.repartition_threshold =
@@ -2868,6 +2886,16 @@ impl Timeline {
            .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state)
    }

+    pub(crate) fn update_rel_size_v2_status(
+        &self,
+        rel_size_v2_status: RelSizeMigration,
+    ) -> anyhow::Result<()> {
+        self.rel_size_v2_status
+            .store(Some(Arc::new(rel_size_v2_status.clone())));
+        self.remote_client
+            .schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status)
+    }
+
    pub(crate) fn get_gc_compaction_state(&self) -> Option<GcCompactionState> {
        self.gc_compaction_state.load_full().as_ref().clone()
    }
@@ -3570,12 +3598,16 @@ impl Timeline {
        Ok(layer)
    }

-    pub(super) fn is_previous_heatmap_active(&self) -> bool {
-        self.previous_heatmap
-            .load()
-            .as_ref()
-            .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. }))
-            .unwrap_or(false)
+    pub(super) fn should_keep_previous_heatmap(&self, new_heatmap_end_lsn: Lsn) -> bool {
+        let crnt = self.previous_heatmap.load();
+        match crnt.as_deref() {
+            Some(PreviousHeatmap::Active { end_lsn, .. }) => match end_lsn {
+                Some(crnt_end_lsn) => *crnt_end_lsn > new_heatmap_end_lsn,
+                None => true,
+            },
+            Some(PreviousHeatmap::Obsolete) => false,
+            None => false,
+        }
    }

    /// The timeline heatmap is a hint to secondary locations from the primary location,
@@ -3603,26 +3635,26 @@ impl Timeline {
        // heatamp.
        let previous_heatmap = self.previous_heatmap.load();
        let visible_non_resident = match previous_heatmap.as_deref() {
-            Some(PreviousHeatmap::Active { heatmap, read_at }) => {
-                Some(heatmap.layers.iter().filter_map(|hl| {
-                    let desc: PersistentLayerDesc = hl.name.clone().into();
-                    let layer = guard.try_get_from_key(&desc.key())?;
+            Some(PreviousHeatmap::Active {
+                heatmap, read_at, ..
+            }) => Some(heatmap.layers.iter().filter_map(|hl| {
+                let desc: PersistentLayerDesc = hl.name.clone().into();
+                let layer = guard.try_get_from_key(&desc.key())?;

-                    if layer.visibility() == LayerVisibilityHint::Covered {
-                        return None;
-                    }
+                if layer.visibility() == LayerVisibilityHint::Covered {
+                    return None;
+                }

-                    if layer.is_likely_resident() {
-                        return None;
-                    }
+                if layer.is_likely_resident() {
+                    return None;
+                }

-                    if layer.last_evicted_at().happened_after(*read_at) {
-                        return None;
-                    }
+                if layer.last_evicted_at().happened_after(*read_at) {
+                    return None;
+                }

-                    Some((desc, hl.metadata.clone(), hl.access_time))
-                }))
-            }
+                Some((desc, hl.metadata.clone(), hl.access_time))
+            })),
            Some(PreviousHeatmap::Obsolete) => None,
            None => None,
        };
@@ -3709,6 +3741,7 @@ impl Timeline {
        PreviousHeatmap::Active {
            heatmap,
            read_at: Instant::now(),
+            end_lsn: Some(end_lsn),
        }
    }

@@ -3907,39 +3940,22 @@ impl Timeline {
                let guard = timeline.layers.read().await;
                let layers = guard.layer_map()?;

-                let in_memory_layer = layers.find_in_memory_layer(|l| {
-                    let start_lsn = l.get_lsn_range().start;
-                    cont_lsn > start_lsn
-                });
+                for range in unmapped_keyspace.ranges.iter() {
+                    let results = layers.range_search(range.clone(), cont_lsn);

-                match in_memory_layer {
-                    Some(l) => {
-                        let lsn_range = l.get_lsn_range().start..cont_lsn;
-                        fringe.update(
-                            ReadableLayer::InMemoryLayer(l),
-                            unmapped_keyspace.clone(),
-                            lsn_range,
-                        );
-                    }
-                    None => {
-                        for range in unmapped_keyspace.ranges.iter() {
-                            let results = layers.range_search(range.clone(), cont_lsn);
-
-                            results
-                                .found
-                                .into_iter()
-                                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
-                                    (
-                                        ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
-                                        keyspace_accum.to_keyspace(),
-                                        lsn_floor..cont_lsn,
-                                    )
-                                })
-                                .for_each(|(layer, keyspace, lsn_range)| {
-                                    fringe.update(layer, keyspace, lsn_range)
-                                });
-                        }
-                    }
+                    results
+                        .found
+                        .into_iter()
+                        .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                            (
+                                guard.upgrade(layer),
+                                keyspace_accum.to_keyspace(),
+                                lsn_floor..cont_lsn,
+                            )
+                        })
+                        .for_each(|(layer, keyspace, lsn_range)| {
+                            fringe.update(layer, keyspace, lsn_range)
+                        });
                }

                // It's safe to drop the layer map lock after planning the next round of reads.
@@ -5548,6 +5564,14 @@ pub struct DeltaLayerTestDesc {
    pub data: Vec<(Key, Lsn, Value)>,
 }

+#[cfg(test)]
+#[derive(Clone)]
+pub struct InMemoryLayerTestDesc {
+    pub lsn_range: Range<Lsn>,
+    pub data: Vec<(Key, Lsn, Value)>,
+    pub is_open: bool,
+}
+
 #[cfg(test)]
 impl DeltaLayerTestDesc {
    pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
@@ -6560,6 +6584,92 @@ impl Timeline {
        Ok(())
    }

+    /// Force create an in-memory layer and place them into the layer map.
+    #[cfg(test)]
+    pub(super) async fn force_create_in_memory_layer(
+        self: &Arc<Timeline>,
+        mut in_memory: InMemoryLayerTestDesc,
+        check_start_lsn: Option<Lsn>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        use utils::bin_ser::BeSer;
+
+        // Validate LSNs
+        if let Some(check_start_lsn) = check_start_lsn {
+            assert!(in_memory.lsn_range.start >= check_start_lsn);
+        }
+
+        let last_record_lsn = self.get_last_record_lsn();
+        let layer_end_lsn = if in_memory.is_open {
+            in_memory
+                .data
+                .iter()
+                .map(|(_key, lsn, _value)| lsn)
+                .max()
+                .cloned()
+        } else {
+            Some(in_memory.lsn_range.end)
+        };
+
+        if let Some(end) = layer_end_lsn {
+            assert!(
+                end <= last_record_lsn,
+                "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
+                end,
+                last_record_lsn,
+            );
+        }
+
+        in_memory.data.iter().for_each(|(_key, lsn, _value)| {
+            assert!(*lsn >= in_memory.lsn_range.start);
+            assert!(*lsn < in_memory.lsn_range.end);
+        });
+
+        // Build the batch
+        in_memory
+            .data
+            .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
+
+        let data = in_memory
+            .data
+            .into_iter()
+            .map(|(key, lsn, value)| {
+                let value_size = value.serialized_size().unwrap() as usize;
+                (key.to_compact(), lsn, value_size, value)
+            })
+            .collect::<Vec<_>>();
+
+        let batch = SerializedValueBatch::from_values(data);
+
+        // Create the in-memory layer and write the batch into it
+        let layer = InMemoryLayer::create(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            in_memory.lsn_range.start,
+            &self.gate,
+            ctx,
+        )
+        .await
+        .unwrap();
+
+        layer.put_batch(batch, ctx).await.unwrap();
+        if !in_memory.is_open {
+            layer.freeze(in_memory.lsn_range.end).await;
+        }
+
+        info!("force created in-memory layer {:?}", in_memory.lsn_range);
+
+        // Link the layer to the layer map
+        {
+            let mut guard = self.layers.write().await;
+            let layer_map = guard.open_mut().unwrap();
+            layer_map.force_insert_in_memory_layer(Arc::new(layer));
+        }
+
+        Ok(())
+    }
+
    /// Return all keys at the LSN in the image layers
    #[cfg(test)]
    pub(crate) async fn inspect_image_layers(
@@ -6992,6 +7102,7 @@ mod tests {
                Lsn(0x10),
                14,
                &ctx,
+                Vec::new(), // in-memory layers
                delta_layers,
                image_layers,
                Lsn(0x100),
@@ -7046,6 +7157,7 @@ mod tests {
            .store(Some(Arc::new(PreviousHeatmap::Active {
                heatmap: heatmap.clone(),
                read_at: std::time::Instant::now(),
+                end_lsn: None,
            })));

        // Generate a new heatmap and assert that it contains the same layers as the old one.
@@ -7124,6 +7236,7 @@ mod tests {
                Lsn(0x10),
                14,
                &ctx,
+                Vec::new(), // in-memory layers
                delta_layers,
                image_layers,
                Lsn(0x100),
@@ -7148,6 +7261,7 @@ mod tests {
            .store(Some(Arc::new(PreviousHeatmap::Active {
                heatmap: heatmap.clone(),
                read_at: std::time::Instant::now(),
+                end_lsn: None,
            })));

        // Evict all the layers in the previous heatmap
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -321,7 +321,7 @@ impl GcCompactionQueue {
                l1_size, l2_size, l2_lsn, gc_cutoff
            );
        } else {
-            info!(
+            debug!(
                "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}",
                l1_size, l2_size, l2_lsn, gc_cutoff
            );
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -306,6 +306,7 @@ impl DeleteTimelineFlow {
                CreateTimelineCause::Delete,
                crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
                None, // doesn't matter what we put here
+                None, // doesn't matter what we put here
            )
            .context("create_timeline_struct")?;

--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -1,5 +1,4 @@
-//! An efficient way to keep the timeline gate open without preventing
-//! timeline shutdown for longer than a single call to a timeline method.
+//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`.
 //!
 //! # Motivation
 //!
@@ -19,27 +18,32 @@
 //! we hold the Timeline gate open while we're invoking the method on the
 //! Timeline object.
 //!
-//! However, we want to avoid the overhead of entering the gate for every
-//! method invocation.
-//!
-//! Further, for shard routing, we want to avoid calling the tenant manager to
-//! resolve the shard for every request. Instead, we want to cache the
-//! routing result so we can bypass the tenant manager for all subsequent requests
-//! that get routed to that shard.
+//! We want to avoid the overhead of doing, for each incoming request,
+//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
+//! - cloning the `Arc<Timeline>` out of the tenant manager so we can
+//!   release the mgr rwlock before doing any request processing work
+//! - re-entering the Timeline gate for each Timeline method invocation.
 //!
 //! Regardless of how we accomplish the above, it should not
 //! prevent the Timeline from shutting down promptly.
 //!
+//!
 //! # Design
 //!
 //! ## Data Structures
 //!
-//! There are three user-facing data structures:
+//! There are two concepts expressed as associated types in the `Types` trait:
+//! - `TenantManager`: the thing that performs the expensive work. It produces
+//!   a `Timeline` object, which is the other associated type.
+//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup.
+//!
+//! There are three user-facing data structures exposed by this module:
 //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
 //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
-//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
+//! - `Handle`: a smart pointer that derefs to the Types::Timeline.
 //! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
-//!   trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
+//!   trying to ugprade back to a `Handle`. If successful, a re-upgraded Handle will always
+//!   point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`.
 //!
 //! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
 //! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
@@ -64,11 +68,14 @@
 //!
 //! To dispatch a request, the page service connection calls `Cache::get`.
 //!
-//! A cache miss means we consult the tenant manager for shard routing,
-//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the the
-//! `Arc<Mutex<HandleInner>>>`. A weak ref is stored in the `Cache`
+//! A cache miss means we call Types::TenantManager::resolve for shard routing,
+//! cloning the `Arc<Timeline>` out of it, and entering the gate. The result of
+//! resolve() is the object we want to cache, and return `Handle`s to for subseqent `Cache::get` calls.
+//!
+//! We wrap the object returned from resolve() in an `Arc` and store that inside the
+//! `Arc<Mutex<HandleInner>>>`. A weak ref to the HandleInner is stored in the `Cache`
 //! and a strong ref in the `PerTimelineState`.
-//! A strong ref is returned wrapped in a `Handle`.
+//! Another strong ref is returned wrapped in a `Handle`.
 //!
 //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
 //! and find the weak ref in the cache.
@@ -78,51 +85,51 @@
 //! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
 //! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
 //! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
-//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
+//! It then drops the `Handle`, and thus the `Arc<Mutex<HandleInner>>` inside it.
 //!
 //! # Performance
 //!
 //! Remember from the introductory section:
 //!
-//! > However, we want to avoid the overhead of entering the gate for every
-//! > method invocation.
+//! > We want to avoid the overhead of doing, for each incoming request,
+//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
+//! > - cloning the `Arc<Timeline>` out of the tenant manager so we can
+//! >   release the mgr rwlock before doing any request processing work
+//! > - re-entering the Timeline gate for each Timeline method invocation.
 //!
-//! Why do we want to avoid that?
-//! Because the gate is a shared location in memory and entering it involves
-//! bumping refcounts, which leads to cache contention if done frequently
-//! from multiple cores in parallel.
+//! All of these boil down to some state that is either globally shared among all shards
+//! or state shared among all tasks that serve a particular timeline.
+//! It is either protected by RwLock or manipulated via atomics.
+//! Even atomics are costly when shared across multiple cores.
+//! So, we want to avoid any permanent need for coordination between page_service tasks.
 //!
-//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
-//! That `Arc` is private to the `HandleInner` and hence to the connection.
+//! The solution is to add indirection: we wrap the Types::Timeline object that is
+//! returned by Types::TenantManager into an Arc that is rivate to the `HandleInner`
+//! and hence to the single Cache / page_service connection.
 //! (Review the "Data Structures" section if that is unclear to you.)
 //!
-//! A `WeakHandle` is a weak ref to the `HandleInner`.
-//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
-//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
-//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection.
 //!
-//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
-//! Again, this is cheap because the `Arc` is private to the connection.
+//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex<HandleInner>`),
+//! lock the mutex, take out a clone of the `Arc<Types::Timeline>`, and drop the Mutex.
+//! The Mutex is not contended because it is private to the connection.
+//! And again, the  `Arc<Types::Timeline>` clone is cheap because that wrapper
+//! Arc's refcounts are private to the connection.
+//!
+//! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection.
 //!
-//! In addition to the GateGuard, we need to provide `Deref<Target=Timeline>` impl.
-//! For this, both `Handle` need infallible access to an `Arc<Timeline>`.
-//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
-//! on the shared memory location that trakcs the refcount of the `Arc<Timeline>`.
-//! Instead, we wrap the `Arc<Timeline>` into another `Arc`.
-//! so that we can clone it cheaply when upgrading a `WeakHandle`.
 //!
 //! # Shutdown
 //!
 //! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
 //!
 //! ```text
-//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
+//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline
 //! ```
 //!
 //! Further, there is this cycle:
 //!
 //! ```text
-//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
+//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline
 //! ```
 //!
 //! The former cycle is a memory leak if not broken.
@@ -135,9 +142,12 @@
 //! - Timeline shutdown (=> `PerTimelineState::shutdown`)
 //! - Connection shutdown (=> dropping the `Cache`).
 //!
-//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
-//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
-//! `Arc<GateGuard>`.
+//! Both transition the `HandleInner` from [`HandleInner::Open`] to
+//! [`HandleInner::ShutDown`], which drops the only long-lived
+//! `Arc<Types::Timeline>`. Once the last short-lived Arc<Types::Timeline>
+//! is dropped, the `Types::Timeline` gets dropped and thereby
+//! the `GateGuard` and the `Arc<Timeline>` that it stores,
+//! thereby breaking both cycles.
 //!
 //! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
 //! thereby breaking the cycle.
@@ -216,7 +226,7 @@ use crate::tenant::mgr::ShardSelector;
 pub(crate) trait Types: Sized + std::fmt::Debug {
    type TenantManagerError: Sized + std::fmt::Debug;
    type TenantManager: TenantManager<Self> + Sized;
-    type Timeline: ArcTimeline<Self> + Sized;
+    type Timeline: Timeline<Self> + Sized;
 }

 /// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
@@ -261,20 +271,15 @@ pub(crate) struct ShardTimelineId {

 /// See module-level comment.
 pub(crate) struct Handle<T: Types> {
-    timeline: Arc<T::Timeline>,
-    #[allow(dead_code)] // the field exists to keep the gate open
-    gate_guard: Arc<utils::sync::gate::GateGuard>,
    inner: Arc<Mutex<HandleInner<T>>>,
+    open: Arc<T::Timeline>,
 }
 pub(crate) struct WeakHandle<T: Types> {
    inner: Weak<Mutex<HandleInner<T>>>,
 }
+
 enum HandleInner<T: Types> {
-    KeepingTimelineGateOpen {
-        #[allow(dead_code)]
-        gate_guard: Arc<utils::sync::gate::GateGuard>,
-        timeline: Arc<T::Timeline>,
-    },
+    Open(Arc<T::Timeline>),
    ShutDown,
 }

@@ -307,8 +312,7 @@ pub(crate) trait TenantManager<T: Types> {
 }

 /// Abstract view of an [`Arc<Timeline>`], for testability.
-pub(crate) trait ArcTimeline<T: Types>: Clone {
-    fn gate(&self) -> &utils::sync::gate::Gate;
+pub(crate) trait Timeline<T: Types> {
    fn shard_timeline_id(&self) -> ShardTimelineId;
    fn get_shard_identity(&self) -> &ShardIdentity;
    fn per_timeline_state(&self) -> &PerTimelineState<T>;
@@ -318,7 +322,6 @@ pub(crate) trait ArcTimeline<T: Types>: Clone {
 #[derive(Debug)]
 pub(crate) enum GetError<T: Types> {
    TenantManager(T::TenantManagerError),
-    TimelineGateClosed,
    PerTimelineStateShutDown,
 }

@@ -434,21 +437,9 @@ impl<T: Types> Cache<T> {
                }

                trace!("creating new HandleInner");
-                let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
-                    gate_guard: Arc::new(
-                        // this enter() is expensive in production code because
-                        // it hits the global Arc<Timeline>::gate refcounts
-                        match timeline.gate().enter() {
-                            Ok(guard) => guard,
-                            Err(_) => {
-                                return Err(GetError::TimelineGateClosed);
-                            }
-                        },
-                    ),
-                    // this clone is expensive in production code because
-                    // it hits the global Arc<Timeline>::clone refcounts
-                    timeline: Arc::new(timeline.clone()),
-                }));
+                let timeline = Arc::new(timeline);
+                let handle_inner_arc =
+                    Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
                let handle_weak = WeakHandle {
                    inner: Arc::downgrade(&handle_inner_arc),
                };
@@ -503,18 +494,10 @@ impl<T: Types> WeakHandle<T> {
        };
        let lock_guard = inner.lock().expect("poisoned");
        match &*lock_guard {
-            HandleInner::KeepingTimelineGateOpen {
-                timeline,
-                gate_guard,
-            } => {
-                let gate_guard = Arc::clone(gate_guard);
-                let timeline = Arc::clone(timeline);
+            HandleInner::Open(open) => {
+                let open = Arc::clone(open);
                drop(lock_guard);
-                Ok(Handle {
-                    timeline,
-                    gate_guard,
-                    inner,
-                })
+                Ok(Handle { open, inner })
            }
            HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
        }
@@ -528,7 +511,7 @@ impl<T: Types> WeakHandle<T> {
 impl<T: Types> std::ops::Deref for Handle<T> {
    type Target = T::Timeline;
    fn deref(&self) -> &Self::Target {
-        &self.timeline
+        &self.open
    }
 }

@@ -545,7 +528,7 @@ impl<T: Types> PerTimelineState<T> {
    /// to the [`Types::Timeline`] that embeds this per-timeline state.
    /// Even if [`TenantManager::resolve`] would still resolve to it.
    ///
-    /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
+    /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`Types::Timeline`] alive.
    /// That's ok because they're short-lived. See module-level comment for details.
    #[instrument(level = "trace", skip_all)]
    pub(super) fn shutdown(&self) {
@@ -611,7 +594,7 @@ impl<T: Types> Drop for Cache<T> {
 impl<T: Types> HandleInner<T> {
    fn shutdown(&mut self) -> Option<Arc<T::Timeline>> {
        match std::mem::replace(self, HandleInner::ShutDown) {
-            HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline),
+            HandleInner::Open(timeline) => Some(timeline),
            HandleInner::ShutDown => {
                // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown
                // may do it concurrently, but locking rules disallow holding per-timeline-state lock and
@@ -631,6 +614,7 @@ mod tests {
    use pageserver_api::reltag::RelTag;
    use pageserver_api::shard::ShardStripeSize;
    use utils::shard::ShardCount;
+    use utils::sync::gate::GateGuard;

    use super::*;

@@ -641,7 +625,7 @@ mod tests {
    impl Types for TestTypes {
        type TenantManagerError = anyhow::Error;
        type TenantManager = StubManager;
-        type Timeline = Arc<StubTimeline>;
+        type Timeline = Entered;
    }

    struct StubManager {
@@ -656,17 +640,19 @@ mod tests {
        myself: Weak<StubTimeline>,
    }

+    struct Entered {
+        timeline: Arc<StubTimeline>,
+        #[allow(dead_code)] // it's stored here to keep the gate open
+        gate_guard: Arc<GateGuard>,
+    }
+
    impl StubTimeline {
        fn getpage(&self) {
            // do nothing
        }
    }

-    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
-        fn gate(&self) -> &utils::sync::gate::Gate {
-            &self.gate
-        }
-
+    impl Timeline<TestTypes> for Entered {
        fn shard_timeline_id(&self) -> ShardTimelineId {
            ShardTimelineId {
                shard_index: self.shard.shard_index(),
@@ -688,20 +674,34 @@ mod tests {
            &self,
            timeline_id: TimelineId,
            shard_selector: ShardSelector,
-        ) -> anyhow::Result<Arc<StubTimeline>> {
+        ) -> anyhow::Result<Entered> {
            for timeline in &self.shards {
                if timeline.id == timeline_id {
+                    let enter_gate = || {
+                        let gate_guard = timeline.gate.enter()?;
+                        let gate_guard = Arc::new(gate_guard);
+                        anyhow::Ok(gate_guard)
+                    };
                    match &shard_selector {
                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                        }
                        ShardSelector::Zero => continue,
                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                        }
                        ShardSelector::Page(_) => continue,
                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                        }
                        ShardSelector::Known(_) => continue,
                    }
@@ -711,6 +711,13 @@ mod tests {
        }
    }

+    impl std::ops::Deref for Entered {
+        type Target = StubTimeline;
+        fn deref(&self) -> &Self::Target {
+            &self.timeline
+        }
+    }
+
    #[tokio::test(start_paused = true)]
    async fn test_timeline_shutdown() {
        crate::tenant::harness::setup_logging();
@@ -1038,7 +1045,6 @@ mod tests {
        let key = DBDIR_KEY;

        // Simulate 10 connections that's opened, used, and closed
-        let mut used_handles = vec![];
        for _ in 0..10 {
            let mut cache = Cache::<TestTypes>::default();
            let handle = {
@@ -1050,7 +1056,6 @@ mod tests {
                handle
            };
            handle.getpage();
-            used_handles.push(Arc::downgrade(&handle.timeline));
        }

        // No handles exist, thus gates are closed and don't require shutdown.
--- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
+++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
@@ -32,6 +32,7 @@ impl HeatmapLayersDownloader {
    fn new(
        timeline: Arc<Timeline>,
        concurrency: usize,
+        recurse: bool,
        ctx: RequestContext,
    ) -> Result<HeatmapLayersDownloader, ApiError> {
        let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?;
@@ -98,6 +99,20 @@ impl HeatmapLayersDownloader {
                    },
                    _ = cancel.cancelled() => {
                        tracing::info!("Heatmap layers download cancelled");
+                        return;
+                    }
+                }
+
+                if recurse {
+                    if let Some(ancestor) = timeline.ancestor_timeline() {
+                        let ctx = ctx.attached_child();
+                        let res =
+                            ancestor.start_heatmap_layers_download(concurrency, recurse, &ctx);
+                        if let Err(err) = res {
+                            tracing::info!(
+                                "Failed to start heatmap layers download for ancestor: {err}"
+                            );
+                        }
                    }
                }
            }
@@ -140,14 +155,20 @@ impl HeatmapLayersDownloader {
 }

 impl Timeline {
-    pub(crate) async fn start_heatmap_layers_download(
+    pub(crate) fn start_heatmap_layers_download(
        self: &Arc<Self>,
        concurrency: usize,
+        recurse: bool,
        ctx: &RequestContext,
    ) -> Result<(), ApiError> {
        let mut locked = self.heatmap_layers_downloader.lock().unwrap();
        if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) {
-            let dl = HeatmapLayersDownloader::new(self.clone(), concurrency, ctx.attached_child())?;
+            let dl = HeatmapLayersDownloader::new(
+                self.clone(),
+                concurrency,
+                recurse,
+                ctx.attached_child(),
+            )?;
            *locked = Some(dl);
            Ok(())
        } else {
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -8,14 +8,14 @@ use tracing::trace;
 use utils::id::TimelineId;
 use utils::lsn::{AtomicLsn, Lsn};

-use super::TimelineWriterState;
+use super::{ReadableLayer, TimelineWriterState};
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::metrics::TimelineMetrics;
 use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
 use crate::tenant::storage_layer::{
    AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
-    PersistentLayerKey, ResidentLayer,
+    PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
 };

 /// Provides semantic APIs to manipulate the layer map.
@@ -37,6 +37,21 @@ impl Default for LayerManager {
 }

 impl LayerManager {
+    pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
+        match weak {
+            ReadableLayerWeak::PersistentLayer(desc) => {
+                ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
+            }
+            ReadableLayerWeak::InMemoryLayer(desc) => {
+                let inmem = self
+                    .layer_map()
+                    .expect("no concurrent shutdown")
+                    .in_memory_layer(&desc);
+                ReadableLayer::InMemoryLayer(inmem)
+            }
+        }
+    }
+
    pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
        // The assumption for the `expect()` is that all code maintains the following invariant:
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
@@ -470,6 +485,25 @@ impl OpenLayerManager {
        mapping.remove(layer);
        layer.delete_on_drop();
    }
+
+    #[cfg(test)]
+    pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc<InMemoryLayer>) {
+        use pageserver_api::models::InMemoryLayerInfo;
+
+        match layer.info() {
+            InMemoryLayerInfo::Open { .. } => {
+                assert!(self.layer_map.open_layer.is_none());
+                self.layer_map.open_layer = Some(layer);
+            }
+            InMemoryLayerInfo::Frozen { lsn_start, .. } => {
+                if let Some(last) = self.layer_map.frozen_layers.back() {
+                    assert!(last.get_lsn_range().end <= lsn_start);
+                }
+
+                self.layer_map.frozen_layers.push_back(layer);
+            }
+        }
+    }
 }

 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1026,6 +1026,19 @@ prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, n
 			if (!neon_prefetch_response_usable(&lsns[i], slot))
 				continue;

+			/*
+			 * Ignore errors
+			 */
+			if (slot->response->tag != T_NeonGetPageResponse)
+			{
+				if (slot->response->tag != T_NeonErrorResponse)
+				{
+					NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
+											T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag);
+				}
+				continue;
+			}
 			memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);
 			prefetch_set_unused(ring_index);
 			BITMAP_SET(mask, i);
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -83,6 +83,7 @@ static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
 static char *FormatEvents(WalProposer *wp, uint32 events);
 static void UpdateDonorShmem(WalProposer *wp);
 static char *MembershipConfigurationToString(MembershipConfiguration *mconf);
+static void MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst);
 static void MembershipConfigurationFree(MembershipConfiguration *mconf);

 WalProposer *
@@ -97,7 +98,32 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->config = config;
 	wp->api = api;

-	for (host = wp->config->safekeepers_list; host != NULL && *host != '\0'; host = sep)
+	wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);
+
+	/*
+	 * If safekeepers list starts with g# parse generation number followed by
+	 * :
+	 */
+	if (strncmp(wp->config->safekeepers_list, "g#", 2) == 0)
+	{
+		char	   *endptr;
+
+		errno = 0;
+		wp->safekeepers_generation = strtoul(wp->config->safekeepers_list + 2, &endptr, 10);
+		if (errno != 0)
+		{
+			wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m");
+		}
+		/* Skip past : to the first hostname. */
+		host = endptr + 1;
+	}
+	else
+	{
+		host = wp->config->safekeepers_list;
+	}
+	wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation);
+
+	for (; host != NULL && *host != '\0'; host = sep)
 	{
 		port = strchr(host, ':');
 		if (port == NULL)
@@ -183,6 +209,12 @@ WalProposerFree(WalProposer *wp)
 	pfree(wp);
 }

+static bool
+WalProposerGenerationsEnabled(WalProposer *wp)
+{
+	return wp->safekeepers_generation != 0;
+}
+
 /*
 * Create new AppendRequest message and start sending it. This function is
 * called from walsender every time the new WAL is available.
@@ -600,10 +632,14 @@ static void
 SendStartWALPush(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;
+
+	/* Forbid implicit timeline creation if generations are enabled. */
+	char	   *allow_timeline_creation = WalProposerGenerationsEnabled(wp) ? "false" : "true";
 #define CMD_LEN 512
 	char		cmd[CMD_LEN];

-	snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version);
+
+	snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d', allow_timeline_creation '%s')", wp->config->proto_version, allow_timeline_creation);
 	if (!wp->api.conn_send_query(sk, cmd))
 	{
 		wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s",
@@ -705,6 +741,18 @@ RecvAcceptorGreeting(Safekeeper *sk)
 		   sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term);
 	pfree(mconf_toml);

+	/*
+	 * Adopt mconf of safekeepers if it is higher. TODO: mconf change should
+	 * restart wp if it started voting.
+	 */
+	if (sk->greetResponse.mconf.generation > wp->mconf.generation)
+	{
+		MembershipConfigurationFree(&wp->mconf);
+		MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
+		/* full conf was just logged above */
+		wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
+	}
+
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_VOTING;

@@ -1896,7 +1944,8 @@ PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf
 						pq_sendint64_le(buf, m->termHistory->entries[i].term);
 						pq_sendint64_le(buf, m->termHistory->entries[i].lsn);
 					}
-					/* 
+
+					/*
 					 * Removed timeline_start_lsn. Still send it as a valid
 					 * value until safekeepers taking it from term history are
 					 * deployed.
@@ -2162,7 +2211,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 		}
 	}
 	wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version);
-	return false; /* keep the compiler quiet */
+	return false;				/* keep the compiler quiet */
 }

 /*
@@ -2570,6 +2619,18 @@ MembershipConfigurationToString(MembershipConfiguration *mconf)
 	return s.data;
 }

+static void
+MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst)
+{
+	dst->generation = src->generation;
+	dst->members.len = src->members.len;
+	dst->members.m = palloc0(sizeof(SafekeeperId) * dst->members.len);
+	memcpy(dst->members.m, src->members.m, sizeof(SafekeeperId) * dst->members.len);
+	dst->new_members.len = src->new_members.len;
+	dst->new_members.m = palloc0(sizeof(SafekeeperId) * dst->new_members.len);
+	memcpy(dst->new_members.m, src->new_members.m, sizeof(SafekeeperId) * dst->new_members.len);
+}
+
 static void
 MembershipConfigurationFree(MembershipConfiguration *mconf)
 {
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -160,7 +160,10 @@ typedef struct MemberSet
 	SafekeeperId *m;			/* ids themselves */
 } MemberSet;

-/* Timeline safekeeper membership configuration. */
+/*
+ * Timeline safekeeper membership configuration as sent in the
+ * protocol.
+ */
 typedef struct MembershipConfiguration
 {
 	Generation	generation;
@@ -761,8 +764,22 @@ typedef struct WalProposer
 	/* (n_safekeepers / 2) + 1 */
 	int			quorum;

+	/*
+	 * Generation of the membership conf of which safekeepers[] are presumably
+	 * members. To make cplane life a bit easier and have more control in
+	 * tests with which sks walproposer gets connected neon.safekeepers GUC
+	 * doesn't provide full mconf, only the list of endpoints to connect to.
+	 * We still would like to know generation associated with it because 1) we
+	 * need some handle to enforce using generations in walproposer, and
+	 * non-zero value of this serves the purpose; 2) currently we don't do
+	 * that, but in theory walproposer can update list of safekeepers to
+	 * connect to upon receiving mconf from safekeepers, and generation number
+	 * must be checked to see which list is newer.
+	 */
+	Generation	safekeepers_generation;
 	/* Number of occupied slots in safekeepers[] */
 	int			n_safekeepers;
+	/* Safekeepers walproposer is connecting to. */
 	Safekeeper	safekeeper[MAX_SAFEKEEPERS];

 	/* WAL has been generated up to this point */
--- a/pgxn/neon_walredo/inmem_smgr.c
+++ b/pgxn/neon_walredo/inmem_smgr.c
@@ -32,8 +32,8 @@

 #include "inmem_smgr.h"

-/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, but we can update up to 3 forks for each block */
-#define MAX_PAGES 100
+/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, so assume that 64 will be enough */
+#define MAX_PAGES 64

 /* If more than WARN_PAGES are used, print a warning in the log */
 #define WARN_PAGES 32
@@ -174,10 +174,7 @@ static void
 inmem_zeroextend(SMgrRelation reln, ForkNumber forknum,
 				 BlockNumber blocknum, int nblocks, bool skipFsync)
 {
-	char buffer[BLCKSZ] = {0};
-
-	for (int i = 0; i < nblocks; i++)
-		inmem_extend(reln, forknum, blocknum + i, buffer, skipFsync);
+	/* Do nothing: inmem_read will return zero page in any case */
 }
 #endif

--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -53,7 +53,7 @@ measured = { workspace = true, features = ["lasso"] }
 metrics.workspace = true
 once_cell.workspace = true
 opentelemetry = { workspace = true, features = ["trace"] }
-papaya = "0.1.8"
+papaya = "0.2.0"
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -35,6 +35,7 @@ impl LocalBackend {
                    endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"),
                    project_id: ProjectIdTag::get_interner().get_or_intern("local"),
                    branch_id: BranchIdTag::get_interner().get_or_intern("local"),
+                    compute_id: "local".into(),
                    cold_start_info: ColdStartInfo::WarmCached,
                },
            },
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,3 +1,4 @@
+use std::fmt::Debug;
 use std::io;
 use std::net::SocketAddr;
 use std::time::Duration;
@@ -10,7 +11,7 @@ use postgres_protocol::message::backend::NoticeResponseBody;
 use pq_proto::StartupMessageParams;
 use rustls::pki_types::InvalidDnsNameError;
 use thiserror::Error;
-use tokio::net::TcpStream;
+use tokio::net::{TcpStream, lookup_host};
 use tracing::{debug, error, info, warn};

 use crate::auth::backend::ComputeUserInfo;
@@ -180,21 +181,19 @@ impl ConnCfg {
        use postgres_client::config::Host;

        // wrap TcpStream::connect with timeout
-        let connect_with_timeout = |host, port| {
-            tokio::time::timeout(timeout, TcpStream::connect((host, port))).map(
-                move |res| match res {
-                    Ok(tcpstream_connect_res) => tcpstream_connect_res,
-                    Err(_) => Err(io::Error::new(
-                        io::ErrorKind::TimedOut,
-                        format!("exceeded connection timeout {timeout:?}"),
-                    )),
-                },
-            )
+        let connect_with_timeout = |addrs| {
+            tokio::time::timeout(timeout, TcpStream::connect(addrs)).map(move |res| match res {
+                Ok(tcpstream_connect_res) => tcpstream_connect_res,
+                Err(_) => Err(io::Error::new(
+                    io::ErrorKind::TimedOut,
+                    format!("exceeded connection timeout {timeout:?}"),
+                )),
+            })
        };

-        let connect_once = |host, port| {
-            debug!("trying to connect to compute node at {host}:{port}");
-            connect_with_timeout(host, port).and_then(|stream| async {
+        let connect_once = |addrs| {
+            debug!("trying to connect to compute node at {addrs:?}");
+            connect_with_timeout(addrs).and_then(|stream| async {
                let socket_addr = stream.peer_addr()?;
                let socket = socket2::SockRef::from(&stream);
                // Disable Nagle's algorithm to not introduce latency between
@@ -216,7 +215,12 @@ impl ConnCfg {
            Host::Tcp(host) => host.as_str(),
        };

-        match connect_once(host, port).await {
+        let addrs = match self.0.get_host_addr() {
+            Some(addr) => vec![SocketAddr::new(addr, port)],
+            None => lookup_host((host, port)).await?.collect(),
+        };
+
+        match connect_once(&*addrs).await {
            Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)),
            Err(err) => {
                warn!("couldn't connect to compute node at {host}:{port}: {err}");
@@ -277,13 +281,15 @@ impl ConnCfg {
        } = connection;

        tracing::Span::current().record("pid", tracing::field::display(process_id));
+        tracing::Span::current().record("compute_id", tracing::field::display(&aux.compute_id));
        let stream = stream.into_inner();

        // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?)
        info!(
            cold_start_info = ctx.cold_start_info().as_str(),
-            "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
-            self.0.get_ssl_mode()
+            "connected to compute node at {host} ({socket_addr}) sslmode={:?}, latency={}",
+            self.0.get_ssl_mode(),
+            ctx.get_proxy_latency(),
        );

        // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -17,7 +17,8 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::error::ErrorKind;
 use crate::intern::{BranchIdInt, ProjectIdInt};
 use crate::metrics::{
-    ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting,
+    ConnectOutcome, InvalidEndpointsGroup, LatencyAccumulated, LatencyTimer, Metrics, Protocol,
+    Waiting,
 };
 use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra};
 use crate::types::{DbName, EndpointId, RoleName};
@@ -346,6 +347,14 @@ impl RequestContext {
        }
    }

+    pub(crate) fn get_proxy_latency(&self) -> LatencyAccumulated {
+        self.0
+            .try_lock()
+            .expect("should not deadlock")
+            .latency_timer
+            .accumulated()
+    }
+
    pub(crate) fn success(&self) {
        self.0
            .try_lock()
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -1,5 +1,7 @@
 //! Production console backend.

+use std::net::IpAddr;
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;

@@ -274,11 +276,27 @@ impl NeonControlPlaneClient {
                Some(x) => x,
            };

+            let host_addr = IpAddr::from_str(host).ok();
+
+            let ssl_mode = match &body.server_name {
+                Some(_) => SslMode::Require,
+                None => SslMode::Disable,
+            };
+            let host_name = match body.server_name {
+                Some(host) => host,
+                None => host.to_owned(),
+            };
+
            // Don't set anything but host and port! This config will be cached.
            // We'll set username and such later using the startup message.
            // TODO: add more type safety (in progress).
-            let mut config = compute::ConnCfg::new(host.to_owned(), port);
-            config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
+            let mut config = compute::ConnCfg::new(host_name, port);
+
+            if let Some(addr) = host_addr {
+                config.set_host_addr(addr);
+            }
+
+            config.ssl_mode(ssl_mode);

            let node = NodeInfo {
                config,
--- a/proxy/src/control_plane/client/mock.rs
+++ b/proxy/src/control_plane/client/mock.rs
@@ -1,5 +1,6 @@
 //! Mock console backend which relies on a user-provided postgres instance.

+use std::net::{IpAddr, Ipv4Addr};
 use std::str::FromStr;
 use std::sync::Arc;

@@ -167,10 +168,22 @@ impl MockControlPlane {
    }

    async fn do_wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
-        let mut config = compute::ConnCfg::new(
-            self.endpoint.host_str().unwrap_or("localhost").to_owned(),
-            self.endpoint.port().unwrap_or(5432),
-        );
+        let port = self.endpoint.port().unwrap_or(5432);
+        let mut config = match self.endpoint.host_str() {
+            None => {
+                let mut config = compute::ConnCfg::new("localhost".to_string(), port);
+                config.set_host_addr(IpAddr::V4(Ipv4Addr::LOCALHOST));
+                config
+            }
+            Some(host) => {
+                let mut config = compute::ConnCfg::new(host.to_string(), port);
+                if let Ok(addr) = IpAddr::from_str(host) {
+                    config.set_host_addr(addr);
+                }
+                config
+            }
+        };
+
        config.ssl_mode(postgres_client::config::SslMode::Disable);

        let node = NodeInfo {
@@ -179,6 +192,7 @@ impl MockControlPlane {
                endpoint_id: (&EndpointId::from("endpoint")).into(),
                project_id: (&ProjectId::from("project")).into(),
                branch_id: (&BranchId::from("branch")).into(),
+                compute_id: "compute".into(),
                cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm,
            },
        };
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -2,6 +2,7 @@ use std::fmt::{self, Display};

 use measured::FixedCardinalityLabel;
 use serde::{Deserialize, Serialize};
+use smol_str::SmolStr;

 use crate::auth::IpPattern;
 use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
@@ -239,6 +240,7 @@ pub(crate) struct GetEndpointAccessControl {
 #[derive(Debug, Deserialize)]
 pub(crate) struct WakeCompute {
    pub(crate) address: Box<str>,
+    pub(crate) server_name: Option<String>,
    pub(crate) aux: MetricsAuxInfo,
 }

@@ -312,6 +314,9 @@ pub(crate) struct MetricsAuxInfo {
    pub(crate) endpoint_id: EndpointIdInt,
    pub(crate) project_id: ProjectIdInt,
    pub(crate) branch_id: BranchIdInt,
+    // note: we don't use interned strings for compute IDs.
+    // they churn too quickly and we have no way to clean up interned strings.
+    pub(crate) compute_id: SmolStr,
    #[serde(default)]
    pub(crate) cold_start_info: ColdStartInfo,
 }
@@ -378,6 +383,7 @@ mod tests {
            "endpoint_id": "endpoint",
            "project_id": "project",
            "branch_id": "branch",
+            "compute_id": "compute",
            "cold_start_info": "unknown",
        })
    }
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -1,9 +1,11 @@
 use std::cell::{Cell, RefCell};
 use std::collections::HashMap;
 use std::hash::BuildHasher;
-use std::{env, io};
+use std::sync::atomic::{AtomicU32, Ordering};
+use std::{array, env, fmt, io};

 use chrono::{DateTime, Utc};
+use indexmap::IndexSet;
 use opentelemetry::trace::TraceContextExt;
 use scopeguard::defer;
 use serde::ser::{SerializeMap, Serializer};
@@ -17,6 +19,7 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields};
 use tracing_subscriber::layer::{Context, Layer};
 use tracing_subscriber::prelude::*;
 use tracing_subscriber::registry::{LookupSpan, SpanRef};
+use try_lock::TryLock;

 /// Initialize logging and OpenTelemetry tracing and exporter.
 ///
@@ -46,13 +49,13 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
    let otlp_layer = tracing_utils::init_tracing("proxy").await;

    let json_log_layer = if logfmt == LogFormat::Json {
-        Some(JsonLoggingLayer {
-            clock: RealClock,
-            skipped_field_indices: papaya::HashMap::default(),
-            writer: StderrWriter {
+        Some(JsonLoggingLayer::new(
+            RealClock,
+            StderrWriter {
                stderr: std::io::stderr(),
            },
-        })
+            ["request_id", "session_id", "conn_id"],
+        ))
    } else {
        None
    };
@@ -191,13 +194,39 @@ thread_local! {
 }

 /// Implements tracing layer to handle events specific to logging.
-struct JsonLoggingLayer<C: Clock, W: MakeWriter> {
+struct JsonLoggingLayer<C: Clock, W: MakeWriter, const F: usize> {
    clock: C,
    skipped_field_indices: papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
+    callsite_ids: papaya::HashMap<callsite::Identifier, CallsiteId>,
    writer: W,
+    // We use a const generic and arrays to bypass one heap allocation.
+    extract_fields: IndexSet<&'static str>,
+    _marker: std::marker::PhantomData<[&'static str; F]>,
 }

-impl<S, C: Clock + 'static, W: MakeWriter + 'static> Layer<S> for JsonLoggingLayer<C, W>
+impl<C: Clock, W: MakeWriter, const F: usize> JsonLoggingLayer<C, W, F> {
+    fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self {
+        JsonLoggingLayer {
+            clock,
+            skipped_field_indices: papaya::HashMap::default(),
+            callsite_ids: papaya::HashMap::default(),
+            writer,
+            extract_fields: IndexSet::from_iter(extract_fields),
+            _marker: std::marker::PhantomData,
+        }
+    }
+
+    #[inline]
+    fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId {
+        *self
+            .callsite_ids
+            .pin()
+            .get_or_insert_with(cs, CallsiteId::next)
+    }
+}
+
+impl<S, C: Clock + 'static, W: MakeWriter + 'static, const F: usize> Layer<S>
+    for JsonLoggingLayer<C, W, F>
 where
    S: Subscriber + for<'a> LookupSpan<'a>,
 {
@@ -211,7 +240,14 @@ where
        let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| {
            if entered.get() {
                let mut formatter = EventFormatter::new();
-                formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
+                formatter.format::<S, F>(
+                    now,
+                    event,
+                    &ctx,
+                    &self.skipped_field_indices,
+                    &self.callsite_ids,
+                    &self.extract_fields,
+                )?;
                self.writer.make_writer().write_all(formatter.buffer())
            } else {
                entered.set(true);
@@ -219,7 +255,14 @@ where

                EVENT_FORMATTER.with_borrow_mut(move |formatter| {
                    formatter.reset();
-                    formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
+                    formatter.format::<S, F>(
+                        now,
+                        event,
+                        &ctx,
+                        &self.skipped_field_indices,
+                        &self.callsite_ids,
+                        &self.extract_fields,
+                    )?;
                    self.writer.make_writer().write_all(formatter.buffer())
                })
            }
@@ -243,13 +286,17 @@ where

    /// Registers a SpanFields instance as span extension.
    fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
+        let csid = self.callsite_id(attrs.metadata().callsite());
        let span = ctx.span(id).expect("span must exist");
        let fields = SpanFields::default();
        fields.record_fields(attrs);
        // This could deadlock when there's a panic somewhere in the tracing
        // event handling and a read or write guard is still held. This includes
        // the OTel subscriber.
-        span.extensions_mut().insert(fields);
+        let mut exts = span.extensions_mut();
+
+        exts.insert(fields);
+        exts.insert(csid);
    }

    fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
@@ -265,6 +312,7 @@ where
    /// wins.
    fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest {
        if !metadata.is_event() {
+            self.callsite_id(metadata.callsite());
            // Must not be never because we wouldn't get trace and span data.
            return Interest::always();
        }
@@ -297,6 +345,26 @@ where
    }
 }

+#[derive(Copy, Clone, Debug, Default)]
+#[repr(transparent)]
+struct CallsiteId(u32);
+
+impl CallsiteId {
+    #[inline]
+    fn next() -> Self {
+        // Start at 1 to reserve 0 for default.
+        static COUNTER: AtomicU32 = AtomicU32::new(1);
+        CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed))
+    }
+}
+
+impl fmt::Display for CallsiteId {
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 /// Stores span field values recorded during the spans lifetime.
 #[derive(Default)]
 struct SpanFields {
@@ -448,12 +516,14 @@ impl EventFormatter {
        self.logline_buffer.clear();
    }

-    fn format<S>(
+    fn format<S, const F: usize>(
        &mut self,
        now: DateTime<Utc>,
        event: &Event<'_>,
        ctx: &Context<'_, S>,
        skipped_field_indices: &papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
+        callsite_ids: &papaya::HashMap<callsite::Identifier, CallsiteId>,
+        extract_fields: &IndexSet<&'static str>,
    ) -> io::Result<()>
    where
        S: Subscriber + for<'a> LookupSpan<'a>,
@@ -485,6 +555,7 @@ impl EventFormatter {
            event.record(&mut message_extractor);
            let mut serializer = message_extractor.into_serializer()?;

+            // Direct message fields.
            let mut fields_present = FieldsPresent(false, skipped_field_indices);
            event.record(&mut fields_present);
            if fields_present.0 {
@@ -494,7 +565,9 @@ impl EventFormatter {
                )?;
            }

+            // TODO: thread-local cache?
            let pid = std::process::id();
+            // Skip adding pid 1 to reduce noise for services running in containers.
            if pid != 1 {
                serializer.serialize_entry("process_id", &pid)?;
            }
@@ -514,6 +587,7 @@ impl EventFormatter {

            serializer.serialize_entry("target", meta.target())?;

+            // Skip adding module if it's the same as target.
            if let Some(module) = meta.module_path() {
                if module != meta.target() {
                    serializer.serialize_entry("module", module)?;
@@ -540,7 +614,16 @@ impl EventFormatter {
                }
            }

-            serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?;
+            let stack = SerializableSpans {
+                ctx,
+                callsite_ids,
+                fields: ExtractedSpanFields::<'_, F>::new(extract_fields),
+            };
+            serializer.serialize_entry("spans", &stack)?;
+
+            if stack.fields.has_values() {
+                serializer.serialize_entry("extract", &stack.fields)?;
+            }

            serializer.end()
        };
@@ -818,15 +901,20 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<
    }
 }

-/// Serializes the span stack from root to leaf (parent of event) enumerated
-/// inside an object where the keys are just the number padded with zeroes
-/// to retain sorting order.
-// The object is necessary because Loki cannot flatten arrays.
-struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>)
+/// Serializes the span stack from root to leaf (parent of event) as object
+/// with the span names as keys. To prevent collision we append a numberic value
+/// to the name. Also, collects any span fields we're interested in. Last one
+/// wins.
+struct SerializableSpans<'a, 'ctx, Span, const F: usize>
 where
-    Span: Subscriber + for<'lookup> LookupSpan<'lookup>;
+    Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
+{
+    ctx: &'a Context<'ctx, Span>,
+    callsite_ids: &'a papaya::HashMap<callsite::Identifier, CallsiteId>,
+    fields: ExtractedSpanFields<'a, F>,
+}

-impl<Span> serde::ser::Serialize for SerializableSpanStack<'_, '_, Span>
+impl<Span, const F: usize> serde::ser::Serialize for SerializableSpans<'_, '_, Span, F>
 where
    Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
 {
@@ -836,9 +924,24 @@ where
    {
        let mut serializer = serializer.serialize_map(None)?;

-        if let Some(leaf_span) = self.0.lookup_current() {
-            for (i, span) in leaf_span.scope().from_root().enumerate() {
-                serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?;
+        if let Some(leaf_span) = self.ctx.lookup_current() {
+            for span in leaf_span.scope().from_root() {
+                // Append a numeric callsite ID to the span name to keep the name unique
+                // in the JSON object.
+                let cid = self
+                    .callsite_ids
+                    .pin()
+                    .get(&span.metadata().callsite())
+                    .copied()
+                    .unwrap_or_default();
+
+                // Loki turns the # into an underscore during field name concatenation.
+                serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?;
+
+                serializer.serialize_value(&SerializableSpanFields {
+                    span: &span,
+                    fields: &self.fields,
+                })?;
            }
        }

@@ -846,28 +949,79 @@ where
    }
 }

-/// Serializes a single span. Include the span ID, name and its fields as
-/// recorded up to this point.
-struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>)
-where
-    Span: for<'lookup> LookupSpan<'lookup>;
-
-impl<Span> serde::ser::Serialize for SerializableSpan<'_, '_, Span>
+/// Serializes the span fields as object.
+struct SerializableSpanFields<'a, 'span, Span, const F: usize>
 where
    Span: for<'lookup> LookupSpan<'lookup>,
 {
-    fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
+    span: &'a SpanRef<'span, Span>,
+    fields: &'a ExtractedSpanFields<'a, F>,
+}
+
+impl<Span, const F: usize> serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F>
+where
+    Span: for<'lookup> LookupSpan<'lookup>,
+{
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
-        Ser: serde::ser::Serializer,
+        S: serde::ser::Serializer,
    {
        let mut serializer = serializer.serialize_map(None)?;
-        // TODO: the span ID is probably only useful for debugging tracing.
-        serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?;
-        serializer.serialize_entry("span_name", self.0.metadata().name())?;

-        let ext = self.0.extensions();
+        let ext = self.span.extensions();
        if let Some(data) = ext.get::<SpanFields>() {
-            for (key, value) in &data.fields.pin() {
+            for (name, value) in &data.fields.pin() {
+                serializer.serialize_entry(name, value)?;
+                // TODO: replace clone with reference, if possible.
+                self.fields.set(name, value.clone());
+            }
+        }
+
+        serializer.end()
+    }
+}
+
+struct ExtractedSpanFields<'a, const F: usize> {
+    names: &'a IndexSet<&'static str>,
+    // TODO: replace TryLock with something local thread and interior mutability.
+    //       serde API doesn't let us use `mut`.
+    values: TryLock<([Option<serde_json::Value>; F], bool)>,
+}
+
+impl<'a, const F: usize> ExtractedSpanFields<'a, F> {
+    fn new(names: &'a IndexSet<&'static str>) -> Self {
+        ExtractedSpanFields {
+            names,
+            values: TryLock::new((array::from_fn(|_| Option::default()), false)),
+        }
+    }
+
+    #[inline]
+    fn set(&self, name: &'static str, value: serde_json::Value) {
+        if let Some((index, _)) = self.names.get_full(name) {
+            let mut fields = self.values.try_lock().expect("thread-local use");
+            fields.0[index] = Some(value);
+            fields.1 = true;
+        }
+    }
+
+    #[inline]
+    fn has_values(&self) -> bool {
+        self.values.try_lock().expect("thread-local use").1
+    }
+}
+
+impl<const F: usize> serde::ser::Serialize for ExtractedSpanFields<'_, F> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::ser::Serializer,
+    {
+        let mut serializer = serializer.serialize_map(None)?;
+
+        let values = self.values.try_lock().expect("thread-local use");
+        for (i, value) in values.0.iter().enumerate() {
+            if let Some(value) = value {
+                let key = self.names[i];
                serializer.serialize_entry(key, value)?;
            }
        }
@@ -879,6 +1033,7 @@ where
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
+    use std::marker::PhantomData;
    use std::sync::{Arc, Mutex, MutexGuard};

    use assert_json_diff::assert_json_eq;
@@ -927,14 +1082,17 @@ mod tests {
        let log_layer = JsonLoggingLayer {
            clock: clock.clone(),
            skipped_field_indices: papaya::HashMap::default(),
+            callsite_ids: papaya::HashMap::default(),
            writer: buffer.clone(),
+            extract_fields: IndexSet::from_iter(["x"]),
+            _marker: PhantomData::<[&'static str; 1]>,
        };

        let registry = tracing_subscriber::Registry::default().with(log_layer);

        tracing::subscriber::with_default(registry, || {
-            info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| {
-                info_span!("span2").in_scope(|| {
+            info_span!("some_span", x = 24).in_scope(|| {
+                info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| {
                    tracing::error!(
                        a = 1,
                        a = 2,
@@ -960,16 +1118,16 @@ mod tests {
                    "a": 3,
                },
                "spans": {
-                    "00":{
-                        "span_id": "0000000000000001",
-                        "span_name": "span1",
-                        "x": 42,
+                    "some_span#1":{
+                        "x": 24,
                    },
-                    "01": {
-                        "span_id": "0000000000000002",
-                        "span_name": "span2",
+                    "some_span#2": {
+                        "x": 42,
                    }
                },
+                "extract": {
+                    "x": 42,
+                },
                "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(),
                "target": "proxy::logging::tests",
                "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(),
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -394,21 +394,31 @@ pub enum RedisMsgKind {
    HDel,
 }

-#[derive(Default)]
-struct Accumulated {
+#[derive(Default, Clone)]
+pub struct LatencyAccumulated {
    cplane: time::Duration,
    client: time::Duration,
    compute: time::Duration,
    retry: time::Duration,
 }

+impl std::fmt::Display for LatencyAccumulated {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "client: {:?}, cplane: {:?}, compute: {:?}, retry: {:?}",
+            self.client, self.cplane, self.compute, self.retry
+        )
+    }
+}
+
 pub struct LatencyTimer {
    // time since the stopwatch was started
    start: time::Instant,
    // time since the stopwatch was stopped
    stop: Option<time::Instant>,
    // accumulated time on the stopwatch
-    accumulated: Accumulated,
+    accumulated: LatencyAccumulated,
    // label data
    protocol: Protocol,
    cold_start_info: ColdStartInfo,
@@ -422,7 +432,7 @@ impl LatencyTimer {
        Self {
            start: time::Instant::now(),
            stop: None,
-            accumulated: Accumulated::default(),
+            accumulated: LatencyAccumulated::default(),
            protocol,
            cold_start_info: ColdStartInfo::Unknown,
            // assume failed unless otherwise specified
@@ -435,7 +445,7 @@ impl LatencyTimer {
        Self {
            start: time::Instant::now(),
            stop: None,
-            accumulated: Accumulated::default(),
+            accumulated: LatencyAccumulated::default(),
            protocol,
            cold_start_info: ColdStartInfo::Unknown,
            // assume failed unless otherwise specified
@@ -465,6 +475,10 @@ impl LatencyTimer {
        // success
        self.outcome = ConnectOutcome::Success;
    }
+
+    pub fn accumulated(&self) -> LatencyAccumulated {
+        self.accumulated.clone()
+    }
 }

 #[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
@@ -511,7 +525,7 @@ impl Drop for LatencyTimer {
            duration.saturating_sub(accumulated_total).as_secs_f64(),
        );

-        // Exclude client cplane, compue communication from the accumulated time.
+        // Exclude client, cplane, compute communication from the accumulated time.
        let accumulated_total =
            self.accumulated.client + self.accumulated.cplane + self.accumulated.compute;
        metric.observe(
@@ -524,7 +538,7 @@ impl Drop for LatencyTimer {
            duration.saturating_sub(accumulated_total).as_secs_f64(),
        );

-        // Exclude client cplane, compue, retry communication from the accumulated time.
+        // Exclude client, cplane, compute, retry communication from the accumulated time.
        let accumulated_total = self.accumulated.client
            + self.accumulated.cplane
            + self.accumulated.compute
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -81,7 +81,10 @@ impl ConnectMechanism for TcpMechanism<'_> {
    type ConnectError = compute::ConnectionError;
    type Error = compute::ConnectionError;

-    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
+    #[tracing::instrument(skip_all, fields(
+        pid = tracing::field::Empty,
+        compute_id = tracing::field::Empty
+    ))]
    async fn connect_once(
        &self,
        ctx: &RequestContext,
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -555,6 +555,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
            endpoint_id: (&EndpointId::from("endpoint")).into(),
            project_id: (&ProjectId::from("project")).into(),
            branch_id: (&BranchId::from("branch")).into(),
+            compute_id: "compute".into(),
            cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm,
        },
    };
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -1,4 +1,5 @@
 use std::io;
+use std::net::{IpAddr, SocketAddr};
 use std::sync::Arc;
 use std::time::Duration;

@@ -6,11 +7,15 @@ use async_trait::async_trait;
 use ed25519_dalek::SigningKey;
 use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer};
 use jose_jwk::jose_b64;
+use postgres_client::config::SslMode;
 use rand::rngs::OsRng;
+use rustls::pki_types::{DnsName, ServerName};
 use tokio::net::{TcpStream, lookup_host};
+use tokio_rustls::TlsConnector;
 use tracing::field::display;
 use tracing::{debug, info};

+use super::AsyncRW;
 use super::conn_pool::poll_client;
 use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool};
 use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client};
@@ -190,7 +195,11 @@ impl PoolingBackend {
    // Wake up the destination if needed. Code here is a bit involved because
    // we reuse the code from the usual proxy and we need to prepare few structures
    // that this code expects.
-    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
+    #[tracing::instrument(skip_all, fields(
+        pid = tracing::field::Empty,
+        compute_id = tracing::field::Empty,
+        conn_id = tracing::field::Empty,
+    ))]
    pub(crate) async fn connect_to_compute(
        &self,
        ctx: &RequestContext,
@@ -229,7 +238,10 @@ impl PoolingBackend {
    }

    // Wake up the destination if needed
-    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
+    #[tracing::instrument(skip_all, fields(
+        compute_id = tracing::field::Empty,
+        conn_id = tracing::field::Empty,
+    ))]
    pub(crate) async fn connect_to_local_proxy(
        &self,
        ctx: &RequestContext,
@@ -276,7 +288,10 @@ impl PoolingBackend {
    /// # Panics
    ///
    /// Panics if called with a non-local_proxy backend.
-    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
+    #[tracing::instrument(skip_all, fields(
+        pid = tracing::field::Empty,
+        conn_id = tracing::field::Empty,
+    ))]
    pub(crate) async fn connect_to_local_postgres(
        &self,
        ctx: &RequestContext,
@@ -552,6 +567,10 @@ impl ConnectMechanism for TokioMechanism {
        let (client, connection) = permit.release_result(res)?;

        tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
+        tracing::Span::current().record(
+            "compute_id",
+            tracing::field::display(&node_info.aux.compute_id),
+        );
        Ok(poll_client(
            self.pool.clone(),
            ctx,
@@ -587,16 +606,28 @@ impl ConnectMechanism for HyperMechanism {
        node_info: &CachedNodeInfo,
        config: &ComputeConfig,
    ) -> Result<Self::Connection, Self::ConnectError> {
+        let host_addr = node_info.config.get_host_addr();
        let host = node_info.config.get_host();
        let permit = self.locks.get_permit(&host).await?;

        let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);

+        let tls = if node_info.config.get_ssl_mode() == SslMode::Disable {
+            None
+        } else {
+            Some(&config.tls)
+        };
+
        let port = node_info.config.get_port();
-        let res = connect_http2(&host, port, config.timeout).await;
+        let res = connect_http2(host_addr, &host, port, config.timeout, tls).await;
        drop(pause);
        let (client, connection) = permit.release_result(res)?;

+        tracing::Span::current().record(
+            "compute_id",
+            tracing::field::display(&node_info.aux.compute_id),
+        );
+
        Ok(poll_http2_client(
            self.pool.clone(),
            ctx,
@@ -612,18 +643,22 @@ impl ConnectMechanism for HyperMechanism {
 }

 async fn connect_http2(
+    host_addr: Option<IpAddr>,
    host: &str,
    port: u16,
    timeout: Duration,
+    tls: Option<&Arc<rustls::ClientConfig>>,
 ) -> Result<(http_conn_pool::Send, http_conn_pool::Connect), LocalProxyConnError> {
-    // assumption: host is an ip address so this should not actually perform any requests.
-    // todo: add that assumption as a guarantee in the control-plane API.
-    let mut addrs = lookup_host((host, port))
-        .await
-        .map_err(LocalProxyConnError::Io)?;
-
+    let addrs = match host_addr {
+        Some(addr) => vec![SocketAddr::new(addr, port)],
+        None => lookup_host((host, port))
+            .await
+            .map_err(LocalProxyConnError::Io)?
+            .collect(),
+    };
    let mut last_err = None;

+    let mut addrs = addrs.into_iter();
    let stream = loop {
        let Some(addr) = addrs.next() else {
            return Err(last_err.unwrap_or_else(|| {
@@ -651,6 +686,20 @@ async fn connect_http2(
        }
    };

+    let stream = if let Some(tls) = tls {
+        let host = DnsName::try_from(host)
+            .map_err(io::Error::other)
+            .map_err(LocalProxyConnError::Io)?
+            .to_owned();
+        let stream = TlsConnector::from(tls.clone())
+            .connect(ServerName::DnsName(host), stream)
+            .await
+            .map_err(LocalProxyConnError::Io)?;
+        Box::pin(stream) as AsyncRW
+    } else {
+        Box::pin(stream) as AsyncRW
+    };
+
    let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new())
        .timer(TokioTimer::new())
        .keep_alive_interval(Duration::from_secs(20))
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -221,6 +221,7 @@ mod tests {
                endpoint_id: (&EndpointId::from("endpoint")).into(),
                project_id: (&ProjectId::from("project")).into(),
                branch_id: (&BranchId::from("branch")).into(),
+                compute_id: "compute".into(),
                cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm,
            },
            conn_id: uuid::Uuid::new_v4(),
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -6,9 +6,9 @@ use hyper::client::conn::http2;
 use hyper_util::rt::{TokioExecutor, TokioIo};
 use parking_lot::RwLock;
 use smol_str::ToSmolStr;
-use tokio::net::TcpStream;
 use tracing::{Instrument, debug, error, info, info_span};

+use super::AsyncRW;
 use super::backend::HttpConnError;
 use super::conn_pool_lib::{
    ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, ConnPoolEntry,
@@ -22,8 +22,7 @@ use crate::types::EndpointCacheKey;
 use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS};

 pub(crate) type Send = http2::SendRequest<hyper::body::Incoming>;
-pub(crate) type Connect =
-    http2::Connection<TokioIo<TcpStream>, hyper::body::Incoming, TokioExecutor>;
+pub(crate) type Connect = http2::Connection<TokioIo<AsyncRW>, hyper::body::Incoming, TokioExecutor>;

 #[derive(Clone)]
 pub(crate) struct ClientDataHttp();
--- a/proxy/src/tls/client_config.rs
+++ b/proxy/src/tls/client_config.rs
@@ -1,17 +1,49 @@
+use std::env;
+use std::io::Cursor;
+use std::path::PathBuf;
 use std::sync::Arc;

-use anyhow::bail;
+use anyhow::{Context, bail};
 use rustls::crypto::ring;

-pub(crate) fn load_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
+/// We use an internal certificate authority when establishing a TLS connection with compute.
+fn load_internal_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> {
+    let Some(ca_file) = env::var_os("NEON_INTERNAL_CA_FILE") else {
+        return Ok(());
+    };
+    let ca_file = PathBuf::from(ca_file);
+
+    let ca = std::fs::read(&ca_file)
+        .with_context(|| format!("could not read CA from {}", ca_file.display()))?;
+
+    for cert in rustls_pemfile::certs(&mut Cursor::new(&*ca)) {
+        store
+            .add(cert.context("could not parse internal CA certificate")?)
+            .context("could not parse internal CA certificate")?;
+    }
+
+    Ok(())
+}
+
+/// For console redirect proxy, we need to establish a connection to compute via pg-sni-router.
+/// pg-sni-router needs TLS and uses a Let's Encrypt signed certificate, so we
+/// load certificates from our native store.
+fn load_native_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> {
    let der_certs = rustls_native_certs::load_native_certs();

    if !der_certs.errors.is_empty() {
        bail!("could not parse certificates: {:?}", der_certs.errors);
    }

-    let mut store = rustls::RootCertStore::empty();
    store.add_parsable_certificates(der_certs.certs);
+
+    Ok(())
+}
+
+fn load_compute_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
+    let mut store = rustls::RootCertStore::empty();
+    load_native_certs(&mut store)?;
+    load_internal_certs(&mut store)?;
    Ok(Arc::new(store))
 }

@@ -22,7 +54,7 @@ pub fn compute_client_config_with_root_certs() -> anyhow::Result<rustls::ClientC
        rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider()))
            .with_safe_default_protocol_versions()
            .expect("ring should support the default protocol versions")
-            .with_root_certificates(load_certs()?)
+            .with_root_certificates(load_compute_certs()?)
            .with_no_client_auth(),
    )
 }
--- a/safekeeper/src/send_interpreted_wal.rs
+++ b/safekeeper/src/send_interpreted_wal.rs
@@ -184,6 +184,16 @@ impl InterpretedWalReaderState {
                        to: *current_position,
                    }
                } else {
+                    // Edge case: The new shard is at the same current position as
+                    // the reader. Note that the current position is WAL record aligned,
+                    // so the reader might have done some partial reads and updated the
+                    // batch start. If that's the case, adjust the batch start to match
+                    // starting position of the new shard. It can lead to some shards
+                    // seeing overlaps, but in that case the actual record LSNs are checked
+                    // which should be fine based on the filtering logic.
+                    if let Some(start) = current_batch_wal_start {
+                        *start = std::cmp::min(*start, new_shard_start_pos);
+                    }
                    CurrentPositionUpdate::NotReset(*current_position)
                }
            }
@@ -287,7 +297,13 @@ impl InterpretedWalReader {
                reader
                    .run_impl(start_pos)
                    .await
-                    .inspect_err(|err| critical!("failed to read WAL record: {err:?}"))
+                    .inspect_err(|err| match err {
+                        // TODO: we may want to differentiate these errors further.
+                        InterpretedWalReaderError::Decode(_) => {
+                            critical!("failed to decode WAL record: {err:?}");
+                        }
+                        err => error!("failed to read WAL record: {err}"),
+                    })
            }
            .instrument(info_span!("interpreted wal reader")),
        );
@@ -347,10 +363,12 @@ impl InterpretedWalReader {
            metric.dec();
        }

-        if let Err(err) = self.run_impl(start_pos).await {
-            critical!("failed to read WAL record: {err:?}");
-        } else {
-            info!("interpreted wal reader exiting");
+        match self.run_impl(start_pos).await {
+            Err(err @ InterpretedWalReaderError::Decode(_)) => {
+                critical!("failed to decode WAL record: {err:?}");
+            }
+            Err(err) => error!("failed to read WAL record: {err}"),
+            Ok(()) => info!("interpreted wal reader exiting"),
        }

        Err(CopyStreamHandlerEnd::Other(anyhow!(
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -415,6 +415,9 @@ impl From<TimelineError> for ApiError {
    }
 }

+/// We run remote deletion in a background task, this is how it sends its results back.
+type RemoteDeletionReceiver = tokio::sync::watch::Receiver<Option<anyhow::Result<()>>>;
+
 /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline.
 /// It also holds SharedState and provides mutually exclusive access to it.
 pub struct Timeline {
@@ -446,6 +449,8 @@ pub struct Timeline {
    manager_ctl: ManagerCtl,
    conf: Arc<SafeKeeperConf>,

+    remote_deletion: std::sync::Mutex<Option<RemoteDeletionReceiver>>,
+
    /// Hold this gate from code that depends on the Timeline's non-shut-down state.  While holding
    /// this gate, you must respect [`Timeline::cancel`]
    pub(crate) gate: Gate,
@@ -494,6 +499,7 @@ impl Timeline {
            walreceivers,
            gate: Default::default(),
            cancel: CancellationToken::default(),
+            remote_deletion: std::sync::Mutex::new(None),
            manager_ctl: ManagerCtl::new(),
            conf,
            broker_active: AtomicBool::new(false),
@@ -598,15 +604,95 @@ impl Timeline {
        shared_state.sk.close_wal_store();

        if !only_local && self.conf.is_wal_backup_enabled() {
-            // Note: we concurrently delete remote storage data from multiple
-            // safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
-            // do some retries anyway.
-            wal_backup::delete_timeline(&self.ttid).await?;
+            self.remote_delete().await?;
        }
        let dir_existed = delete_dir(&self.timeline_dir).await?;
        Ok(dir_existed)
    }

+    /// Delete timeline content from remote storage.  If the returned future is dropped,
+    /// deletion will continue in the background.
+    ///
+    /// This function ordinarily spawns a task and stashes a result receiver into [`Self::remote_deletion`].  If
+    /// deletion is already happening, it may simply wait for an existing task's result.
+    ///
+    /// Note: we concurrently delete remote storage data from multiple
+    /// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
+    /// do some retries anyway.
+    async fn remote_delete(&self) -> Result<()> {
+        // We will start a background task to do the deletion, so that it proceeds even if our
+        // API request is dropped.  Future requests will see the existing deletion task and wait
+        // for it to complete.
+        let mut result_rx = {
+            let mut remote_deletion_state = self.remote_deletion.lock().unwrap();
+            let result_rx = if let Some(result_rx) = remote_deletion_state.as_ref() {
+                if let Some(result) = result_rx.borrow().as_ref() {
+                    if let Err(e) = result {
+                        // A previous remote deletion failed: we will start a new one
+                        tracing::error!("remote deletion failed, will retry ({e})");
+                        None
+                    } else {
+                        // A previous remote deletion call already succeeded
+                        return Ok(());
+                    }
+                } else {
+                    // Remote deletion is still in flight
+                    Some(result_rx.clone())
+                }
+            } else {
+                // Remote deletion was not attempted yet, start it now.
+                None
+            };
+
+            match result_rx {
+                Some(result_rx) => result_rx,
+                None => self.start_remote_delete(&mut remote_deletion_state),
+            }
+        };
+
+        // Wait for a result
+        let Ok(result) = result_rx.wait_for(|v| v.is_some()).await else {
+            // Unexpected: sender should always send a result before dropping the channel, even if it has an error
+            return Err(anyhow::anyhow!(
+                "remote deletion task future was dropped without sending a result"
+            ));
+        };
+
+        result
+            .as_ref()
+            .expect("We did a wait_for on this being Some above")
+            .as_ref()
+            .map(|_| ())
+            .map_err(|e| anyhow::anyhow!("remote deletion failed: {e}"))
+    }
+
+    /// Spawn background task to do remote deletion, return a receiver for its outcome
+    fn start_remote_delete(
+        &self,
+        guard: &mut std::sync::MutexGuard<Option<RemoteDeletionReceiver>>,
+    ) -> RemoteDeletionReceiver {
+        tracing::info!("starting remote deletion");
+        let (result_tx, result_rx) = tokio::sync::watch::channel(None);
+        let ttid = self.ttid;
+        tokio::task::spawn(
+            async move {
+                let r = wal_backup::delete_timeline(&ttid).await;
+                if let Err(e) = &r {
+                    // Log error here in case nobody ever listens for our result (e.g. dropped API request)
+                    tracing::error!("remote deletion failed: {e}");
+                }
+
+                // Ignore send results: it's legal for the Timeline to give up waiting for us.
+                let _ = result_tx.send(Some(r));
+            }
+            .instrument(info_span!("remote_delete", timeline = %self.ttid)),
+        );
+
+        **guard = Some(result_rx.clone());
+
+        result_rx
+    }
+
    /// Returns if timeline is cancelled.
    pub fn is_cancelled(&self) -> bool {
        self.cancel.is_cancelled()
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -21,9 +21,9 @@ use tokio::sync::{OnceCell, watch};
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::backoff;
 use utils::id::{NodeId, TenantTimelineId};
 use utils::lsn::Lsn;
+use utils::{backoff, pausable_failpoint};

 use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
 use crate::timeline::WalResidentTimeline;
@@ -564,6 +564,12 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
    // We don't currently have http requests timeout cancellation, but if/once
    // we have listing should get streaming interface to make progress.

+    pausable_failpoint!("sk-delete-timeline-remote-pause");
+
+    fail::fail_point!("sk-delete-timeline-remote", |_| {
+        Err(anyhow::anyhow!("failpoint: sk-delete-timeline-remote"))
+    });
+
    let cancel = CancellationToken::new(); // not really used
    backoff::retry(
        || async {
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -21,6 +21,7 @@ clap.workspace = true
 cron.workspace = true
 fail.workspace = true
 futures.workspace = true
+governor.workspace = true
 hex.workspace = true
 hyper0.workspace = true
 humantime.workspace = true
--- a/Show More
+++ b/Show More